apple/xnu.git: bsd/kern/uipc_socket.c (blob 4d6e12a7036bae05037a16fc5c55092140bcb6a6)
1 /*
2 * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet/ip6.h>
109 #include <netinet6/ip6_var.h>
110 #include <netinet/flow_divert.h>
111 #include <kern/zalloc.h>
112 #include <kern/locks.h>
113 #include <machine/limits.h>
114 #include <libkern/OSAtomic.h>
115 #include <pexpert/pexpert.h>
116 #include <kern/assert.h>
117 #include <kern/task.h>
118 #include <kern/policy_internal.h>
119
120 #include <sys/kpi_mbuf.h>
121 #include <sys/mcache.h>
122 #include <sys/unpcb.h>
123 #include <libkern/section_keywords.h>
124
125 #if CONFIG_MACF
126 #include <security/mac_framework.h>
127 #endif /* MAC */
128
129 #if MULTIPATH
130 #include <netinet/mp_pcb.h>
131 #include <netinet/mptcp_var.h>
132 #endif /* MULTIPATH */
133
134 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
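/*
 * Editorial note (not in the original source): ROUNDUP() rounds "a" up to the
 * next multiple of "b", which must be a power of two for the mask form to be
 * correct. Worked cases:
 *
 *	ROUNDUP(13, 8) == (13 + 7) & ~7 == 16
 *	ROUNDUP(16, 8) == (16 + 7) & ~7 == 16
 */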
135
136 #if DEBUG || DEVELOPMENT
137 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
138 #else
139 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
140 #endif
141
142 /* TODO: this should be in a header file somewhere */
143 extern char *proc_name_address(void *p);
144
145 static u_int32_t so_cache_hw; /* High water mark for socache */
146 static u_int32_t so_cache_timeouts; /* number of timeouts */
147 static u_int32_t so_cache_max_freed; /* max freed per timeout */
148 static u_int32_t cached_sock_count = 0;
149 STAILQ_HEAD(, socket) so_cache_head;
150 int max_cached_sock_count = MAX_CACHED_SOCKETS;
151 static u_int32_t so_cache_time;
152 static int socketinit_done;
153 static struct zone *so_cache_zone;
154
155 static lck_grp_t *so_cache_mtx_grp;
156 static lck_attr_t *so_cache_mtx_attr;
157 static lck_grp_attr_t *so_cache_mtx_grp_attr;
158 static lck_mtx_t *so_cache_mtx;
159
160 #include <machine/limits.h>
161
162 static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
163 static void filt_sordetach(struct knote *kn);
164 static int filt_soread(struct knote *kn, long hint);
165 static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
166 static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
167
168 static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
169 static void filt_sowdetach(struct knote *kn);
170 static int filt_sowrite(struct knote *kn, long hint);
171 static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
172 static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
173
174 static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
175 static void filt_sockdetach(struct knote *kn);
176 static int filt_sockev(struct knote *kn, long hint);
177 static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
178 static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
179
180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
184 .f_isfd = 1,
185 .f_attach = filt_sorattach,
186 .f_detach = filt_sordetach,
187 .f_event = filt_soread,
188 .f_touch = filt_sortouch,
189 .f_process = filt_sorprocess,
190 };
191
192 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
193 .f_isfd = 1,
194 .f_attach = filt_sowattach,
195 .f_detach = filt_sowdetach,
196 .f_event = filt_sowrite,
197 .f_touch = filt_sowtouch,
198 .f_process = filt_sowprocess,
199 };
200
201 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
202 .f_isfd = 1,
203 .f_attach = filt_sockattach,
204 .f_detach = filt_sockdetach,
205 .f_event = filt_sockev,
206 .f_touch = filt_socktouch,
207 .f_process = filt_sockprocess,
208 };
209
210 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
211 .f_isfd = 1,
212 .f_attach = filt_sorattach,
213 .f_detach = filt_sordetach,
214 .f_event = filt_soread,
215 .f_touch = filt_sortouch,
216 .f_process = filt_sorprocess,
217 };
218
219 SYSCTL_DECL(_kern_ipc);
220
221 #define EVEN_MORE_LOCKING_DEBUG 0
222
223 int socket_debug = 0;
224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
226
227 static unsigned long sodefunct_calls = 0;
228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229 &sodefunct_calls, "");
230
231 static int socket_zone = M_SOCKET;
232 so_gen_t so_gencnt; /* generation count for sockets */
233
234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
 274 * to 1, because sending packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
 289 * clusters larger than 2 KB might lead to system panics or data corruption.
 290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 291 * on the outgoing interface.
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
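/*
 * Illustrative sketch (editor addition, not part of xnu): the jumbo- and
 * big-cluster knobs above are exposed to userland as kern.ipc.* sysctls,
 * with names taken from the SYSCTL_INT declarations. A privileged process
 * could inspect and flip one of them roughly as follows.
 */
#if 0	/* userland example, not kernel code */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val = 0;
	size_t len = sizeof(val);

	/* Read the current value of kern.ipc.sosendjcl. */
	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0) {
		printf("sosendjcl = %d\n", val);
	}

	/* Writing works the same way (requires root). */
	val = 1;
	(void) sysctlbyname("kern.ipc.sosendjcl", NULL, NULL, &val, sizeof(val));
	return 0;
}
#endif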
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
 321 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, int);
341 static void cached_sock_free(struct socket *);
342
343 /*
 344 * Maximum number of extended background idle sockets per process.
 345 * Set to zero to disable further setting of the option.
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371
372 /*
373 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374 * setting the DSCP code on the packet based on the service class; see
375 * <rdar://problem/11277343> for details.
376 */
377 __private_extern__ u_int32_t sotcdb = 0;
378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &sotcdb, 0, "");
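/*
 * Illustrative sketch (editor addition, not part of xnu): the sotcdb debug
 * flags govern how the stack maps a socket's service class to a DSCP value.
 * A userland client normally just declares its service class; on Darwin this
 * is commonly done with the SO_NET_SERVICE_TYPE socket option (assumed to be
 * available in <sys/socket.h> on this platform).
 */
#if 0	/* userland example, not kernel code */
#include <sys/socket.h>

static int
example_mark_interactive_video(int fd)
{
	int st = NET_SERVICE_TYPE_VI;	/* interactive video service class */

	return setsockopt(fd, SOL_SOCKET, SO_NET_SERVICE_TYPE, &st, sizeof(st));
}
#endif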
380
381 void
382 socketinit(void)
383 {
384 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
385 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
386
387 #ifdef __LP64__
388 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
394 #else
395 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
401 #endif
402
403 if (socketinit_done) {
404 printf("socketinit: already called...\n");
405 return;
406 }
407 socketinit_done = 1;
408
409 PE_parse_boot_argn("socket_debug", &socket_debug,
410 sizeof(socket_debug));
411
412 /*
413 * allocate lock group attribute and group for socket cache mutex
414 */
415 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
416 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
417 so_cache_mtx_grp_attr);
418
419 /*
420 * allocate the lock attribute for socket cache mutex
421 */
422 so_cache_mtx_attr = lck_attr_alloc_init();
423
424 /* cached sockets mutex */
425 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
426 if (so_cache_mtx == NULL) {
427 panic("%s: unable to allocate so_cache_mtx\n", __func__);
428 /* NOTREACHED */
429 }
430 STAILQ_INIT(&so_cache_head);
431
432 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
433 + get_inpcb_str_size() + 4 + get_tcp_str_size());
434
435 so_cache_zone = zinit(so_cache_zone_element_size,
436 (120000 * so_cache_zone_element_size), 8192, "socache zone");
437 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
438 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
439
440 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
441 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
442 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
443 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
444
445 in_pcbinit();
446 sflt_init();
447 socket_tclass_init();
448 #if MULTIPATH
449 mp_pcbinit();
450 #endif /* MULTIPATH */
451 }
452
453 static void
454 cached_sock_alloc(struct socket **so, int waitok)
455 {
456 caddr_t temp;
457 uintptr_t offset;
458
459 lck_mtx_lock(so_cache_mtx);
460
461 if (!STAILQ_EMPTY(&so_cache_head)) {
462 VERIFY(cached_sock_count > 0);
463
464 *so = STAILQ_FIRST(&so_cache_head);
465 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
466 STAILQ_NEXT((*so), so_cache_ent) = NULL;
467
468 cached_sock_count--;
469 lck_mtx_unlock(so_cache_mtx);
470
471 temp = (*so)->so_saved_pcb;
472 bzero((caddr_t)*so, sizeof(struct socket));
473
474 (*so)->so_saved_pcb = temp;
475 } else {
476 lck_mtx_unlock(so_cache_mtx);
477
478 if (waitok) {
479 *so = (struct socket *)zalloc(so_cache_zone);
480 } else {
481 *so = (struct socket *)zalloc_noblock(so_cache_zone);
482 }
483
484 if (*so == NULL) {
485 return;
486 }
487
488 bzero((caddr_t)*so, sizeof(struct socket));
489
490 /*
491 * Define offsets for extra structures into our
492 * single block of memory. Align extra structures
493 * on longword boundaries.
494 */
495
496 offset = (uintptr_t)*so;
497 offset += sizeof(struct socket);
498
499 offset = ALIGN(offset);
500
501 (*so)->so_saved_pcb = (caddr_t)offset;
502 offset += get_inpcb_str_size();
503
504 offset = ALIGN(offset);
505
506 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
507 (caddr_t)offset;
508 }
509
510 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
511 }
512
513 static void
514 cached_sock_free(struct socket *so)
515 {
516 lck_mtx_lock(so_cache_mtx);
517
518 so_cache_time = net_uptime();
519 if (++cached_sock_count > max_cached_sock_count) {
520 --cached_sock_count;
521 lck_mtx_unlock(so_cache_mtx);
522 zfree(so_cache_zone, so);
523 } else {
524 if (so_cache_hw < cached_sock_count) {
525 so_cache_hw = cached_sock_count;
526 }
527
528 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
529
530 so->cache_timestamp = so_cache_time;
531 lck_mtx_unlock(so_cache_mtx);
532 }
533 }
534
535 void
536 so_update_last_owner_locked(struct socket *so, proc_t self)
537 {
538 if (so->last_pid != 0) {
539 /*
540 * last_pid and last_upid should remain zero for sockets
 541 * created using sock_socket. The check above achieves that.
542 */
543 if (self == PROC_NULL) {
544 self = current_proc();
545 }
546
547 if (so->last_upid != proc_uniqueid(self) ||
548 so->last_pid != proc_pid(self)) {
549 so->last_upid = proc_uniqueid(self);
550 so->last_pid = proc_pid(self);
551 proc_getexecutableuuid(self, so->last_uuid,
552 sizeof(so->last_uuid));
553 }
554 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
555 }
556 }
557
558 void
559 so_update_policy(struct socket *so)
560 {
561 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
562 (void) inp_update_policy(sotoinpcb(so));
563 }
564 }
565
566 #if NECP
567 static void
568 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
569 struct sockaddr *override_remote_addr)
570 {
571 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
572 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
573 override_remote_addr, 0);
574 }
575 }
576 #endif /* NECP */
577
578 boolean_t
579 so_cache_timer(void)
580 {
581 struct socket *p;
582 int n_freed = 0;
583 boolean_t rc = FALSE;
584
585 lck_mtx_lock(so_cache_mtx);
586 so_cache_timeouts++;
587 so_cache_time = net_uptime();
588
589 while (!STAILQ_EMPTY(&so_cache_head)) {
590 VERIFY(cached_sock_count > 0);
591 p = STAILQ_FIRST(&so_cache_head);
592 if ((so_cache_time - p->cache_timestamp) <
593 SO_CACHE_TIME_LIMIT) {
594 break;
595 }
596
597 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
598 --cached_sock_count;
599
600 zfree(so_cache_zone, p);
601
602 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
603 so_cache_max_freed++;
604 break;
605 }
606 }
607
608 /* Schedule again if there is more to cleanup */
609 if (!STAILQ_EMPTY(&so_cache_head)) {
610 rc = TRUE;
611 }
612
613 lck_mtx_unlock(so_cache_mtx);
614 return rc;
615 }
616
617 /*
618 * Get a socket structure from our zone, and initialize it.
619 * We don't implement `waitok' yet (see comments in uipc_domain.c).
620 * Note that it would probably be better to allocate socket
621 * and PCB at the same time, but I'm not convinced that all
622 * the protocols can be easily modified to do this.
623 */
624 struct socket *
625 soalloc(int waitok, int dom, int type)
626 {
627 struct socket *so;
628
629 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
630 cached_sock_alloc(&so, waitok);
631 } else {
632 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
633 M_WAITOK);
634 if (so != NULL) {
635 bzero(so, sizeof(*so));
636 }
637 }
638 if (so != NULL) {
639 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
640 so->so_zone = socket_zone;
641
642 /*
643 * Increment the socket allocation statistics
644 */
645 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
646
647 #if CONFIG_MACF_SOCKET
648 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
649 if (mac_socket_label_init(so, !waitok) != 0) {
650 sodealloc(so);
651 return NULL;
652 }
653 #endif /* MAC_SOCKET */
654 }
655
656 return so;
657 }
658
659 int
660 socreate_internal(int dom, struct socket **aso, int type, int proto,
661 struct proc *p, uint32_t flags, struct proc *ep)
662 {
663 struct protosw *prp;
664 struct socket *so;
665 int error = 0;
666
667 #if TCPDEBUG
668 extern int tcpconsdebug;
669 #endif
670
671 VERIFY(aso != NULL);
672 *aso = NULL;
673
674 if (proto != 0) {
675 prp = pffindproto(dom, proto, type);
676 } else {
677 prp = pffindtype(dom, type);
678 }
679
680 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
681 if (pffinddomain(dom) == NULL) {
682 return EAFNOSUPPORT;
683 }
684 if (proto != 0) {
685 if (pffindprotonotype(dom, proto) != NULL) {
686 return EPROTOTYPE;
687 }
688 }
689 return EPROTONOSUPPORT;
690 }
691 if (prp->pr_type != type) {
692 return EPROTOTYPE;
693 }
694 so = soalloc(1, dom, type);
695 if (so == NULL) {
696 return ENOBUFS;
697 }
698
699 switch (dom) {
700 case PF_LOCAL:
701 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
702 break;
703 case PF_INET:
704 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
705 if (type == SOCK_STREAM) {
706 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
707 } else {
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
709 }
710 break;
711 case PF_ROUTE:
712 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
713 break;
714 case PF_NDRV:
715 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
716 break;
717 case PF_KEY:
718 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
719 break;
720 case PF_INET6:
721 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
722 if (type == SOCK_STREAM) {
723 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
724 } else {
725 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
726 }
727 break;
728 case PF_SYSTEM:
729 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
730 break;
731 case PF_MULTIPATH:
732 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
733 break;
734 default:
735 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
736 break;
737 }
738
739 if (flags & SOCF_ASYNC) {
740 so->so_state |= SS_NBIO;
741 }
742
743 TAILQ_INIT(&so->so_incomp);
744 TAILQ_INIT(&so->so_comp);
745 so->so_type = type;
746 so->last_upid = proc_uniqueid(p);
747 so->last_pid = proc_pid(p);
748 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
749 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
750
751 if (ep != PROC_NULL && ep != p) {
752 so->e_upid = proc_uniqueid(ep);
753 so->e_pid = proc_pid(ep);
754 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
755 so->so_flags |= SOF_DELEGATED;
756 }
757
758 so->so_cred = kauth_cred_proc_ref(p);
759 if (!suser(kauth_cred_get(), NULL)) {
760 so->so_state |= SS_PRIV;
761 }
762
763 so->so_proto = prp;
764 so->so_rcv.sb_flags |= SB_RECV;
765 so->so_rcv.sb_so = so->so_snd.sb_so = so;
766 so->next_lock_lr = 0;
767 so->next_unlock_lr = 0;
768
769 #if CONFIG_MACF_SOCKET
770 mac_socket_label_associate(kauth_cred_get(), so);
771 #endif /* MAC_SOCKET */
772
773 /*
 774 * Attachment will create the per-pcb lock if necessary and
 775 * increase the refcount for creation; make sure this is done
 776 * before the socket is inserted in any lists.
777 */
778 so->so_usecount++;
779
780 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
781 if (error != 0) {
782 /*
783 * Warning:
784 * If so_pcb is not zero, the socket will be leaked,
 785 * so the protocol attachment handler must be coded carefully.
786 */
787 so->so_state |= SS_NOFDREF;
788 VERIFY(so->so_usecount > 0);
789 so->so_usecount--;
790 sofreelastref(so, 1); /* will deallocate the socket */
791 return error;
792 }
793
794 atomic_add_32(&prp->pr_domain->dom_refs, 1);
795 TAILQ_INIT(&so->so_evlist);
796
797 /* Attach socket filters for this protocol */
798 sflt_initsock(so);
799 #if TCPDEBUG
800 if (tcpconsdebug == 2) {
801 so->so_options |= SO_DEBUG;
802 }
803 #endif
804 so_set_default_traffic_class(so);
805
806 /*
807 * If this thread or task is marked to create backgrounded sockets,
808 * mark the socket as background.
809 */
810 if (proc_get_effective_thread_policy(current_thread(),
811 TASK_POLICY_NEW_SOCKETS_BG)) {
812 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
813 so->so_background_thread = current_thread();
814 }
815
816 switch (dom) {
817 /*
818 * Don't mark Unix domain, system or multipath sockets as
819 * eligible for defunct by default.
820 */
821 case PF_LOCAL:
822 case PF_SYSTEM:
823 case PF_MULTIPATH:
824 so->so_flags |= SOF_NODEFUNCT;
825 break;
826 default:
827 break;
828 }
829
830 /*
831 * Entitlements can't be checked at socket creation time except if the
 832 * application requested a feature guarded by a privilege (cf. socket
833 * delegation).
834 * The priv(9) and the Sandboxing APIs are designed with the idea that
835 * a privilege check should only be triggered by a userland request.
836 * A privilege check at socket creation time is time consuming and
837 * could trigger many authorisation error messages from the security
838 * APIs.
839 */
840
841 *aso = so;
842
843 return 0;
844 }
845
846 /*
847 * Returns: 0 Success
848 * EAFNOSUPPORT
849 * EPROTOTYPE
850 * EPROTONOSUPPORT
851 * ENOBUFS
852 * <pru_attach>:ENOBUFS[AF_UNIX]
853 * <pru_attach>:ENOBUFS[TCP]
854 * <pru_attach>:ENOMEM[TCP]
855 * <pru_attach>:??? [other protocol families, IPSEC]
856 */
857 int
858 socreate(int dom, struct socket **aso, int type, int proto)
859 {
860 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
861 PROC_NULL);
862 }
863
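/*
 * Minimal in-kernel usage sketch (editor addition, not part of xnu): create a
 * TCP socket with socreate() and release it with soclose(), the internal
 * entry points defined in this file. Kernel extensions would normally use the
 * sock_socket()/sock_close() KPI instead.
 */
#if 0	/* illustrative only */
static int
example_make_and_close_tcp_socket(void)
{
	struct socket *so = NULL;
	int error;

	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
	if (error != 0) {
		return error;
	}

	/* ... use the socket under socket_lock()/socket_unlock() ... */

	return soclose(so);
}
#endif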
864 int
865 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
866 {
867 int error = 0;
868 struct proc *ep = PROC_NULL;
869
870 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
871 error = ESRCH;
872 goto done;
873 }
874
875 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
876
877 /*
878 * It might not be wise to hold the proc reference when calling
879 * socreate_internal since it calls soalloc with M_WAITOK
880 */
881 done:
882 if (ep != PROC_NULL) {
883 proc_rele(ep);
884 }
885
886 return error;
887 }
888
889 /*
890 * Returns: 0 Success
891 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
892 * <pru_bind>:EAFNOSUPPORT Address family not supported
893 * <pru_bind>:EADDRNOTAVAIL Address not available.
894 * <pru_bind>:EINVAL Invalid argument
895 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
896 * <pru_bind>:EACCES Permission denied
897 * <pru_bind>:EADDRINUSE Address in use
898 * <pru_bind>:EAGAIN Resource unavailable, try again
899 * <pru_bind>:EPERM Operation not permitted
900 * <pru_bind>:???
901 * <sf_bind>:???
902 *
903 * Notes: It's not possible to fully enumerate the return codes above,
904 * since socket filter authors and protocol family authors may
905 * not choose to limit their error returns to those listed, even
906 * though this may result in some software operating incorrectly.
907 *
908 * The error codes which are enumerated above are those known to
909 * be returned by the tcp_usr_bind function supplied.
910 */
911 int
912 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
913 {
914 struct proc *p = current_proc();
915 int error = 0;
916
917 if (dolock) {
918 socket_lock(so, 1);
919 }
920
921 so_update_last_owner_locked(so, p);
922 so_update_policy(so);
923
924 #if NECP
925 so_update_necp_policy(so, nam, NULL);
926 #endif /* NECP */
927
928 /*
929 * If this is a bind request on a socket that has been marked
930 * as inactive, reject it now before we go any further.
931 */
932 if (so->so_flags & SOF_DEFUNCT) {
933 error = EINVAL;
934 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
935 __func__, proc_pid(p), proc_best_name(p),
936 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
937 SOCK_DOM(so), SOCK_TYPE(so), error);
938 goto out;
939 }
940
941 /* Socket filter */
942 error = sflt_bind(so, nam);
943
944 if (error == 0) {
945 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
946 }
947 out:
948 if (dolock) {
949 socket_unlock(so, 1);
950 }
951
952 if (error == EJUSTRETURN) {
953 error = 0;
954 }
955
956 return error;
957 }
958
959 void
960 sodealloc(struct socket *so)
961 {
962 kauth_cred_unref(&so->so_cred);
963
964 /* Remove any filters */
965 sflt_termsock(so);
966
967 #if CONTENT_FILTER
968 cfil_sock_detach(so);
969 #endif /* CONTENT_FILTER */
970
971 /* Delete the state allocated for msg queues on a socket */
972 if (so->so_flags & SOF_ENABLE_MSGS) {
973 FREE(so->so_msg_state, M_TEMP);
974 so->so_msg_state = NULL;
975 }
976 VERIFY(so->so_msg_state == NULL);
977
978 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
979
980 #if CONFIG_MACF_SOCKET
981 mac_socket_label_destroy(so);
982 #endif /* MAC_SOCKET */
983
984 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
985 cached_sock_free(so);
986 } else {
987 FREE_ZONE(so, sizeof(*so), so->so_zone);
988 }
989 }
990
991 /*
992 * Returns: 0 Success
993 * EINVAL
994 * EOPNOTSUPP
995 * <pru_listen>:EINVAL[AF_UNIX]
996 * <pru_listen>:EINVAL[TCP]
997 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
998 * <pru_listen>:EINVAL[TCP] Invalid argument
999 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
1000 * <pru_listen>:EACCES[TCP] Permission denied
1001 * <pru_listen>:EADDRINUSE[TCP] Address in use
1002 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
1003 * <pru_listen>:EPERM[TCP] Operation not permitted
1004 * <sf_listen>:???
1005 *
1006 * Notes: Other <pru_listen> returns depend on the protocol family; all
1007 * <sf_listen> returns depend on what the filter author causes
1008 * their filter to return.
1009 */
1010 int
1011 solisten(struct socket *so, int backlog)
1012 {
1013 struct proc *p = current_proc();
1014 int error = 0;
1015
1016 socket_lock(so, 1);
1017
1018 so_update_last_owner_locked(so, p);
1019 so_update_policy(so);
1020
1021 #if NECP
1022 so_update_necp_policy(so, NULL, NULL);
1023 #endif /* NECP */
1024
1025 if (so->so_proto == NULL) {
1026 error = EINVAL;
1027 goto out;
1028 }
1029 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1030 error = EOPNOTSUPP;
1031 goto out;
1032 }
1033
1034 /*
1035 * If the listen request is made on a socket that is not fully
1036 * disconnected, or on a socket that has been marked as inactive,
1037 * reject the request now.
1038 */
1039 if ((so->so_state &
1040 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1041 (so->so_flags & SOF_DEFUNCT)) {
1042 error = EINVAL;
1043 if (so->so_flags & SOF_DEFUNCT) {
1044 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1045 "(%d)\n", __func__, proc_pid(p),
1046 proc_best_name(p),
1047 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1048 SOCK_DOM(so), SOCK_TYPE(so), error);
1049 }
1050 goto out;
1051 }
1052
1053 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1054 error = EPERM;
1055 goto out;
1056 }
1057
1058 error = sflt_listen(so);
1059 if (error == 0) {
1060 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1061 }
1062
1063 if (error) {
1064 if (error == EJUSTRETURN) {
1065 error = 0;
1066 }
1067 goto out;
1068 }
1069
1070 if (TAILQ_EMPTY(&so->so_comp)) {
1071 so->so_options |= SO_ACCEPTCONN;
1072 }
1073 /*
1074 * POSIX: The implementation may have an upper limit on the length of
 1075 * the listen queue, either global or per accepting socket. If backlog
1076 * exceeds this limit, the length of the listen queue is set to the
1077 * limit.
1078 *
1079 * If listen() is called with a backlog argument value that is less
1080 * than 0, the function behaves as if it had been called with a backlog
1081 * argument value of 0.
1082 *
1083 * A backlog argument of 0 may allow the socket to accept connections,
1084 * in which case the length of the listen queue may be set to an
1085 * implementation-defined minimum value.
1086 */
1087 if (backlog <= 0 || backlog > somaxconn) {
1088 backlog = somaxconn;
1089 }
1090
1091 so->so_qlimit = backlog;
1092 out:
1093 socket_unlock(so, 1);
1094 return error;
1095 }
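/*
 * Illustrative userland sketch (editor addition, not part of xnu): per the
 * POSIX note above, a backlog larger than kern.ipc.somaxconn is silently
 * clamped to somaxconn, and a non-positive backlog falls back to it as well.
 */
#if 0	/* userland example, not kernel code */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
example_listener(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;

	if (fd < 0) {
		return -1;
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8080);		/* arbitrary example port */
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) != 0) {
		return -1;
	}
	/* Ask for a huge backlog; so_qlimit ends up capped at somaxconn. */
	return listen(fd, 100000);
}
#endif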
1096
1097 /*
1098 * The "accept list lock" protects the fields related to the listener queues
1099 * because we can unlock a socket to respect the lock ordering between
 1100 * the listener socket and its client sockets. The lock ordering requires
 1101 * acquiring a client socket's lock before the listener socket's lock.
1102 *
1103 * The accept list lock serializes access to the following fields:
1104 * - of the listener socket:
1105 * - so_comp
1106 * - so_incomp
1107 * - so_qlen
1108 * - so_inqlen
1109 * - of client sockets that are in so_comp or so_incomp:
1110 * - so_head
1111 * - so_list
1112 *
 1113 * As one can see, the accept list lock protects the consistency of the
 1114 * linkage of the client sockets.
1115 *
1116 * Note that those fields may be read without holding the accept list lock
1117 * for a preflight provided the accept list lock is taken when committing
1118 * to take an action based on the result of the preflight. The preflight
1119 * saves the cost of doing the unlock/lock dance.
1120 */
1121 void
1122 so_acquire_accept_list(struct socket *head, struct socket *so)
1123 {
1124 lck_mtx_t *mutex_held;
1125
1126 if (head->so_proto->pr_getlock == NULL) {
1127 return;
1128 }
1129 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1130 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1131
1132 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1133 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1134 return;
1135 }
1136 if (so != NULL) {
1137 socket_unlock(so, 0);
1138 }
1139 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1140 so_accept_list_waits += 1;
1141 msleep((caddr_t)&head->so_incomp, mutex_held,
1142 PSOCK | PCATCH, __func__, NULL);
1143 }
1144 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1145 if (so != NULL) {
1146 socket_unlock(head, 0);
1147 socket_lock(so, 0);
1148 socket_lock(head, 0);
1149 }
1150 }
1151
1152 void
1153 so_release_accept_list(struct socket *head)
1154 {
1155 if (head->so_proto->pr_getlock != NULL) {
1156 lck_mtx_t *mutex_held;
1157
1158 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1159 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1160
1161 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1162 wakeup((caddr_t)&head->so_incomp);
1163 }
1164 }
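/*
 * Usage sketch (editor addition, not part of xnu): a caller that needs to
 * walk or edit the listener queues follows the pattern below, mirroring what
 * sofreelastref() and soclose_locked() do later in this file.
 */
#if 0	/* illustrative only */
static void
example_walk_completed_queue(struct socket *head)
{
	struct socket *sp;

	socket_lock(head, 1);
	so_acquire_accept_list(head, NULL);

	TAILQ_FOREACH(sp, &head->so_comp, so_list) {
		/* ... sp's linkage is stable while the accept list is held ... */
	}

	so_release_accept_list(head);
	socket_unlock(head, 1);
}
#endif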
1165
1166 void
1167 sofreelastref(struct socket *so, int dealloc)
1168 {
1169 struct socket *head = so->so_head;
1170
1171 /* Assume socket is locked */
1172
1173 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1174 selthreadclear(&so->so_snd.sb_sel);
1175 selthreadclear(&so->so_rcv.sb_sel);
1176 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1177 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1178 so->so_event = sonullevent;
1179 return;
1180 }
1181 if (head != NULL) {
1182 /*
1183 * Need to lock the listener when the protocol has
1184 * per socket locks
1185 */
1186 if (head->so_proto->pr_getlock != NULL) {
1187 socket_lock(head, 1);
1188 so_acquire_accept_list(head, so);
1189 }
1190 if (so->so_state & SS_INCOMP) {
1191 so->so_state &= ~SS_INCOMP;
1192 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1193 head->so_incqlen--;
1194 head->so_qlen--;
1195 so->so_head = NULL;
1196
1197 if (head->so_proto->pr_getlock != NULL) {
1198 so_release_accept_list(head);
1199 socket_unlock(head, 1);
1200 }
1201 } else if (so->so_state & SS_COMP) {
1202 if (head->so_proto->pr_getlock != NULL) {
1203 so_release_accept_list(head);
1204 socket_unlock(head, 1);
1205 }
1206 /*
1207 * We must not decommission a socket that's
1208 * on the accept(2) queue. If we do, then
1209 * accept(2) may hang after select(2) indicated
1210 * that the listening socket was ready.
1211 */
1212 selthreadclear(&so->so_snd.sb_sel);
1213 selthreadclear(&so->so_rcv.sb_sel);
1214 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1215 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1216 so->so_event = sonullevent;
1217 return;
1218 } else {
1219 if (head->so_proto->pr_getlock != NULL) {
1220 so_release_accept_list(head);
1221 socket_unlock(head, 1);
1222 }
1223 printf("sofree: not queued\n");
1224 }
1225 }
1226 sowflush(so);
1227 sorflush(so);
1228
1229 #if FLOW_DIVERT
1230 if (so->so_flags & SOF_FLOW_DIVERT) {
1231 flow_divert_detach(so);
1232 }
1233 #endif /* FLOW_DIVERT */
1234
1235 /* 3932268: disable upcall */
1236 so->so_rcv.sb_flags &= ~SB_UPCALL;
1237 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1238 so->so_event = sonullevent;
1239
1240 if (dealloc) {
1241 sodealloc(so);
1242 }
1243 }
1244
1245 void
1246 soclose_wait_locked(struct socket *so)
1247 {
1248 lck_mtx_t *mutex_held;
1249
1250 if (so->so_proto->pr_getlock != NULL) {
1251 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1252 } else {
1253 mutex_held = so->so_proto->pr_domain->dom_mtx;
1254 }
1255 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1256
1257 /*
1258 * Double check here and return if there's no outstanding upcall;
1259 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1260 */
1261 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1262 return;
1263 }
1264 so->so_rcv.sb_flags &= ~SB_UPCALL;
1265 so->so_snd.sb_flags &= ~SB_UPCALL;
1266 so->so_flags |= SOF_CLOSEWAIT;
1267
1268 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1269 "soclose_wait_locked", NULL);
1270 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1271 so->so_flags &= ~SOF_CLOSEWAIT;
1272 }
1273
1274 /*
1275 * Close a socket on last file table reference removal.
1276 * Initiate disconnect if connected.
1277 * Free socket when disconnect complete.
1278 */
1279 int
1280 soclose_locked(struct socket *so)
1281 {
1282 int error = 0;
1283 struct timespec ts;
1284
1285 if (so->so_usecount == 0) {
1286 panic("soclose: so=%p refcount=0\n", so);
1287 /* NOTREACHED */
1288 }
1289
1290 sflt_notify(so, sock_evt_closing, NULL);
1291
1292 if (so->so_upcallusecount) {
1293 soclose_wait_locked(so);
1294 }
1295
1296 #if CONTENT_FILTER
1297 /*
1298 * We have to wait until the content filters are done
1299 */
1300 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1301 cfil_sock_close_wait(so);
1302 cfil_sock_is_closed(so);
1303 cfil_sock_detach(so);
1304 }
1305 #endif /* CONTENT_FILTER */
1306
1307 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1308 soresume(current_proc(), so, 1);
1309 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1310 }
1311
1312 if ((so->so_options & SO_ACCEPTCONN)) {
1313 struct socket *sp, *sonext;
1314 int persocklock = 0;
1315 int incomp_overflow_only;
1316
1317 /*
 1318 * We do not want new connections to be added
1319 * to the connection queues
1320 */
1321 so->so_options &= ~SO_ACCEPTCONN;
1322
1323 /*
1324 * We can drop the lock on the listener once
1325 * we've acquired the incoming list
1326 */
1327 if (so->so_proto->pr_getlock != NULL) {
1328 persocklock = 1;
1329 so_acquire_accept_list(so, NULL);
1330 socket_unlock(so, 0);
1331 }
1332 again:
1333 incomp_overflow_only = 1;
1334
1335 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1336 /*
1337 * Radar 5350314
 1338 * Skip sockets thrown away by tcp_dropdropablreq();
 1339 * they will get cleaned up by the garbage collection.
 1340 * Otherwise, remove the incomp socket from the queue
 1341 * and let soabort trigger the appropriate cleanup.
1342 */
1343 if (sp->so_flags & SOF_OVERFLOW) {
1344 continue;
1345 }
1346
1347 if (persocklock != 0) {
1348 socket_lock(sp, 1);
1349 }
1350
1351 /*
1352 * Radar 27945981
 1353 * The extra reference for the list ensures the
1354 * validity of the socket pointer when we perform the
1355 * unlock of the head above
1356 */
1357 if (sp->so_state & SS_INCOMP) {
1358 sp->so_state &= ~SS_INCOMP;
1359 sp->so_head = NULL;
1360 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1361 so->so_incqlen--;
1362 so->so_qlen--;
1363
1364 (void) soabort(sp);
1365 } else {
1366 panic("%s sp %p in so_incomp but !SS_INCOMP",
1367 __func__, sp);
1368 }
1369
1370 if (persocklock != 0) {
1371 socket_unlock(sp, 1);
1372 }
1373 }
1374
1375 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1376 /* Dequeue from so_comp since sofree() won't do it */
1377 if (persocklock != 0) {
1378 socket_lock(sp, 1);
1379 }
1380
1381 if (sp->so_state & SS_COMP) {
1382 sp->so_state &= ~SS_COMP;
1383 sp->so_head = NULL;
1384 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1385 so->so_qlen--;
1386
1387 (void) soabort(sp);
1388 } else {
1389 panic("%s sp %p in so_comp but !SS_COMP",
1390 __func__, sp);
1391 }
1392
1393 if (persocklock) {
1394 socket_unlock(sp, 1);
1395 }
1396 }
1397
1398 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1399 #if (DEBUG | DEVELOPMENT)
1400 panic("%s head %p so_comp not empty\n", __func__, so);
1401 #endif /* (DEVELOPMENT || DEBUG) */
1402
1403 goto again;
1404 }
1405
1406 if (!TAILQ_EMPTY(&so->so_comp)) {
1407 #if (DEBUG | DEVELOPMENT)
1408 panic("%s head %p so_comp not empty\n", __func__, so);
1409 #endif /* (DEVELOPMENT || DEBUG) */
1410
1411 goto again;
1412 }
1413
1414 if (persocklock) {
1415 socket_lock(so, 0);
1416 so_release_accept_list(so);
1417 }
1418 }
1419 if (so->so_pcb == NULL) {
1420 /* 3915887: mark the socket as ready for dealloc */
1421 so->so_flags |= SOF_PCBCLEARING;
1422 goto discard;
1423 }
1424 if (so->so_state & SS_ISCONNECTED) {
1425 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1426 error = sodisconnectlocked(so);
1427 if (error) {
1428 goto drop;
1429 }
1430 }
1431 if (so->so_options & SO_LINGER) {
1432 lck_mtx_t *mutex_held;
1433
1434 if ((so->so_state & SS_ISDISCONNECTING) &&
1435 (so->so_state & SS_NBIO)) {
1436 goto drop;
1437 }
1438 if (so->so_proto->pr_getlock != NULL) {
1439 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1440 } else {
1441 mutex_held = so->so_proto->pr_domain->dom_mtx;
1442 }
1443 while (so->so_state & SS_ISCONNECTED) {
1444 ts.tv_sec = (so->so_linger / 100);
1445 ts.tv_nsec = (so->so_linger % 100) *
1446 NSEC_PER_USEC * 1000 * 10;
1447 error = msleep((caddr_t)&so->so_timeo,
1448 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1449 if (error) {
1450 /*
 1451 * It's OK when the timer fires;
 1452 * don't report an error.
1453 */
1454 if (error == EWOULDBLOCK) {
1455 error = 0;
1456 }
1457 break;
1458 }
1459 }
1460 }
1461 }
1462 drop:
1463 if (so->so_usecount == 0) {
1464 panic("soclose: usecount is zero so=%p\n", so);
1465 /* NOTREACHED */
1466 }
1467 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1468 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1469 if (error == 0) {
1470 error = error2;
1471 }
1472 }
1473 if (so->so_usecount <= 0) {
1474 panic("soclose: usecount is zero so=%p\n", so);
1475 /* NOTREACHED */
1476 }
1477 discard:
1478 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1479 (so->so_state & SS_NOFDREF)) {
1480 panic("soclose: NOFDREF");
1481 /* NOTREACHED */
1482 }
1483 so->so_state |= SS_NOFDREF;
1484
1485 if ((so->so_flags & SOF_KNOTE) != 0) {
1486 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1487 }
1488
1489 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1490 evsofree(so);
1491
1492 VERIFY(so->so_usecount > 0);
1493 so->so_usecount--;
1494 sofree(so);
1495 return error;
1496 }
1497
1498 int
1499 soclose(struct socket *so)
1500 {
1501 int error = 0;
1502 socket_lock(so, 1);
1503
1504 if (so->so_retaincnt == 0) {
1505 error = soclose_locked(so);
1506 } else {
1507 /*
 1508 * If the FD is going away but the socket is
 1509 * retained in the kernel, remove its reference.
1510 */
1511 so->so_usecount--;
1512 if (so->so_usecount < 2) {
1513 panic("soclose: retaincnt non null and so=%p "
1514 "usecount=%d\n", so, so->so_usecount);
1515 }
1516 }
1517 socket_unlock(so, 1);
1518 return error;
1519 }
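/*
 * Illustrative userland sketch (editor addition, not part of xnu): the linger
 * loop in soclose_locked() above sleeps for so_linger ticks (hundredths of a
 * second, hence the division by 100). Darwin's SO_LINGER_SEC variant lets the
 * interval be expressed in seconds; plain SO_LINGER stores the value as given.
 */
#if 0	/* userland example, not kernel code */
#include <sys/socket.h>

static int
example_set_linger(int fd)
{
	struct linger l;

	l.l_onoff = 1;		/* linger on close if data is pending */
	l.l_linger = 5;		/* five seconds with SO_LINGER_SEC */

	return setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
}
#endif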
1520
1521 /*
1522 * Must be called at splnet...
1523 */
1524 /* Should already be locked */
1525 int
1526 soabort(struct socket *so)
1527 {
1528 int error;
1529
1530 #ifdef MORE_LOCKING_DEBUG
1531 lck_mtx_t *mutex_held;
1532
1533 if (so->so_proto->pr_getlock != NULL) {
1534 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1535 } else {
1536 mutex_held = so->so_proto->pr_domain->dom_mtx;
1537 }
1538 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1539 #endif
1540
1541 if ((so->so_flags & SOF_ABORTED) == 0) {
1542 so->so_flags |= SOF_ABORTED;
1543 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1544 if (error) {
1545 sofree(so);
1546 return error;
1547 }
1548 }
1549 return 0;
1550 }
1551
1552 int
1553 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1554 {
1555 int error;
1556
1557 if (dolock) {
1558 socket_lock(so, 1);
1559 }
1560
1561 so_update_last_owner_locked(so, PROC_NULL);
1562 so_update_policy(so);
1563 #if NECP
1564 so_update_necp_policy(so, NULL, NULL);
1565 #endif /* NECP */
1566
1567 if ((so->so_state & SS_NOFDREF) == 0) {
1568 panic("soaccept: !NOFDREF");
1569 }
1570 so->so_state &= ~SS_NOFDREF;
1571 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1572
1573 if (dolock) {
1574 socket_unlock(so, 1);
1575 }
1576 return error;
1577 }
1578
1579 int
1580 soaccept(struct socket *so, struct sockaddr **nam)
1581 {
1582 return soacceptlock(so, nam, 1);
1583 }
1584
1585 int
1586 soacceptfilter(struct socket *so, struct socket *head)
1587 {
1588 struct sockaddr *local = NULL, *remote = NULL;
1589 int error = 0;
1590
1591 /*
1592 * Hold the lock even if this socket has not been made visible
1593 * to the filter(s). For sockets with global locks, this protects
1594 * against the head or peer going away
1595 */
1596 socket_lock(so, 1);
1597 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1598 sogetaddr_locked(so, &local, 0) != 0) {
1599 so->so_state &= ~SS_NOFDREF;
1600 socket_unlock(so, 1);
1601 soclose(so);
1602 /* Out of resources; try it again next time */
1603 error = ECONNABORTED;
1604 goto done;
1605 }
1606
1607 error = sflt_accept(head, so, local, remote);
1608
1609 /*
1610 * If we get EJUSTRETURN from one of the filters, mark this socket
1611 * as inactive and return it anyway. This newly accepted socket
1612 * will be disconnected later before we hand it off to the caller.
1613 */
1614 if (error == EJUSTRETURN) {
1615 error = 0;
1616 (void) sosetdefunct(current_proc(), so,
1617 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1618 }
1619
1620 if (error != 0) {
1621 /*
1622 * This may seem like a duplication to the above error
1623 * handling part when we return ECONNABORTED, except
1624 * the following is done while holding the lock since
1625 * the socket has been exposed to the filter(s) earlier.
1626 */
1627 so->so_state &= ~SS_NOFDREF;
1628 socket_unlock(so, 1);
1629 soclose(so);
1630 /* Propagate socket filter's error code to the caller */
1631 } else {
1632 socket_unlock(so, 1);
1633 }
1634 done:
1635 /* Callee checks for NULL pointer */
1636 sock_freeaddr(remote);
1637 sock_freeaddr(local);
1638 return error;
1639 }
1640
1641 /*
1642 * Returns: 0 Success
1643 * EOPNOTSUPP Operation not supported on socket
1644 * EISCONN Socket is connected
1645 * <pru_connect>:EADDRNOTAVAIL Address not available.
1646 * <pru_connect>:EINVAL Invalid argument
1647 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1648 * <pru_connect>:EACCES Permission denied
1649 * <pru_connect>:EADDRINUSE Address in use
1650 * <pru_connect>:EAGAIN Resource unavailable, try again
1651 * <pru_connect>:EPERM Operation not permitted
1652 * <sf_connect_out>:??? [anything a filter writer might set]
1653 */
1654 int
1655 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1656 {
1657 int error;
1658 struct proc *p = current_proc();
1659
1660 if (dolock) {
1661 socket_lock(so, 1);
1662 }
1663
1664 so_update_last_owner_locked(so, p);
1665 so_update_policy(so);
1666
1667 #if NECP
1668 so_update_necp_policy(so, NULL, nam);
1669 #endif /* NECP */
1670
1671 /*
1672 * If this is a listening socket or if this is a previously-accepted
1673 * socket that has been marked as inactive, reject the connect request.
1674 */
1675 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1676 error = EOPNOTSUPP;
1677 if (so->so_flags & SOF_DEFUNCT) {
1678 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1679 "(%d)\n", __func__, proc_pid(p),
1680 proc_best_name(p),
1681 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1682 SOCK_DOM(so), SOCK_TYPE(so), error);
1683 }
1684 if (dolock) {
1685 socket_unlock(so, 1);
1686 }
1687 return error;
1688 }
1689
1690 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1691 if (dolock) {
1692 socket_unlock(so, 1);
1693 }
1694 return EPERM;
1695 }
1696
1697 /*
1698 * If protocol is connection-based, can only connect once.
1699 * Otherwise, if connected, try to disconnect first.
1700 * This allows user to disconnect by connecting to, e.g.,
1701 * a null address.
1702 */
1703 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1704 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1705 (error = sodisconnectlocked(so)))) {
1706 error = EISCONN;
1707 } else {
1708 /*
1709 * Run connect filter before calling protocol:
1710 * - non-blocking connect returns before completion;
1711 */
1712 error = sflt_connectout(so, nam);
1713 if (error != 0) {
1714 if (error == EJUSTRETURN) {
1715 error = 0;
1716 }
1717 } else {
1718 error = (*so->so_proto->pr_usrreqs->pru_connect)
1719 (so, nam, p);
1720 }
1721 }
1722 if (dolock) {
1723 socket_unlock(so, 1);
1724 }
1725 return error;
1726 }
1727
1728 int
1729 soconnect(struct socket *so, struct sockaddr *nam)
1730 {
1731 return soconnectlock(so, nam, 1);
1732 }
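/*
 * Illustrative userland sketch (editor addition, not part of xnu): the
 * "disconnect by connecting to a null address" behavior noted above can be
 * exercised on a connectionless (e.g. UDP) socket by calling connect(2) with
 * an address whose family is AF_UNSPEC, which dissolves the association.
 */
#if 0	/* userland example, not kernel code */
#include <sys/socket.h>
#include <string.h>

static int
example_dissolve_udp_association(int udp_fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_len = sizeof(sa);
	sa.sa_family = AF_UNSPEC;

	return connect(udp_fd, &sa, sizeof(sa));
}
#endif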
1733
1734 /*
1735 * Returns: 0 Success
1736 * <pru_connect2>:EINVAL[AF_UNIX]
1737 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1738 * <pru_connect2>:??? [other protocol families]
1739 *
1740 * Notes: <pru_connect2> is not supported by [TCP].
1741 */
1742 int
1743 soconnect2(struct socket *so1, struct socket *so2)
1744 {
1745 int error;
1746
1747 socket_lock(so1, 1);
1748 if (so2->so_proto->pr_lock) {
1749 socket_lock(so2, 1);
1750 }
1751
1752 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1753
1754 socket_unlock(so1, 1);
1755 if (so2->so_proto->pr_lock) {
1756 socket_unlock(so2, 1);
1757 }
1758 return error;
1759 }
1760
1761 int
1762 soconnectxlocked(struct socket *so, struct sockaddr *src,
1763 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1764 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1765 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1766 {
1767 int error;
1768
1769 so_update_last_owner_locked(so, p);
1770 so_update_policy(so);
1771
1772 /*
1773 * If this is a listening socket or if this is a previously-accepted
1774 * socket that has been marked as inactive, reject the connect request.
1775 */
1776 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1777 error = EOPNOTSUPP;
1778 if (so->so_flags & SOF_DEFUNCT) {
1779 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1780 "(%d)\n", __func__, proc_pid(p),
1781 proc_best_name(p),
1782 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1783 SOCK_DOM(so), SOCK_TYPE(so), error);
1784 }
1785 return error;
1786 }
1787
1788 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1789 return EPERM;
1790 }
1791
1792 /*
1793 * If protocol is connection-based, can only connect once
1794 * unless PR_MULTICONN is set. Otherwise, if connected,
1795 * try to disconnect first. This allows user to disconnect
1796 * by connecting to, e.g., a null address.
1797 */
1798 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1799 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1800 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1801 (error = sodisconnectlocked(so)) != 0)) {
1802 error = EISCONN;
1803 } else {
1804 /*
1805 * Run connect filter before calling protocol:
1806 * - non-blocking connect returns before completion;
1807 */
1808 error = sflt_connectout(so, dst);
1809 if (error != 0) {
1810 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1811 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1812 if (error == EJUSTRETURN) {
1813 error = 0;
1814 }
1815 } else {
1816 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1817 (so, src, dst, p, ifscope, aid, pcid,
1818 flags, arg, arglen, auio, bytes_written);
1819 }
1820 }
1821
1822 return error;
1823 }
1824
1825 int
1826 sodisconnectlocked(struct socket *so)
1827 {
1828 int error;
1829
1830 if ((so->so_state & SS_ISCONNECTED) == 0) {
1831 error = ENOTCONN;
1832 goto bad;
1833 }
1834 if (so->so_state & SS_ISDISCONNECTING) {
1835 error = EALREADY;
1836 goto bad;
1837 }
1838
1839 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1840 if (error == 0) {
1841 sflt_notify(so, sock_evt_disconnected, NULL);
1842 }
1843
1844 bad:
1845 return error;
1846 }
1847
1848 /* Locking version */
1849 int
1850 sodisconnect(struct socket *so)
1851 {
1852 int error;
1853
1854 socket_lock(so, 1);
1855 error = sodisconnectlocked(so);
1856 socket_unlock(so, 1);
1857 return error;
1858 }
1859
1860 int
1861 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1862 {
1863 int error;
1864
1865 /*
1866 * Call the protocol disconnectx handler; let it handle all
1867 * matters related to the connection state of this session.
1868 */
1869 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1870 if (error == 0) {
1871 /*
1872 * The event applies only for the session, not for
1873 * the disconnection of individual subflows.
1874 */
1875 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1876 sflt_notify(so, sock_evt_disconnected, NULL);
1877 }
1878 }
1879 return error;
1880 }
1881
1882 int
1883 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1884 {
1885 int error;
1886
1887 socket_lock(so, 1);
1888 error = sodisconnectxlocked(so, aid, cid);
1889 socket_unlock(so, 1);
1890 return error;
1891 }
1892
1893 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1894
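
#if 0	/* Illustrative sketch -- not part of the build. */
/*
 * Example of the SBLOCKWAIT() convention used by the send and
 * receive paths below: MSG_DONTWAIT turns the sblock() acquisition
 * into a non-blocking attempt (which may fail with EWOULDBLOCK),
 * while the default is to wait (SBL_WAIT). This assumes the caller
 * already holds the socket lock, as the routines in this file do.
 */
static int
example_sblock_for_flags(struct socket *so, int msg_flags)
{
	int error;

	error = sblock(&so->so_snd, SBLOCKWAIT(msg_flags));
	if (error != 0) {
		return error;
	}
	sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
	return 0;
}
#endif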
1895 /*
1896 * sosendcheck will lock the socket buffer if it isn't locked and
1897 * verify that there is space for the data being inserted.
1898 *
1899 * Returns: 0 Success
1900 * EPIPE
1901 * sblock:EWOULDBLOCK
1902 * sblock:EINTR
1903 * sbwait:EBADF
1904 * sbwait:EINTR
1905 * [so_error]:???
1906 */
1907 int
1908 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1909 int32_t clen, int32_t atomic, int flags, int *sblocked,
1910 struct mbuf *control)
1911 {
1912 int error = 0;
1913 int32_t space;
1914 int assumelock = 0;
1915
1916 restart:
1917 if (*sblocked == 0) {
1918 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1919 so->so_send_filt_thread != 0 &&
1920 so->so_send_filt_thread == current_thread()) {
1921 /*
1922 * We're being called recursively from a filter,
1923 * allow this to continue. Radar 4150520.
1924 * Don't set sblocked because we don't want
1925 * to perform an unlock later.
1926 */
1927 assumelock = 1;
1928 } else {
1929 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1930 if (error) {
1931 if (so->so_flags & SOF_DEFUNCT) {
1932 goto defunct;
1933 }
1934 return error;
1935 }
1936 *sblocked = 1;
1937 }
1938 }
1939
1940 /*
1941 * If a send attempt is made on a socket that has been marked
1942 * as inactive (disconnected), reject the request.
1943 */
1944 if (so->so_flags & SOF_DEFUNCT) {
1945 defunct:
1946 error = EPIPE;
1947 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1948 __func__, proc_selfpid(), proc_best_name(current_proc()),
1949 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1950 SOCK_DOM(so), SOCK_TYPE(so), error);
1951 return error;
1952 }
1953
1954 if (so->so_state & SS_CANTSENDMORE) {
1955 #if CONTENT_FILTER
1956 /*
1957 * Can re-inject data of half closed connections
1958 */
1959 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1960 so->so_snd.sb_cfil_thread == current_thread() &&
1961 cfil_sock_data_pending(&so->so_snd) != 0) {
1962 CFIL_LOG(LOG_INFO,
1963 "so %llx ignore SS_CANTSENDMORE",
1964 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1965 } else
1966 #endif /* CONTENT_FILTER */
1967 return EPIPE;
1968 }
1969 if (so->so_error) {
1970 error = so->so_error;
1971 so->so_error = 0;
1972 return error;
1973 }
1974
1975 if ((so->so_state & SS_ISCONNECTED) == 0) {
1976 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1977 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1978 (resid != 0 || clen == 0) &&
1979 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1980 return ENOTCONN;
1981 }
1982 } else if (addr == 0 && !(flags & MSG_HOLD)) {
1983 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1984 ENOTCONN : EDESTADDRREQ;
1985 }
1986 }
1987
1988 if (so->so_flags & SOF_ENABLE_MSGS) {
1989 space = msgq_sbspace(so, control);
1990 } else {
1991 space = sbspace(&so->so_snd);
1992 }
1993
1994 if (flags & MSG_OOB) {
1995 space += 1024;
1996 }
1997 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1998 clen > so->so_snd.sb_hiwat) {
1999 return EMSGSIZE;
2000 }
2001
2002 if ((space < resid + clen &&
2003 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2004 space < clen)) ||
2005 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2006 /*
2007 * don't block the connectx call when there's more data
2008 * than can be copied.
2009 */
2010 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2011 if (space == 0) {
2012 return EWOULDBLOCK;
2013 }
2014 if (space < (int32_t)so->so_snd.sb_lowat) {
2015 return 0;
2016 }
2017 }
2018 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2019 assumelock) {
2020 return EWOULDBLOCK;
2021 }
2022 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2023 *sblocked = 0;
2024 error = sbwait(&so->so_snd);
2025 if (error) {
2026 if (so->so_flags & SOF_DEFUNCT) {
2027 goto defunct;
2028 }
2029 return error;
2030 }
2031 goto restart;
2032 }
2033 return 0;
2034 }
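
#if 0	/* Illustrative sketch -- not part of the build. */
/*
 * Hypothetical caller of sosendcheck() showing the *sblocked
 * contract: sosendcheck() takes the send-buffer lock unless it
 * detects a recursive call from a send filter, and the caller is
 * responsible for releasing whatever it acquired on the way out,
 * exactly as sosend() does at its out_locked label below.
 */
static int
example_send_prologue(struct socket *so, user_ssize_t resid, int flags)
{
	int sblocked = 0;
	int error;

	socket_lock(so, 1);
	error = sosendcheck(so, NULL, resid, 0, sosendallatonce(so),
	    flags, &sblocked, NULL);

	/* ... a real caller would build and hand off mbufs here ... */

	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}
	return error;
}
#endif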
2035
2036 /*
2037 * Send on a socket.
2038 * If send must go all at once and message is larger than
2039 * send buffering, then hard error.
2040 * Lock against other senders.
2041 * If must go all at once and not enough room now, then
2042 * inform user that this would block and do nothing.
2043 * Otherwise, if nonblocking, send as much as possible.
2044 * The data to be sent is described by "uio" if nonzero,
2045 * otherwise by the mbuf chain "top" (which must be null
2046 * if uio is not). Data provided in mbuf chain must be small
2047 * enough to send all at once.
2048 *
2049 * Returns nonzero on error, timeout or signal; callers
2050 * must check for short counts if EINTR/ERESTART are returned.
2051 * Data and control buffers are freed on return.
2052 * Experiment:
2053 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
2054 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
2055 * point at the mbuf chain being constructed and go from there.
2056 *
2057 * Returns: 0 Success
2058 * EOPNOTSUPP
2059 * EINVAL
2060 * ENOBUFS
2061 * uiomove:EFAULT
2062 * sosendcheck:EPIPE
2063 * sosendcheck:EWOULDBLOCK
2064 * sosendcheck:EINTR
2065 * sosendcheck:EBADF
2066 * sosendcheck:EINTR
2067 * sosendcheck:??? [value from so_error]
2068 * <pru_send>:ECONNRESET[TCP]
2069 * <pru_send>:EINVAL[TCP]
2070 * <pru_send>:ENOBUFS[TCP]
2071 * <pru_send>:EADDRINUSE[TCP]
2072 * <pru_send>:EADDRNOTAVAIL[TCP]
2073 * <pru_send>:EAFNOSUPPORT[TCP]
2074 * <pru_send>:EACCES[TCP]
2075 * <pru_send>:EAGAIN[TCP]
2076 * <pru_send>:EPERM[TCP]
2077 * <pru_send>:EMSGSIZE[TCP]
2078 * <pru_send>:EHOSTUNREACH[TCP]
2079 * <pru_send>:ENETUNREACH[TCP]
2080 * <pru_send>:ENETDOWN[TCP]
2081 * <pru_send>:ENOMEM[TCP]
2082 * <pru_send>:ENOBUFS[TCP]
2083 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2084 * <pru_send>:EINVAL[AF_UNIX]
2085 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2086 * <pru_send>:EPIPE[AF_UNIX]
2087 * <pru_send>:ENOTCONN[AF_UNIX]
2088 * <pru_send>:EISCONN[AF_UNIX]
2089 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2090 * <sf_data_out>:??? [whatever a filter author chooses]
2091 *
2092 * Notes: Other <pru_send> returns depend on the protocol family; all
2093 * <sf_data_out> returns depend on what the filter author causes
2094 * their filter to return.
2095 */
2096 int
2097 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2098 struct mbuf *top, struct mbuf *control, int flags)
2099 {
2100 struct mbuf **mp;
2101 struct mbuf *m, *freelist = NULL;
2102 user_ssize_t space, len, resid, orig_resid;
2103 int clen = 0, error, dontroute, mlen, sendflags;
2104 int atomic = sosendallatonce(so) || top;
2105 int sblocked = 0;
2106 struct proc *p = current_proc();
2107 struct mbuf *control_copy = NULL;
2108 uint16_t headroom = 0;
2109 boolean_t en_tracing = FALSE;
2110
2111 if (uio != NULL) {
2112 resid = uio_resid(uio);
2113 } else {
2114 resid = top->m_pkthdr.len;
2115 }
2116
2117 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2118 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2119
2120 socket_lock(so, 1);
2121
2122 /*
2123 * trace if tracing is enabled, this is a network (vs. unix)
2124 * socket, and it is non-loopback
2125 */
2126 if (ENTR_SHOULDTRACE &&
2127 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2128 struct inpcb *inp = sotoinpcb(so);
2129 if (inp->inp_last_outifp != NULL &&
2130 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2131 en_tracing = TRUE;
2132 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2133 VM_KERNEL_ADDRPERM(so),
2134 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2135 (int64_t)resid);
2136 orig_resid = resid;
2137 }
2138 }
2139
2140 /*
2141 * Re-injection should not affect process accounting
2142 */
2143 if ((flags & MSG_SKIPCFIL) == 0) {
2144 so_update_last_owner_locked(so, p);
2145 so_update_policy(so);
2146
2147 #if NECP
2148 so_update_necp_policy(so, NULL, addr);
2149 #endif /* NECP */
2150 }
2151
2152 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2153 error = EOPNOTSUPP;
2154 goto out_locked;
2155 }
2156
2157 /*
2158 * In theory resid should be unsigned.
2159 * However, space must be signed, as it might be less than 0
2160 * if we over-committed, and we must use a signed comparison
2161 * of space and resid. On the other hand, a negative resid
2162 * causes us to loop sending 0-length segments to the protocol.
2163 *
2164 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2165 * But it will be used by sockets doing message delivery.
2166 *
2167 * Note: We limit resid to be a positive int value as we use
2168 * imin() to set bytes_to_copy -- radr://14558484
2169 */
2170 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2171 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2172 error = EINVAL;
2173 goto out_locked;
2174 }
2175
2176 dontroute = (flags & MSG_DONTROUTE) &&
2177 (so->so_options & SO_DONTROUTE) == 0 &&
2178 (so->so_proto->pr_flags & PR_ATOMIC);
2179 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2180
2181 if (control != NULL) {
2182 clen = control->m_len;
2183 }
2184
2185 if (soreserveheadroom != 0) {
2186 headroom = so->so_pktheadroom;
2187 }
2188
2189 do {
2190 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2191 &sblocked, control);
2192 if (error) {
2193 goto out_locked;
2194 }
2195
2196 mp = &top;
2197 if (so->so_flags & SOF_ENABLE_MSGS) {
2198 space = msgq_sbspace(so, control);
2199 } else {
2200 space = sbspace(&so->so_snd) - clen;
2201 }
2202 space += ((flags & MSG_OOB) ? 1024 : 0);
2203
2204 do {
2205 if (uio == NULL) {
2206 /*
2207 * Data is prepackaged in "top".
2208 */
2209 resid = 0;
2210 if (flags & MSG_EOR) {
2211 top->m_flags |= M_EOR;
2212 }
2213 } else {
2214 int chainlength;
2215 int bytes_to_copy;
2216 boolean_t jumbocl;
2217 boolean_t bigcl;
2218 int bytes_to_alloc;
2219
2220 bytes_to_copy = imin(resid, space);
2221
2222 bytes_to_alloc = bytes_to_copy;
2223 if (top == NULL) {
2224 bytes_to_alloc += headroom;
2225 }
2226
2227 if (sosendminchain > 0) {
2228 chainlength = 0;
2229 } else {
2230 chainlength = sosendmaxchain;
2231 }
2232
2233 /*
2234 * Use big 4 KB clusters when the outgoing interface
2235 * does not prefer 2 KB clusters
2236 */
2237 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2238 sosendbigcl_ignore_capab;
2239
2240 /*
2241 * Attempt to use larger than system page-size
2242 * clusters for large writes only if there is
2243 * a jumbo cluster pool and if the socket is
2244 * marked accordingly.
2245 */
2246 jumbocl = sosendjcl && njcl > 0 &&
2247 ((so->so_flags & SOF_MULTIPAGES) ||
2248 sosendjcl_ignore_capab) &&
2249 bigcl;
2250
2251 socket_unlock(so, 0);
2252
2253 do {
2254 int num_needed;
2255 int hdrs_needed = (top == NULL) ? 1 : 0;
2256
2257 /*
2258 * Try to maintain a local cache of the
2259 * mbuf clusters needed to complete this
2260 * write; the list is further limited to
2261 * the number currently needed to fill
2262 * the socket. This mechanism allows a
2263 * large number of mbufs/clusters to be
2264 * grabbed under a single mbuf lock...
2265 * if we can't get any clusters, then
2266 * fall back to trying for mbufs. If we
2267 * fail early (or miscalculate the
2268 * number needed), make sure to release
2269 * any clusters we haven't yet
2270 * consumed.
2271 */
2272 if (freelist == NULL &&
2273 bytes_to_alloc > MBIGCLBYTES &&
2274 jumbocl) {
2275 num_needed =
2276 bytes_to_alloc / M16KCLBYTES;
2277
2278 if ((bytes_to_alloc -
2279 (num_needed * M16KCLBYTES))
2280 >= MINCLSIZE) {
2281 num_needed++;
2282 }
2283
2284 freelist =
2285 m_getpackets_internal(
2286 (unsigned int *)&num_needed,
2287 hdrs_needed, M_WAIT, 0,
2288 M16KCLBYTES);
2289 /*
2290 * Fall back to 4K cluster size
2291 * if allocation failed
2292 */
2293 }
2294
2295 if (freelist == NULL &&
2296 bytes_to_alloc > MCLBYTES &&
2297 bigcl) {
2298 num_needed =
2299 bytes_to_alloc / MBIGCLBYTES;
2300
2301 if ((bytes_to_alloc -
2302 (num_needed * MBIGCLBYTES)) >=
2303 MINCLSIZE) {
2304 num_needed++;
2305 }
2306
2307 freelist =
2308 m_getpackets_internal(
2309 (unsigned int *)&num_needed,
2310 hdrs_needed, M_WAIT, 0,
2311 MBIGCLBYTES);
2312 /*
2313 * Fall back to cluster size
2314 * if allocation failed
2315 */
2316 }
2317
2318 /*
2319 * Allocate a cluster as we want to
2320 * avoid splitting the data into more
2321 * than one segment; using MINCLSIZE
2322 * would lead us to allocate two mbufs.
2323 */
2324 if (soreserveheadroom != 0 &&
2325 freelist == NULL &&
2326 ((top == NULL &&
2327 bytes_to_alloc > _MHLEN) ||
2328 bytes_to_alloc > _MLEN)) {
2329 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2330 MCLBYTES;
2331 freelist =
2332 m_getpackets_internal(
2333 (unsigned int *)&num_needed,
2334 hdrs_needed, M_WAIT, 0,
2335 MCLBYTES);
2336 /*
2337 * Fall back to a single mbuf
2338 * if allocation failed
2339 */
2340 } else if (freelist == NULL &&
2341 bytes_to_alloc > MINCLSIZE) {
2342 num_needed =
2343 bytes_to_alloc / MCLBYTES;
2344
2345 if ((bytes_to_alloc -
2346 (num_needed * MCLBYTES)) >=
2347 MINCLSIZE) {
2348 num_needed++;
2349 }
2350
2351 freelist =
2352 m_getpackets_internal(
2353 (unsigned int *)&num_needed,
2354 hdrs_needed, M_WAIT, 0,
2355 MCLBYTES);
2356 /*
2357 * Fall back to a single mbuf
2358 * if allocation failed
2359 */
2360 }
2361 /*
2362 * For datagram protocols, leave
2363 * headroom for protocol headers
2364 * in the first cluster of the chain
2365 */
2366 if (freelist != NULL && atomic &&
2367 top == NULL && headroom > 0) {
2368 freelist->m_data += headroom;
2369 }
2370
2371 /*
2372 * Fall back to regular mbufs without
2373 * reserving the socket headroom
2374 */
2375 if (freelist == NULL) {
2376 if (top == NULL) {
2377 MGETHDR(freelist,
2378 M_WAIT, MT_DATA);
2379 } else {
2380 MGET(freelist,
2381 M_WAIT, MT_DATA);
2382 }
2383
2384 if (freelist == NULL) {
2385 error = ENOBUFS;
2386 socket_lock(so, 0);
2387 goto out_locked;
2388 }
2389 /*
2390 * For datagram protocols,
2391 * leave room for protocol
2392 * headers in first mbuf.
2393 */
2394 if (atomic && top == NULL &&
2395 bytes_to_copy < MHLEN) {
2396 MH_ALIGN(freelist,
2397 bytes_to_copy);
2398 }
2399 }
2400 m = freelist;
2401 freelist = m->m_next;
2402 m->m_next = NULL;
2403
2404 if ((m->m_flags & M_EXT)) {
2405 mlen = m->m_ext.ext_size -
2406 M_LEADINGSPACE(m);
2407 } else if ((m->m_flags & M_PKTHDR)) {
2408 mlen =
2409 MHLEN - M_LEADINGSPACE(m);
2410 } else {
2411 mlen = MLEN - M_LEADINGSPACE(m);
2412 }
2413 len = imin(mlen, bytes_to_copy);
2414
2415 chainlength += len;
2416
2417 space -= len;
2418
2419 error = uiomove(mtod(m, caddr_t),
2420 len, uio);
2421
2422 resid = uio_resid(uio);
2423
2424 m->m_len = len;
2425 *mp = m;
2426 top->m_pkthdr.len += len;
2427 if (error) {
2428 break;
2429 }
2430 mp = &m->m_next;
2431 if (resid <= 0) {
2432 if (flags & MSG_EOR) {
2433 top->m_flags |= M_EOR;
2434 }
2435 break;
2436 }
2437 bytes_to_copy = min(resid, space);
2438 } while (space > 0 &&
2439 (chainlength < sosendmaxchain || atomic ||
2440 resid < MINCLSIZE));
2441
2442 socket_lock(so, 0);
2443
2444 if (error) {
2445 goto out_locked;
2446 }
2447 }
2448
2449 if (flags & (MSG_HOLD | MSG_SEND)) {
2450 /* Enqueue for later, go away if HOLD */
2451 struct mbuf *mb1;
2452 if (so->so_temp && (flags & MSG_FLUSH)) {
2453 m_freem(so->so_temp);
2454 so->so_temp = NULL;
2455 }
2456 if (so->so_temp) {
2457 so->so_tail->m_next = top;
2458 } else {
2459 so->so_temp = top;
2460 }
2461 mb1 = top;
2462 while (mb1->m_next) {
2463 mb1 = mb1->m_next;
2464 }
2465 so->so_tail = mb1;
2466 if (flags & MSG_HOLD) {
2467 top = NULL;
2468 goto out_locked;
2469 }
2470 top = so->so_temp;
2471 }
2472 if (dontroute) {
2473 so->so_options |= SO_DONTROUTE;
2474 }
2475
2476 /*
2477 * Compute flags here, for pru_send and NKEs
2478 *
2479 * If the user set MSG_EOF, the protocol
2480 * understands this flag, and there is nothing
2481 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
2482 */
2483 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2484 ((flags & MSG_EOF) &&
2485 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2486 (resid <= 0)) ? PRUS_EOF :
2487 /* If there is more to send set PRUS_MORETOCOME */
2488 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2489
2490 if ((flags & MSG_SKIPCFIL) == 0) {
2491 /*
2492 * Socket filter processing
2493 */
2494 error = sflt_data_out(so, addr, &top,
2495 &control, (sendflags & MSG_OOB) ?
2496 sock_data_filt_flag_oob : 0);
2497 if (error) {
2498 if (error == EJUSTRETURN) {
2499 error = 0;
2500 clen = 0;
2501 control = NULL;
2502 top = NULL;
2503 }
2504 goto out_locked;
2505 }
2506 #if CONTENT_FILTER
2507 /*
2508 * Content filter processing
2509 */
2510 error = cfil_sock_data_out(so, addr, top,
2511 control, sendflags);
2512 if (error) {
2513 if (error == EJUSTRETURN) {
2514 error = 0;
2515 clen = 0;
2516 control = NULL;
2517 top = NULL;
2518 }
2519 goto out_locked;
2520 }
2521 #endif /* CONTENT_FILTER */
2522 }
2523 if (so->so_flags & SOF_ENABLE_MSGS) {
2524 /*
2525 * Make a copy of control mbuf,
2526 * so that msg priority can be
2527 * passed to subsequent mbufs.
2528 */
2529 control_copy = m_dup(control, M_NOWAIT);
2530 }
2531 error = (*so->so_proto->pr_usrreqs->pru_send)
2532 (so, sendflags, top, addr, control, p);
2533
2534 if (flags & MSG_SEND) {
2535 so->so_temp = NULL;
2536 }
2537
2538 if (dontroute) {
2539 so->so_options &= ~SO_DONTROUTE;
2540 }
2541
2542 clen = 0;
2543 control = control_copy;
2544 control_copy = NULL;
2545 top = NULL;
2546 mp = &top;
2547 if (error) {
2548 goto out_locked;
2549 }
2550 } while (resid && space > 0);
2551 } while (resid);
2552
2553 out_locked:
2554 if (sblocked) {
2555 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2556 } else {
2557 socket_unlock(so, 1);
2558 }
2559 if (top != NULL) {
2560 m_freem(top);
2561 }
2562 if (control != NULL) {
2563 m_freem(control);
2564 }
2565 if (freelist != NULL) {
2566 m_freem_list(freelist);
2567 }
2568 if (control_copy != NULL) {
2569 m_freem(control_copy);
2570 }
2571
2572 soclearfastopen(so);
2573
2574 if (en_tracing) {
2575 /* resid passed here is the bytes left in uio */
2576 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2577 VM_KERNEL_ADDRPERM(so),
2578 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2579 (int64_t)(orig_resid - resid));
2580 }
2581 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2582 so->so_snd.sb_cc, space, error);
2583
2584 return error;
2585 }
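
#if 0	/* Illustrative sketch -- not part of the build. */
/*
 * Minimal sketch of driving sosend() with a preallocated mbuf chain
 * ("top") instead of a uio, per the block comment above sosend():
 * top must be a packet header and must be small enough to be sent
 * all at once. Filling the mbuf with payload is omitted here; the
 * destination address is whatever the (hypothetical) caller passes
 * in. sosend() consumes "top" and frees it on error.
 */
static int
example_sosend_mbuf(struct socket *so, struct sockaddr *to)
{
	struct mbuf *top;

	MGETHDR(top, M_WAIT, MT_DATA);
	if (top == NULL) {
		return ENOBUFS;
	}
	top->m_len = 0;
	top->m_pkthdr.len = 0;

	return sosend(so, to, NULL, top, NULL, 0);
}
#endif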
2586
2587 int
2588 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2589 {
2590 struct mbuf *m0, *control_end = NULL;
2591
2592 socket_lock_assert_owned(so);
2593
2594 /*
2595 * top must point to the mbuf chain to be sent.
2596 * If control is not NULL, top must be a packet header.
2597 */
2598 VERIFY(top != NULL &&
2599 (control == NULL || top->m_flags & M_PKTHDR));
2600
2601 /*
2602 * If control is not passed in, see if we can get it
2603 * from top.
2604 */
2605 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2606 // Locate start of control if present and start of data
2607 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2608 if (m0->m_flags & M_PKTHDR) {
2609 top = m0;
2610 break;
2611 } else if (m0->m_type == MT_CONTROL) {
2612 if (control == NULL) {
2613 // Found start of control
2614 control = m0;
2615 }
2616 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2617 // Found end of control
2618 control_end = m0;
2619 }
2620 }
2621 }
2622 if (control_end != NULL) {
2623 control_end->m_next = NULL;
2624 }
2625 }
2626
2627 int error = (*so->so_proto->pr_usrreqs->pru_send)
2628 (so, sendflags, top, addr, control, current_proc());
2629
2630 return error;
2631 }
2632
2633 /*
2634 * Supports only connected sockets (no address) without ancillary data
2635 * (control mbuf) for atomic protocols
2636 */
2637 int
2638 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2639 {
2640 struct mbuf *m, *freelist = NULL;
2641 user_ssize_t len, resid;
2642 int error, dontroute, mlen;
2643 int atomic = sosendallatonce(so);
2644 int sblocked = 0;
2645 struct proc *p = current_proc();
2646 u_int uiofirst = 0;
2647 u_int uiolast = 0;
2648 struct mbuf *top = NULL;
2649 uint16_t headroom = 0;
2650 boolean_t bigcl;
2651
2652 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2653 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2654
2655 if (so->so_type != SOCK_DGRAM) {
2656 error = EINVAL;
2657 goto out;
2658 }
2659 if (atomic == 0) {
2660 error = EINVAL;
2661 goto out;
2662 }
2663 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2664 error = EPROTONOSUPPORT;
2665 goto out;
2666 }
2667 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2668 error = EINVAL;
2669 goto out;
2670 }
2671 resid = uio_array_resid(uioarray, uiocnt);
2672
2673 /*
2674 * In theory resid should be unsigned.
2675 * However, space must be signed, as it might be less than 0
2676 * if we over-committed, and we must use a signed comparison
2677 * of space and resid. On the other hand, a negative resid
2678 * causes us to loop sending 0-length segments to the protocol.
2679 *
2680 * Note: We limit resid to be a positive int value as we use
2681 * imin() to set bytes_to_copy -- radr://14558484
2682 */
2683 if (resid < 0 || resid > INT_MAX) {
2684 error = EINVAL;
2685 goto out;
2686 }
2687
2688 socket_lock(so, 1);
2689 so_update_last_owner_locked(so, p);
2690 so_update_policy(so);
2691
2692 #if NECP
2693 so_update_necp_policy(so, NULL, NULL);
2694 #endif /* NECP */
2695
2696 dontroute = (flags & MSG_DONTROUTE) &&
2697 (so->so_options & SO_DONTROUTE) == 0 &&
2698 (so->so_proto->pr_flags & PR_ATOMIC);
2699 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2700
2701 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2702 &sblocked, NULL);
2703 if (error) {
2704 goto release;
2705 }
2706
2707 /*
2708 * Use big 4 KB clusters when the outgoing interface does not prefer
2709 * 2 KB clusters
2710 */
2711 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2712
2713 if (soreserveheadroom != 0) {
2714 headroom = so->so_pktheadroom;
2715 }
2716
2717 do {
2718 int i;
2719 int num_needed = 0;
2720 int chainlength;
2721 size_t maxpktlen = 0;
2722 int bytes_to_alloc;
2723
2724 if (sosendminchain > 0) {
2725 chainlength = 0;
2726 } else {
2727 chainlength = sosendmaxchain;
2728 }
2729
2730 socket_unlock(so, 0);
2731
2732 /*
2733 * Find a set of uios that fits in a reasonable number
2734 * of mbuf packets
2735 */
2736 for (i = uiofirst; i < uiocnt; i++) {
2737 struct uio *auio = uioarray[i];
2738
2739 len = uio_resid(auio);
2740
2741 /* Do nothing for empty messages */
2742 if (len == 0) {
2743 continue;
2744 }
2745
2746 num_needed += 1;
2747 uiolast += 1;
2748
2749 if (len > maxpktlen) {
2750 maxpktlen = len;
2751 }
2752
2753 chainlength += len;
2754 if (chainlength > sosendmaxchain) {
2755 break;
2756 }
2757 }
2758 /*
2759 * Nothing left to send
2760 */
2761 if (num_needed == 0) {
2762 socket_lock(so, 0);
2763 break;
2764 }
2765 /*
2766 * Allocate a buffer large enough to include headroom space
2767 * for the network and link headers
2768 *
2769 */
2770 bytes_to_alloc = maxpktlen + headroom;
2771
2772 /*
2773 * Allocate a single contiguous buffer of the smallest available
2774 * size when possible
2775 */
2776 if (bytes_to_alloc > MCLBYTES &&
2777 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2778 freelist = m_getpackets_internal(
2779 (unsigned int *)&num_needed,
2780 num_needed, M_WAIT, 1,
2781 MBIGCLBYTES);
2782 } else if (bytes_to_alloc > _MHLEN &&
2783 bytes_to_alloc <= MCLBYTES) {
2784 freelist = m_getpackets_internal(
2785 (unsigned int *)&num_needed,
2786 num_needed, M_WAIT, 1,
2787 MCLBYTES);
2788 } else {
2789 freelist = m_allocpacket_internal(
2790 (unsigned int *)&num_needed,
2791 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2792 }
2793
2794 if (freelist == NULL) {
2795 socket_lock(so, 0);
2796 error = ENOMEM;
2797 goto release;
2798 }
2799 /*
2800 * Copy each uio of the set into its own mbuf packet
2801 */
2802 for (i = uiofirst, m = freelist;
2803 i < uiolast && m != NULL;
2804 i++) {
2805 int bytes_to_copy;
2806 struct mbuf *n;
2807 struct uio *auio = uioarray[i];
2808
2809 bytes_to_copy = uio_resid(auio);
2810
2811 /* Do nothing for empty messages */
2812 if (bytes_to_copy == 0) {
2813 continue;
2814 }
2815 /*
2816 * Leave headroom for protocol headers
2817 * in the first mbuf of the chain
2818 */
2819 m->m_data += headroom;
2820
2821 for (n = m; n != NULL; n = n->m_next) {
2822 if ((m->m_flags & M_EXT)) {
2823 mlen = m->m_ext.ext_size -
2824 M_LEADINGSPACE(m);
2825 } else if ((m->m_flags & M_PKTHDR)) {
2826 mlen =
2827 MHLEN - M_LEADINGSPACE(m);
2828 } else {
2829 mlen = MLEN - M_LEADINGSPACE(m);
2830 }
2831 len = imin(mlen, bytes_to_copy);
2832
2833 /*
2834 * Note: uiomove() decrements the iovec
2835 * length
2836 */
2837 error = uiomove(mtod(n, caddr_t),
2838 len, auio);
2839 if (error != 0) {
2840 break;
2841 }
2842 n->m_len = len;
2843 m->m_pkthdr.len += len;
2844
2845 VERIFY(m->m_pkthdr.len <= maxpktlen);
2846
2847 bytes_to_copy -= len;
2848 resid -= len;
2849 }
2850 if (m->m_pkthdr.len == 0) {
2851 printf(
2852 "%s:%d so %llx pkt %llx type %u len null\n",
2853 __func__, __LINE__,
2854 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2855 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2856 m->m_type);
2857 }
2858 if (error != 0) {
2859 break;
2860 }
2861 m = m->m_nextpkt;
2862 }
2863
2864 socket_lock(so, 0);
2865
2866 if (error) {
2867 goto release;
2868 }
2869 top = freelist;
2870 freelist = NULL;
2871
2872 if (dontroute) {
2873 so->so_options |= SO_DONTROUTE;
2874 }
2875
2876 if ((flags & MSG_SKIPCFIL) == 0) {
2877 struct mbuf **prevnextp = NULL;
2878
2879 for (i = uiofirst, m = top;
2880 i < uiolast && m != NULL;
2881 i++) {
2882 struct mbuf *nextpkt = m->m_nextpkt;
2883
2884 /*
2885 * Socket filter processing
2886 */
2887 error = sflt_data_out(so, NULL, &m,
2888 NULL, 0);
2889 if (error != 0 && error != EJUSTRETURN) {
2890 goto release;
2891 }
2892
2893 #if CONTENT_FILTER
2894 if (error == 0) {
2895 /*
2896 * Content filter processing
2897 */
2898 error = cfil_sock_data_out(so, NULL, m,
2899 NULL, 0);
2900 if (error != 0 && error != EJUSTRETURN) {
2901 goto release;
2902 }
2903 }
2904 #endif /* CONTENT_FILTER */
2905 /*
2906 * Remove packet from the list when
2907 * swallowed by a filter
2908 */
2909 if (error == EJUSTRETURN) {
2910 error = 0;
2911 if (prevnextp != NULL) {
2912 *prevnextp = nextpkt;
2913 } else {
2914 top = nextpkt;
2915 }
2916 }
2917
2918 m = nextpkt;
2919 if (m != NULL) {
2920 prevnextp = &m->m_nextpkt;
2921 }
2922 }
2923 }
2924 if (top != NULL) {
2925 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2926 (so, 0, top, NULL, NULL, p);
2927 }
2928
2929 if (dontroute) {
2930 so->so_options &= ~SO_DONTROUTE;
2931 }
2932
2933 top = NULL;
2934 uiofirst = uiolast;
2935 } while (resid > 0 && error == 0);
2936 release:
2937 if (sblocked) {
2938 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2939 } else {
2940 socket_unlock(so, 1);
2941 }
2942 out:
2943 if (top != NULL) {
2944 m_freem(top);
2945 }
2946 if (freelist != NULL) {
2947 m_freem_list(freelist);
2948 }
2949
2950 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2951 so->so_snd.sb_cc, 0, error);
2952
2953 return error;
2954 }
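
#if 0	/* Illustrative sketch -- not part of the build. */
/*
 * Minimal sketch of preparing a uio array for sosend_list(): one
 * datagram is sent per uio in the array, and the socket must be a
 * connected SOCK_DGRAM socket whose protocol implements
 * pru_send_list. The uio_create()/uio_addiov()/uio_free() calls and
 * the kernel-space buffers are assumptions made for illustration.
 */
static int
example_sosend_list(struct socket *so, char *buf0, size_t len0,
    char *buf1, size_t len1)
{
	struct uio *uioarray[2] = { NULL, NULL };
	int error;

	uioarray[0] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	uioarray[1] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	if (uioarray[0] == NULL || uioarray[1] == NULL) {
		error = ENOMEM;
		goto done;
	}
	uio_addiov(uioarray[0], CAST_USER_ADDR_T(buf0), len0);
	uio_addiov(uioarray[1], CAST_USER_ADDR_T(buf1), len1);

	error = sosend_list(so, uioarray, 2, 0);
done:
	if (uioarray[0] != NULL) {
		uio_free(uioarray[0]);
	}
	if (uioarray[1] != NULL) {
		uio_free(uioarray[1]);
	}
	return error;
}
#endif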
2955
2956 /*
2957 * May return ERESTART when packet is dropped by MAC policy check
2958 */
2959 static int
2960 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2961 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2962 {
2963 int error = 0;
2964 struct mbuf *m = *mp;
2965 struct mbuf *nextrecord = *nextrecordp;
2966
2967 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2968 #if CONFIG_MACF_SOCKET_SUBSET
2969 /*
2970 * Call the MAC framework for policy checking if we're in
2971 * the user process context and the socket isn't connected.
2972 */
2973 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2974 struct mbuf *m0 = m;
2975 /*
2976 * Dequeue this record (temporarily) from the receive
2977 * list since we're about to drop the socket's lock
2978 * where a new record may arrive and be appended to
2979 * the list. Upon MAC policy failure, the record
2980 * will be freed. Otherwise, we'll add it back to
2981 * the head of the list. We cannot rely on SB_LOCK
2982 * because the append operation uses the socket's lock.
2983 */
2984 do {
2985 m->m_nextpkt = NULL;
2986 sbfree(&so->so_rcv, m);
2987 m = m->m_next;
2988 } while (m != NULL);
2989 m = m0;
2990 so->so_rcv.sb_mb = nextrecord;
2991 SB_EMPTY_FIXUP(&so->so_rcv);
2992 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2993 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2994 socket_unlock(so, 0);
2995
2996 if (mac_socket_check_received(proc_ucred(p), so,
2997 mtod(m, struct sockaddr *)) != 0) {
2998 /*
2999 * MAC policy failure; free this record and
3000 * process the next record (or block until
3001 * one is available). We have adjusted sb_cc
3002 * and sb_mbcnt above so there is no need to
3003 * call sbfree() again.
3004 */
3005 m_freem(m);
3006 /*
3007 * Clear SB_LOCK but don't unlock the socket.
3008 * Process the next record or wait for one.
3009 */
3010 socket_lock(so, 0);
3011 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3012 error = ERESTART;
3013 goto done;
3014 }
3015 socket_lock(so, 0);
3016 /*
3017 * If the socket has been defunct'd, drop it.
3018 */
3019 if (so->so_flags & SOF_DEFUNCT) {
3020 m_freem(m);
3021 error = ENOTCONN;
3022 goto done;
3023 }
3024 /*
3025 * Re-adjust the socket receive list and re-enqueue
3026 * the record in front of any packets which may have
3027 * been appended while we dropped the lock.
3028 */
3029 for (m = m0; m->m_next != NULL; m = m->m_next) {
3030 sballoc(&so->so_rcv, m);
3031 }
3032 sballoc(&so->so_rcv, m);
3033 if (so->so_rcv.sb_mb == NULL) {
3034 so->so_rcv.sb_lastrecord = m0;
3035 so->so_rcv.sb_mbtail = m;
3036 }
3037 m = m0;
3038 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3039 so->so_rcv.sb_mb = m;
3040 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3041 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3042 }
3043 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3044 if (psa != NULL) {
3045 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3046 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3047 error = EWOULDBLOCK;
3048 goto done;
3049 }
3050 }
3051 if (flags & MSG_PEEK) {
3052 m = m->m_next;
3053 } else {
3054 sbfree(&so->so_rcv, m);
3055 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3056 panic("%s: about to create invalid socketbuf",
3057 __func__);
3058 /* NOTREACHED */
3059 }
3060 MFREE(m, so->so_rcv.sb_mb);
3061 m = so->so_rcv.sb_mb;
3062 if (m != NULL) {
3063 m->m_nextpkt = nextrecord;
3064 } else {
3065 so->so_rcv.sb_mb = nextrecord;
3066 SB_EMPTY_FIXUP(&so->so_rcv);
3067 }
3068 }
3069 done:
3070 *mp = m;
3071 *nextrecordp = nextrecord;
3072
3073 return error;
3074 }
3075
3076 /*
3077 * Process one or more MT_CONTROL mbufs present before any data mbufs
3078 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3079 * just copy the data; if !MSG_PEEK, we call into the protocol to
3080 * perform externalization.
3081 */
3082 static int
3083 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3084 struct mbuf **mp, struct mbuf **nextrecordp)
3085 {
3086 int error = 0;
3087 struct mbuf *cm = NULL, *cmn;
3088 struct mbuf **cme = &cm;
3089 struct sockbuf *sb_rcv = &so->so_rcv;
3090 struct mbuf **msgpcm = NULL;
3091 struct mbuf *m = *mp;
3092 struct mbuf *nextrecord = *nextrecordp;
3093 struct protosw *pr = so->so_proto;
3094
3095 /*
3096 * Externalizing the control messages would require us to
3097 * drop the socket's lock below. Once we re-acquire the
3098 * lock, the mbuf chain might change. In order to preserve
3099 * consistency, we unlink all control messages from the
3100 * first mbuf chain in one shot and link them separately
3101 * onto a different chain.
3102 */
3103 do {
3104 if (flags & MSG_PEEK) {
3105 if (controlp != NULL) {
3106 if (*controlp == NULL) {
3107 msgpcm = controlp;
3108 }
3109 *controlp = m_copy(m, 0, m->m_len);
3110
3111 /*
3112 * If we failed to allocate an mbuf,
3113 * release any previously allocated
3114 * mbufs for control data. Return
3115 * an error. Keep the mbufs in the
3116 * socket as this is using
3117 * MSG_PEEK flag.
3118 */
3119 if (*controlp == NULL) {
3120 m_freem(*msgpcm);
3121 error = ENOBUFS;
3122 goto done;
3123 }
3124 controlp = &(*controlp)->m_next;
3125 }
3126 m = m->m_next;
3127 } else {
3128 m->m_nextpkt = NULL;
3129 sbfree(sb_rcv, m);
3130 sb_rcv->sb_mb = m->m_next;
3131 m->m_next = NULL;
3132 *cme = m;
3133 cme = &(*cme)->m_next;
3134 m = sb_rcv->sb_mb;
3135 }
3136 } while (m != NULL && m->m_type == MT_CONTROL);
3137
3138 if (!(flags & MSG_PEEK)) {
3139 if (sb_rcv->sb_mb != NULL) {
3140 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3141 } else {
3142 sb_rcv->sb_mb = nextrecord;
3143 SB_EMPTY_FIXUP(sb_rcv);
3144 }
3145 if (nextrecord == NULL) {
3146 sb_rcv->sb_lastrecord = m;
3147 }
3148 }
3149
3150 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3151 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3152
3153 while (cm != NULL) {
3154 int cmsg_type;
3155
3156 cmn = cm->m_next;
3157 cm->m_next = NULL;
3158 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3159
3160 /*
3161 * Call the protocol to externalize SCM_RIGHTS message
3162 * and return the modified message to the caller upon
3163 * success. Otherwise, all other control messages are
3164 * returned unmodified to the caller. Note that we
3165 * only get into this loop if MSG_PEEK is not set.
3166 */
3167 if (pr->pr_domain->dom_externalize != NULL &&
3168 cmsg_type == SCM_RIGHTS) {
3169 /*
3170 * Release socket lock: see 3903171. This
3171 * would also allow more records to be appended
3172 * to the socket buffer. We still have SB_LOCK
3173 * set on it, so we can be sure that the head
3174 * of the mbuf chain won't change.
3175 */
3176 socket_unlock(so, 0);
3177 error = (*pr->pr_domain->dom_externalize)(cm);
3178 socket_lock(so, 0);
3179 } else {
3180 error = 0;
3181 }
3182
3183 if (controlp != NULL && error == 0) {
3184 *controlp = cm;
3185 controlp = &(*controlp)->m_next;
3186 } else {
3187 (void) m_free(cm);
3188 }
3189 cm = cmn;
3190 }
3191 /*
3192 * Update the value of nextrecord in case we received new
3193 * records when the socket was unlocked above for
3194 * externalizing SCM_RIGHTS.
3195 */
3196 if (m != NULL) {
3197 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3198 } else {
3199 nextrecord = sb_rcv->sb_mb;
3200 }
3201
3202 done:
3203 *mp = m;
3204 *nextrecordp = nextrecord;
3205
3206 return error;
3207 }
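
#if 0	/* Illustrative sketch -- not part of the build. */
/*
 * Sketch of how a caller might walk the control-mbuf chain that
 * soreceive()/soreceive_ctl() hand back through *controlp: one
 * cmsghdr per MT_CONTROL mbuf, linked via m_next. Handling of the
 * individual message types is left out, and the caller eventually
 * releases the chain with m_freem().
 */
static void
example_walk_control(struct mbuf *control)
{
	struct mbuf *cm;

	for (cm = control; cm != NULL; cm = cm->m_next) {
		struct cmsghdr *cmsg = mtod(cm, struct cmsghdr *);

		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			/* descriptors were externalized by the protocol */
		}
	}
	if (control != NULL) {
		m_freem(control);
	}
}
#endif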
3208
3209 /*
3210 * Implement receive operations on a socket.
3211 * We depend on the way that records are added to the sockbuf
3212 * by sbappend*. In particular, each record (mbufs linked through m_next)
3213 * must begin with an address if the protocol so specifies,
3214 * followed by an optional mbuf or mbufs containing ancillary data,
3215 * and then zero or more mbufs of data.
3216 * In order to avoid blocking network interrupts for the entire time here,
3217 * we splx() while doing the actual copy to user space.
3218 * Although the sockbuf is locked, new data may still be appended,
3219 * and thus we must maintain consistency of the sockbuf during that time.
3220 *
3221 * The caller may receive the data as a single mbuf chain by supplying
3222 * an mbuf **mp0 for use in returning the chain. The uio is then used
3223 * only for the count in uio_resid.
3224 *
3225 * Returns: 0 Success
3226 * ENOBUFS
3227 * ENOTCONN
3228 * EWOULDBLOCK
3229 * uiomove:EFAULT
3230 * sblock:EWOULDBLOCK
3231 * sblock:EINTR
3232 * sbwait:EBADF
3233 * sbwait:EINTR
3234 * sodelayed_copy:EFAULT
3235 * <pru_rcvoob>:EINVAL[TCP]
3236 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3237 * <pru_rcvoob>:???
3238 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3239 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3240 * <pr_domain->dom_externalize>:???
3241 *
3242 * Notes: Additional return values from calls through <pru_rcvoob> and
3243 * <pr_domain->dom_externalize> depend on protocols other than
3244 * TCP or AF_UNIX, which are documented above.
3245 */
3246 int
3247 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3248 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3249 {
3250 struct mbuf *m, **mp, *ml = NULL;
3251 struct mbuf *nextrecord, *free_list;
3252 int flags, error, offset;
3253 user_ssize_t len;
3254 struct protosw *pr = so->so_proto;
3255 int moff, type = 0;
3256 user_ssize_t orig_resid = uio_resid(uio);
3257 user_ssize_t delayed_copy_len;
3258 int can_delay;
3259 int need_event;
3260 struct proc *p = current_proc();
3261 boolean_t en_tracing = FALSE;
3262
3263 /*
3264 * Sanity check on the length passed by caller as we are making 'int'
3265 * comparisons
3266 */
3267 if (orig_resid < 0 || orig_resid > INT_MAX) {
3268 return EINVAL;
3269 }
3270
3271 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3272 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3273 so->so_rcv.sb_hiwat);
3274
3275 socket_lock(so, 1);
3276 so_update_last_owner_locked(so, p);
3277 so_update_policy(so);
3278
3279 #ifdef MORE_LOCKING_DEBUG
3280 if (so->so_usecount == 1) {
3281 panic("%s: so=%x no other reference on socket\n", __func__, so);
3282 /* NOTREACHED */
3283 }
3284 #endif
3285 mp = mp0;
3286 if (psa != NULL) {
3287 *psa = NULL;
3288 }
3289 if (controlp != NULL) {
3290 *controlp = NULL;
3291 }
3292 if (flagsp != NULL) {
3293 flags = *flagsp & ~MSG_EOR;
3294 } else {
3295 flags = 0;
3296 }
3297
3298 /*
3299 * If a recv attempt is made on a previously-accepted socket
3300 * that has been marked as inactive (disconnected), reject
3301 * the request.
3302 */
3303 if (so->so_flags & SOF_DEFUNCT) {
3304 struct sockbuf *sb = &so->so_rcv;
3305
3306 error = ENOTCONN;
3307 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3308 __func__, proc_pid(p), proc_best_name(p),
3309 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3310 SOCK_DOM(so), SOCK_TYPE(so), error);
3311 /*
3312 * This socket should have been disconnected and flushed
3313 * prior to being returned from sodefunct(); there should
3314 * be no data on its receive list, so panic otherwise.
3315 */
3316 if (so->so_state & SS_DEFUNCT) {
3317 sb_empty_assert(sb, __func__);
3318 }
3319 socket_unlock(so, 1);
3320 return error;
3321 }
3322
3323 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3324 pr->pr_usrreqs->pru_preconnect) {
3325 /*
3326 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3327 * call write() right after this. *If* the app calls a read,
3328 * we do not want to block this read indefinitely. Thus,
3329 * we trigger a connect so that the session gets initiated.
3330 */
3331 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3332
3333 if (error) {
3334 socket_unlock(so, 1);
3335 return error;
3336 }
3337 }
3338
3339 if (ENTR_SHOULDTRACE &&
3340 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3341 /*
3342 * enable energy tracing for inet sockets that go over
3343 * non-loopback interfaces only.
3344 */
3345 struct inpcb *inp = sotoinpcb(so);
3346 if (inp->inp_last_outifp != NULL &&
3347 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3348 en_tracing = TRUE;
3349 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3350 VM_KERNEL_ADDRPERM(so),
3351 ((so->so_state & SS_NBIO) ?
3352 kEnTrFlagNonBlocking : 0),
3353 (int64_t)orig_resid);
3354 }
3355 }
3356
3357 /*
3358 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3359 * regardless of the flags argument. Here is the case where
3360 * out-of-band data is not inline.
3361 */
3362 if ((flags & MSG_OOB) ||
3363 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3364 (so->so_options & SO_OOBINLINE) == 0 &&
3365 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3366 m = m_get(M_WAIT, MT_DATA);
3367 if (m == NULL) {
3368 socket_unlock(so, 1);
3369 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3370 ENOBUFS, 0, 0, 0, 0);
3371 return ENOBUFS;
3372 }
3373 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3374 if (error) {
3375 goto bad;
3376 }
3377 socket_unlock(so, 0);
3378 do {
3379 error = uiomove(mtod(m, caddr_t),
3380 imin(uio_resid(uio), m->m_len), uio);
3381 m = m_free(m);
3382 } while (uio_resid(uio) && error == 0 && m != NULL);
3383 socket_lock(so, 0);
3384 bad:
3385 if (m != NULL) {
3386 m_freem(m);
3387 }
3388
3389 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3390 if (error == EWOULDBLOCK || error == EINVAL) {
3391 /*
3392 * Let's try to get normal data:
3393 * EWOULDBLOCK: out-of-band data not
3394 * received yet. EINVAL: out-of-band data
3395 * already read.
3396 */
3397 error = 0;
3398 goto nooob;
3399 } else if (error == 0 && flagsp != NULL) {
3400 *flagsp |= MSG_OOB;
3401 }
3402 }
3403 socket_unlock(so, 1);
3404 if (en_tracing) {
3405 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3406 VM_KERNEL_ADDRPERM(so), 0,
3407 (int64_t)(orig_resid - uio_resid(uio)));
3408 }
3409 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3410 0, 0, 0, 0);
3411
3412 return error;
3413 }
3414 nooob:
3415 if (mp != NULL) {
3416 *mp = NULL;
3417 }
3418
3419 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3420 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3421 }
3422
3423 free_list = NULL;
3424 delayed_copy_len = 0;
3425 restart:
3426 #ifdef MORE_LOCKING_DEBUG
3427 if (so->so_usecount <= 1) {
3428 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3429 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3430 }
3431 #endif
3432 /*
3433 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3434 * and if so just return to the caller. This could happen when
3435 * soreceive() is called by a socket upcall function during the
3436 * time the socket is freed. The socket buffer would have been
3437 * locked across the upcall, therefore we cannot put this thread
3438 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3439 * we may livelock), because the lock on the socket buffer will
3440 * only be released when the upcall routine returns to its caller.
3441 * Because the socket has been officially closed, there can be
3442 * no further read on it.
3443 *
3444 * A multipath subflow socket would have its SS_NOFDREF set by
3445 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3446 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3447 */
3448 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3449 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3450 socket_unlock(so, 1);
3451 return 0;
3452 }
3453
3454 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3455 if (error) {
3456 socket_unlock(so, 1);
3457 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3458 0, 0, 0, 0);
3459 if (en_tracing) {
3460 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3461 VM_KERNEL_ADDRPERM(so), 0,
3462 (int64_t)(orig_resid - uio_resid(uio)));
3463 }
3464 return error;
3465 }
3466
3467 m = so->so_rcv.sb_mb;
3468 /*
3469 * If we have less data than requested, block awaiting more
3470 * (subject to any timeout) if:
3471 * 1. the current count is less than the low water mark, or
3472 * 2. MSG_WAITALL is set, and it is possible to do the entire
3473 * receive operation at once if we block (resid <= hiwat).
3474 * 3. MSG_DONTWAIT is not set
3475 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3476 * we have to do the receive in sections, and thus risk returning
3477 * a short count if a timeout or signal occurs after we start.
3478 */
3479 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3480 so->so_rcv.sb_cc < uio_resid(uio)) &&
3481 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3482 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3483 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3484 /*
3485 * Panic if we notice inconsistencies in the socket's
3486 * receive list; both sb_mb and sb_cc should correctly
3487 * reflect the contents of the list, otherwise we may
3488 * end up with false positives during select() or poll()
3489 * which could put the application in a bad state.
3490 */
3491 SB_MB_CHECK(&so->so_rcv);
3492
3493 if (so->so_error) {
3494 if (m != NULL) {
3495 goto dontblock;
3496 }
3497 error = so->so_error;
3498 if ((flags & MSG_PEEK) == 0) {
3499 so->so_error = 0;
3500 }
3501 goto release;
3502 }
3503 if (so->so_state & SS_CANTRCVMORE) {
3504 #if CONTENT_FILTER
3505 /*
3506 * Deal with half closed connections
3507 */
3508 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3509 cfil_sock_data_pending(&so->so_rcv) != 0) {
3510 CFIL_LOG(LOG_INFO,
3511 "so %llx ignore SS_CANTRCVMORE",
3512 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3513 } else
3514 #endif /* CONTENT_FILTER */
3515 if (m != NULL) {
3516 goto dontblock;
3517 } else {
3518 goto release;
3519 }
3520 }
3521 for (; m != NULL; m = m->m_next) {
3522 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3523 m = so->so_rcv.sb_mb;
3524 goto dontblock;
3525 }
3526 }
3527 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3528 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3529 error = ENOTCONN;
3530 goto release;
3531 }
3532 if (uio_resid(uio) == 0) {
3533 goto release;
3534 }
3535
3536 if ((so->so_state & SS_NBIO) ||
3537 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3538 error = EWOULDBLOCK;
3539 goto release;
3540 }
3541 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3542 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3543 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3544 #if EVEN_MORE_LOCKING_DEBUG
3545 if (socket_debug) {
3546 printf("Waiting for socket data\n");
3547 }
3548 #endif
3549
3550 error = sbwait(&so->so_rcv);
3551 #if EVEN_MORE_LOCKING_DEBUG
3552 if (socket_debug) {
3553 printf("SORECEIVE - sbwait returned %d\n", error);
3554 }
3555 #endif
3556 if (so->so_usecount < 1) {
3557 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3558 __func__, so, so->so_usecount);
3559 /* NOTREACHED */
3560 }
3561 if (error) {
3562 socket_unlock(so, 1);
3563 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3564 0, 0, 0, 0);
3565 if (en_tracing) {
3566 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3567 VM_KERNEL_ADDRPERM(so), 0,
3568 (int64_t)(orig_resid - uio_resid(uio)));
3569 }
3570 return error;
3571 }
3572 goto restart;
3573 }
3574 dontblock:
3575 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3576 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3577 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3578 nextrecord = m->m_nextpkt;
3579
3580 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3581 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3582 mp0 == NULL);
3583 if (error == ERESTART) {
3584 goto restart;
3585 } else if (error != 0) {
3586 goto release;
3587 }
3588 orig_resid = 0;
3589 }
3590
3591 /*
3592 * Process one or more MT_CONTROL mbufs present before any data mbufs
3593 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3594 * just copy the data; if !MSG_PEEK, we call into the protocol to
3595 * perform externalization.
3596 */
3597 if (m != NULL && m->m_type == MT_CONTROL) {
3598 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3599 if (error != 0) {
3600 goto release;
3601 }
3602 orig_resid = 0;
3603 }
3604
3605 /*
3606 * If the socket is a TCP socket with message delivery
3607 * enabled, then create a control msg to deliver the
3608 * relative TCP sequence number for this data. Waiting
3609 * until this point will protect against failures to
3610 * allocate an mbuf for control msgs.
3611 */
3612 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3613 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3614 struct mbuf *seq_cm;
3615
3616 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3617 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3618 if (seq_cm == NULL) {
3619 /* unable to allocate a control mbuf */
3620 error = ENOBUFS;
3621 goto release;
3622 }
3623 *controlp = seq_cm;
3624 controlp = &seq_cm->m_next;
3625 }
3626
3627 if (m != NULL) {
3628 if (!(flags & MSG_PEEK)) {
3629 /*
3630 * We get here because m points to an mbuf following
3631 * any MT_SONAME or MT_CONTROL mbufs which have been
3632 * processed above. In any case, m should be pointing
3633 * to the head of the mbuf chain, and the nextrecord
3634 * should be either NULL or equal to m->m_nextpkt.
3635 * See comments above about SB_LOCK.
3636 */
3637 if (m != so->so_rcv.sb_mb ||
3638 m->m_nextpkt != nextrecord) {
3639 panic("%s: post-control !sync so=%p m=%p "
3640 "nextrecord=%p\n", __func__, so, m,
3641 nextrecord);
3642 /* NOTREACHED */
3643 }
3644 if (nextrecord == NULL) {
3645 so->so_rcv.sb_lastrecord = m;
3646 }
3647 }
3648 type = m->m_type;
3649 if (type == MT_OOBDATA) {
3650 flags |= MSG_OOB;
3651 }
3652 } else {
3653 if (!(flags & MSG_PEEK)) {
3654 SB_EMPTY_FIXUP(&so->so_rcv);
3655 }
3656 }
3657 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3658 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3659
3660 moff = 0;
3661 offset = 0;
3662
3663 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3664 can_delay = 1;
3665 } else {
3666 can_delay = 0;
3667 }
3668
3669 need_event = 0;
3670
3671 while (m != NULL &&
3672 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3673 if (m->m_type == MT_OOBDATA) {
3674 if (type != MT_OOBDATA) {
3675 break;
3676 }
3677 } else if (type == MT_OOBDATA) {
3678 break;
3679 }
3680 /*
3681 * Make sure to always set the MSG_OOB event when getting
3682 * out-of-band data inline.
3683 */
3684 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3685 (so->so_options & SO_OOBINLINE) != 0 &&
3686 (so->so_state & SS_RCVATMARK) != 0) {
3687 flags |= MSG_OOB;
3688 }
3689 so->so_state &= ~SS_RCVATMARK;
3690 len = uio_resid(uio) - delayed_copy_len;
3691 if (so->so_oobmark && len > so->so_oobmark - offset) {
3692 len = so->so_oobmark - offset;
3693 }
3694 if (len > m->m_len - moff) {
3695 len = m->m_len - moff;
3696 }
3697 /*
3698 * If mp is set, just pass back the mbufs.
3699 * Otherwise copy them out via the uio, then free.
3700 * Sockbuf must be consistent here (points to current mbuf,
3701 * it points to next record) when we drop priority;
3702 * we must note any additions to the sockbuf when we
3703 * block interrupts again.
3704 */
3705 if (mp == NULL) {
3706 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3707 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3708 if (can_delay && len == m->m_len) {
3709 /*
3710 * Only delay the copy if we're consuming the
3711 * mbuf, we're NOT in MSG_PEEK mode, and we
3712 * have enough data to make it worthwhile
3713 * to drop and retake the lock. can_delay
3714 * reflects the state of the two latter
3715 * constraints; moff should always be zero
3716 * in these cases.
3717 */
3718 delayed_copy_len += len;
3719 } else {
3720 if (delayed_copy_len) {
3721 error = sodelayed_copy(so, uio,
3722 &free_list, &delayed_copy_len);
3723
3724 if (error) {
3725 goto release;
3726 }
3727 /*
3728 * We can only get here if MSG_PEEK is not
3729 * set; therefore, m should point at the
3730 * head of the rcv queue. If it doesn't,
3731 * something changed drastically while we
3732 * were out from behind the lock in
3733 * sodelayed_copy, perhaps a RST on the
3734 * stream. In any event, the stream has
3735 * been interrupted; it's probably best
3736 * just to return whatever data we've
3737 * moved and let the caller sort it
3738 * out...
3739 */
3740 if (m != so->so_rcv.sb_mb) {
3741 break;
3742 }
3743 }
3744 socket_unlock(so, 0);
3745 error = uiomove(mtod(m, caddr_t) + moff,
3746 (int)len, uio);
3747 socket_lock(so, 0);
3748
3749 if (error) {
3750 goto release;
3751 }
3752 }
3753 } else {
3754 uio_setresid(uio, (uio_resid(uio) - len));
3755 }
3756 if (len == m->m_len - moff) {
3757 if (m->m_flags & M_EOR) {
3758 flags |= MSG_EOR;
3759 }
3760 if (flags & MSG_PEEK) {
3761 m = m->m_next;
3762 moff = 0;
3763 } else {
3764 nextrecord = m->m_nextpkt;
3765 sbfree(&so->so_rcv, m);
3766 m->m_nextpkt = NULL;
3767
3768 /*
3769 * If this packet is an unordered packet
3770 * (indicated by M_UNORDERED_DATA flag), remove
3771 * the additional bytes added to the
3772 * receive socket buffer size.
3773 */
3774 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3775 m->m_len &&
3776 (m->m_flags & M_UNORDERED_DATA) &&
3777 sbreserve(&so->so_rcv,
3778 so->so_rcv.sb_hiwat - m->m_len)) {
3779 if (so->so_msg_state->msg_uno_bytes >
3780 m->m_len) {
3781 so->so_msg_state->
3782 msg_uno_bytes -= m->m_len;
3783 } else {
3784 so->so_msg_state->
3785 msg_uno_bytes = 0;
3786 }
3787 m->m_flags &= ~M_UNORDERED_DATA;
3788 }
3789
3790 if (mp != NULL) {
3791 *mp = m;
3792 mp = &m->m_next;
3793 so->so_rcv.sb_mb = m = m->m_next;
3794 *mp = NULL;
3795 } else {
3796 if (free_list == NULL) {
3797 free_list = m;
3798 } else {
3799 ml->m_next = m;
3800 }
3801 ml = m;
3802 so->so_rcv.sb_mb = m = m->m_next;
3803 ml->m_next = NULL;
3804 }
3805 if (m != NULL) {
3806 m->m_nextpkt = nextrecord;
3807 if (nextrecord == NULL) {
3808 so->so_rcv.sb_lastrecord = m;
3809 }
3810 } else {
3811 so->so_rcv.sb_mb = nextrecord;
3812 SB_EMPTY_FIXUP(&so->so_rcv);
3813 }
3814 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3815 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3816 }
3817 } else {
3818 if (flags & MSG_PEEK) {
3819 moff += len;
3820 } else {
3821 if (mp != NULL) {
3822 int copy_flag;
3823
3824 if (flags & MSG_DONTWAIT) {
3825 copy_flag = M_DONTWAIT;
3826 } else {
3827 copy_flag = M_WAIT;
3828 }
3829 *mp = m_copym(m, 0, len, copy_flag);
3830 /*
3831 * Failed to allocate an mbuf?
3832 * Adjust uio_resid back, it was
3833 * adjusted down by len bytes which
3834 * we didn't copy over.
3835 */
3836 if (*mp == NULL) {
3837 uio_setresid(uio,
3838 (uio_resid(uio) + len));
3839 break;
3840 }
3841 }
3842 m->m_data += len;
3843 m->m_len -= len;
3844 so->so_rcv.sb_cc -= len;
3845 }
3846 }
3847 if (so->so_oobmark) {
3848 if ((flags & MSG_PEEK) == 0) {
3849 so->so_oobmark -= len;
3850 if (so->so_oobmark == 0) {
3851 so->so_state |= SS_RCVATMARK;
3852 /*
3853 * delay posting the actual event until
3854 * after any delayed copy processing
3855 * has finished
3856 */
3857 need_event = 1;
3858 break;
3859 }
3860 } else {
3861 offset += len;
3862 if (offset == so->so_oobmark) {
3863 break;
3864 }
3865 }
3866 }
3867 if (flags & MSG_EOR) {
3868 break;
3869 }
3870 /*
3871 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3872 * (for non-atomic socket), we must not quit until
3873 * "uio->uio_resid == 0" or an error termination.
3874 * If a signal/timeout occurs, return with a short
3875 * count but without error. Keep sockbuf locked
3876 * against other readers.
3877 */
3878 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3879 (uio_resid(uio) - delayed_copy_len) > 0 &&
3880 !sosendallatonce(so) && !nextrecord) {
3881 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3882 #if CONTENT_FILTER
3883 && cfil_sock_data_pending(&so->so_rcv) == 0
3884 #endif /* CONTENT_FILTER */
3885 )) {
3886 goto release;
3887 }
3888
3889 /*
3890 * Depending on the protocol (e.g. TCP), the following
3891 * might cause the socket lock to be dropped and later
3892 * be reacquired, and more data could have arrived and
3893 * have been appended to the receive socket buffer by
3894 * the time it returns. Therefore, we sleep in
3895 * sbwait() below if and only if the socket buffer is
3896 * empty, in order to avoid a false sleep.
3897 */
3898 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3899 (((struct inpcb *)so->so_pcb)->inp_state !=
3900 INPCB_STATE_DEAD)) {
3901 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3902 }
3903
3904 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3905 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3906
3907 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3908 error = 0;
3909 goto release;
3910 }
3911 /*
3912 * We have to wait until after we get back from the
3913 * sbwait to do the copy, because we will drop the lock
3914 * if we have enough data that has been delayed. By
3915 * dropping the lock we open up a window allowing the
3916 * netisr thread to process the incoming packets and to
3917 * change the state of this socket. We issue the sbwait
3918 * because the socket is empty and we're expecting the
3919 * netisr thread to wake us up when more packets arrive;
3920 * if we allowed that processing to happen first and
3921 * then called sbwait, we could stall forever with
3922 * packets sitting in the socket if no further packets
3923 * arrive from the remote side.
3924 *
3925 * We want to copy before we've collected all the data
3926 * to satisfy this request, to allow the copy to overlap
3927 * the incoming packet processing on an MP system.
3928 */
3929 if (delayed_copy_len > sorecvmincopy &&
3930 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3931 error = sodelayed_copy(so, uio,
3932 &free_list, &delayed_copy_len);
3933
3934 if (error) {
3935 goto release;
3936 }
3937 }
3938 m = so->so_rcv.sb_mb;
3939 if (m != NULL) {
3940 nextrecord = m->m_nextpkt;
3941 }
3942 SB_MB_CHECK(&so->so_rcv);
3943 }
3944 }
3945 #ifdef MORE_LOCKING_DEBUG
3946 if (so->so_usecount <= 1) {
3947 panic("%s: after big while so=%p ref=%d on socket\n",
3948 __func__, so, so->so_usecount);
3949 /* NOTREACHED */
3950 }
3951 #endif
3952
3953 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3954 if (so->so_options & SO_DONTTRUNC) {
3955 flags |= MSG_RCVMORE;
3956 } else {
3957 flags |= MSG_TRUNC;
3958 if ((flags & MSG_PEEK) == 0) {
3959 (void) sbdroprecord(&so->so_rcv);
3960 }
3961 }
3962 }
3963
3964 /*
3965 * pru_rcvd below (for TCP) may cause more data to be received
3966 * if the socket lock is dropped prior to sending the ACK; some
3967 * legacy OpenTransport applications don't handle this well
3968 * (if it receives less data than requested while MSG_HAVEMORE
3969 * is set), and so we set the flag now based on what we know
3970 * prior to calling pru_rcvd.
3971 */
3972 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3973 flags |= MSG_HAVEMORE;
3974 }
3975
3976 if ((flags & MSG_PEEK) == 0) {
3977 if (m == NULL) {
3978 so->so_rcv.sb_mb = nextrecord;
3979 /*
3980 * First part is an inline SB_EMPTY_FIXUP(). Second
3981 * part makes sure sb_lastrecord is up-to-date if
3982 * there is still data in the socket buffer.
3983 */
3984 if (so->so_rcv.sb_mb == NULL) {
3985 so->so_rcv.sb_mbtail = NULL;
3986 so->so_rcv.sb_lastrecord = NULL;
3987 } else if (nextrecord->m_nextpkt == NULL) {
3988 so->so_rcv.sb_lastrecord = nextrecord;
3989 }
3990 SB_MB_CHECK(&so->so_rcv);
3991 }
3992 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3993 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3994 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3995 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3996 }
3997 }
3998
3999 if (delayed_copy_len) {
4000 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4001 if (error) {
4002 goto release;
4003 }
4004 }
4005 if (free_list != NULL) {
4006 m_freem_list(free_list);
4007 free_list = NULL;
4008 }
4009 if (need_event) {
4010 postevent(so, 0, EV_OOB);
4011 }
4012
4013 if (orig_resid == uio_resid(uio) && orig_resid &&
4014 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4015 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4016 goto restart;
4017 }
4018
4019 if (flagsp != NULL) {
4020 *flagsp |= flags;
4021 }
4022 release:
4023 #ifdef MORE_LOCKING_DEBUG
4024 if (so->so_usecount <= 1) {
4025 panic("%s: release so=%p ref=%d on socket\n", __func__,
4026 so, so->so_usecount);
4027 /* NOTREACHED */
4028 }
4029 #endif
4030 if (delayed_copy_len) {
4031 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4032 }
4033
4034 if (free_list != NULL) {
4035 m_freem_list(free_list);
4036 }
4037
4038 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4039
4040 if (en_tracing) {
4041 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4042 VM_KERNEL_ADDRPERM(so),
4043 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4044 (int64_t)(orig_resid - uio_resid(uio)));
4045 }
4046 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4047 so->so_rcv.sb_cc, 0, error);
4048
4049 return error;
4050 }
4051
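/*
 * A minimal userland sketch of the MSG_WAITALL behaviour handled by
 * soreceive() above: the kernel keeps the sockbuf locked and loops until
 * the request is satisfied, but a signal, a timeout or end-of-stream can
 * still produce a short count, so callers must check the return value.
 * The helper name recv_all() and its error handling are assumptions for
 * illustration only, not part of the kernel sources.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

static ssize_t
recv_all(int s, void *buf, size_t len)
{
	size_t off = 0;
	ssize_t n;

	while (off < len) {
		/* MSG_WAITALL asks soreceive() not to return early... */
		n = recv(s, (char *)buf + off, len - off, MSG_WAITALL);
		if (n > 0) {
			off += (size_t)n;	/* ...but short counts are still possible */
		} else if (n == 0) {
			break;			/* peer closed the connection */
		} else if (errno != EINTR) {
			return -1;		/* hard error */
		}
	}
	return (ssize_t)off;
}
#endif
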
4052 /*
4053 * Returns: 0 Success
4054 * uiomove:EFAULT
4055 */
4056 static int
4057 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4058 user_ssize_t *resid)
4059 {
4060 int error = 0;
4061 struct mbuf *m;
4062
4063 m = *free_list;
4064
4065 socket_unlock(so, 0);
4066
4067 while (m != NULL && error == 0) {
4068 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4069 m = m->m_next;
4070 }
4071 m_freem_list(*free_list);
4072
4073 *free_list = NULL;
4074 *resid = 0;
4075
4076 socket_lock(so, 0);
4077
4078 return error;
4079 }
4080
4081 static int
4082 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4083 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4084 {
4085 #pragma unused(so)
4086 int error = 0;
4087 struct mbuf *ml, *m;
4088 int i = 0;
4089 struct uio *auio;
4090
4091 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4092 ml = ml->m_nextpkt, i++) {
4093 auio = msgarray[i].uio;
4094 for (m = ml; m != NULL; m = m->m_next) {
4095 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4096 if (error != 0) {
4097 goto out;
4098 }
4099 }
4100 }
4101 out:
4102 m_freem_list(*free_list);
4103
4104 *free_list = NULL;
4105 *resid = 0;
4106
4107 return error;
4108 }
4109
4110 int
4111 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4112 int *flagsp)
4113 {
4114 struct mbuf *m;
4115 struct mbuf *nextrecord;
4116 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4117 int error;
4118 user_ssize_t len, pktlen, delayed_copy_len = 0;
4119 struct protosw *pr = so->so_proto;
4120 user_ssize_t resid;
4121 struct proc *p = current_proc();
4122 struct uio *auio = NULL;
4123 int npkts = 0;
4124 int sblocked = 0;
4125 struct sockaddr **psa = NULL;
4126 struct mbuf **controlp = NULL;
4127 int can_delay;
4128 int flags;
4129 struct mbuf *free_others = NULL;
4130
4131 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4132 so, uiocnt,
4133 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4134
4135 /*
4136 * Sanity checks:
4137 * - Only supports the "don't wait" flags
4138 * - Only supports datagram sockets (could be extended to raw)
4139 * - Must be atomic
4140 * - Protocol must support packet chains
4141 * - The uio array must not be NULL (should we panic?)
4142 */
4143 if (flagsp != NULL) {
4144 flags = *flagsp;
4145 } else {
4146 flags = 0;
4147 }
4148 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4149 MSG_NBIO)) {
4150 printf("%s invalid flags 0x%x\n", __func__, flags);
4151 error = EINVAL;
4152 goto out;
4153 }
4154 if (so->so_type != SOCK_DGRAM) {
4155 error = EINVAL;
4156 goto out;
4157 }
4158 if (sosendallatonce(so) == 0) {
4159 error = EINVAL;
4160 goto out;
4161 }
4162 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4163 error = EPROTONOSUPPORT;
4164 goto out;
4165 }
4166 if (msgarray == NULL) {
4167 printf("%s uioarray is NULL\n", __func__);
4168 error = EINVAL;
4169 goto out;
4170 }
4171 if (uiocnt == 0) {
4172 printf("%s uiocnt is 0\n", __func__);
4173 error = EINVAL;
4174 goto out;
4175 }
4176 /*
4177 * Sanity check on the length passed by caller as we are making 'int'
4178 * comparisons
4179 */
4180 resid = recv_msg_array_resid(msgarray, uiocnt);
4181 if (resid < 0 || resid > INT_MAX) {
4182 error = EINVAL;
4183 goto out;
4184 }
4185
4186 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4187 can_delay = 1;
4188 } else {
4189 can_delay = 0;
4190 }
4191
4192 socket_lock(so, 1);
4193 so_update_last_owner_locked(so, p);
4194 so_update_policy(so);
4195
4196 #if NECP
4197 so_update_necp_policy(so, NULL, NULL);
4198 #endif /* NECP */
4199
4200 /*
4201 * If a recv attempt is made on a previously-accepted socket
4202 * that has been marked as inactive (disconnected), reject
4203 * the request.
4204 */
4205 if (so->so_flags & SOF_DEFUNCT) {
4206 struct sockbuf *sb = &so->so_rcv;
4207
4208 error = ENOTCONN;
4209 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4210 __func__, proc_pid(p), proc_best_name(p),
4211 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4212 SOCK_DOM(so), SOCK_TYPE(so), error);
4213 /*
4214 * This socket should have been disconnected and flushed
4215 * prior to being returned from sodefunct(); there should
4216 * be no data on its receive list, so panic otherwise.
4217 */
4218 if (so->so_state & SS_DEFUNCT) {
4219 sb_empty_assert(sb, __func__);
4220 }
4221 goto release;
4222 }
4223
4224 next:
4225 /*
4226 * The uio may be empty
4227 */
4228 if (npkts >= uiocnt) {
4229 error = 0;
4230 goto release;
4231 }
4232 restart:
4233 /*
4234 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4235 * and if so just return to the caller. This could happen when
4236 * soreceive() is called by a socket upcall function during the
4237 * time the socket is freed. The socket buffer would have been
4238 * locked across the upcall, therefore we cannot put this thread
4239 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4240 * we may livelock), because the lock on the socket buffer will
4241 * only be released when the upcall routine returns to its caller.
4242 * Because the socket has been officially closed, there can be
4243 * no further read on it.
4244 */
4245 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4246 (SS_NOFDREF | SS_CANTRCVMORE)) {
4247 error = 0;
4248 goto release;
4249 }
4250
4251 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4252 if (error) {
4253 goto release;
4254 }
4255 sblocked = 1;
4256
4257 m = so->so_rcv.sb_mb;
4258 /*
4259 * Block awaiting more datagrams if needed
4260 */
4261 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4262 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4263 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4264 /*
4265 * Panic if we notice inconsistencies in the socket's
4266 * receive list; both sb_mb and sb_cc should correctly
4267 * reflect the contents of the list, otherwise we may
4268 * end up with false positives during select() or poll()
4269 * which could put the application in a bad state.
4270 */
4271 SB_MB_CHECK(&so->so_rcv);
4272
4273 if (so->so_error) {
4274 error = so->so_error;
4275 if ((flags & MSG_PEEK) == 0) {
4276 so->so_error = 0;
4277 }
4278 goto release;
4279 }
4280 if (so->so_state & SS_CANTRCVMORE) {
4281 goto release;
4282 }
4283 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4284 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4285 error = ENOTCONN;
4286 goto release;
4287 }
4288 if ((so->so_state & SS_NBIO) ||
4289 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4290 error = EWOULDBLOCK;
4291 goto release;
4292 }
4293 /*
4294 * Do not block if we got some data
4295 */
4296 if (free_list != NULL) {
4297 error = 0;
4298 goto release;
4299 }
4300
4301 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4302 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4303
4304 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4305 sblocked = 0;
4306
4307 error = sbwait(&so->so_rcv);
4308 if (error) {
4309 goto release;
4310 }
4311 goto restart;
4312 }
4313
4314 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4315 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4316 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4317
4318 /*
4319 * Consume the current uio index as we have a datagram
4320 */
4321 auio = msgarray[npkts].uio;
4322 resid = uio_resid(auio);
4323 msgarray[npkts].which |= SOCK_MSG_DATA;
4324 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4325 &msgarray[npkts].psa : NULL;
4326 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4327 &msgarray[npkts].controlp : NULL;
4328 npkts += 1;
4329 nextrecord = m->m_nextpkt;
4330
4331 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4332 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4333 if (error == ERESTART) {
4334 goto restart;
4335 } else if (error != 0) {
4336 goto release;
4337 }
4338 }
4339
4340 if (m != NULL && m->m_type == MT_CONTROL) {
4341 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4342 if (error != 0) {
4343 goto release;
4344 }
4345 }
4346
4347 if (m->m_pkthdr.len == 0) {
4348 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4349 __func__, __LINE__,
4350 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4351 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4352 m->m_type);
4353 }
4354
4355 /*
4356 * Loop to copy the mbufs of the current record
4357 * Support zero length packets
4358 */
4359 ml = NULL;
4360 pktlen = 0;
4361 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4362 if (m->m_len == 0) {
4363 panic("%p m_len zero", m);
4364 }
4365 if (m->m_type == 0) {
4366 panic("%p m_type zero", m);
4367 }
4368 /*
4369 * Clip to the residual length
4370 */
4371 if (len > m->m_len) {
4372 len = m->m_len;
4373 }
4374 pktlen += len;
4375 /*
4376 * Copy the mbufs via the uio or delay the copy
4377 * The sockbuf must be consistent here (sb_mb points to the
4378 * current mbuf, nextrecord to the next record) when we drop priority;
4379 * we must note any additions to the sockbuf when we
4380 * block interrupts again.
4381 */
4382 if (len > 0 && can_delay == 0) {
4383 socket_unlock(so, 0);
4384 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4385 socket_lock(so, 0);
4386 if (error) {
4387 goto release;
4388 }
4389 } else {
4390 delayed_copy_len += len;
4391 }
4392
4393 if (len == m->m_len) {
4394 /*
4395 * m was entirely copied
4396 */
4397 sbfree(&so->so_rcv, m);
4398 nextrecord = m->m_nextpkt;
4399 m->m_nextpkt = NULL;
4400
4401 /*
4402 * Set the first packet to the head of the free list
4403 */
4404 if (free_list == NULL) {
4405 free_list = m;
4406 }
4407 /*
4408 * Link current packet to tail of free list
4409 */
4410 if (ml == NULL) {
4411 if (free_tail != NULL) {
4412 free_tail->m_nextpkt = m;
4413 }
4414 free_tail = m;
4415 }
4416 /*
4417 * Link current mbuf to last mbuf of current packet
4418 */
4419 if (ml != NULL) {
4420 ml->m_next = m;
4421 }
4422 ml = m;
4423
4424 /*
4425 * Move next buf to head of socket buffer
4426 */
4427 so->so_rcv.sb_mb = m = ml->m_next;
4428 ml->m_next = NULL;
4429
4430 if (m != NULL) {
4431 m->m_nextpkt = nextrecord;
4432 if (nextrecord == NULL) {
4433 so->so_rcv.sb_lastrecord = m;
4434 }
4435 } else {
4436 so->so_rcv.sb_mb = nextrecord;
4437 SB_EMPTY_FIXUP(&so->so_rcv);
4438 }
4439 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4440 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4441 } else {
4442 /*
4443 * Stop the loop on partial copy
4444 */
4445 break;
4446 }
4447 }
4448 #ifdef MORE_LOCKING_DEBUG
4449 if (so->so_usecount <= 1) {
4450 panic("%s: after big while so=%llx ref=%d on socket\n",
4451 __func__,
4452 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4453 /* NOTREACHED */
4454 }
4455 #endif
4456 /*
4457 * Tell the caller we made a partial copy
4458 */
4459 if (m != NULL) {
4460 if (so->so_options & SO_DONTTRUNC) {
4461 /*
4462 * Copyout first the freelist then the partial mbuf
4463 */
4464 socket_unlock(so, 0);
4465 if (delayed_copy_len) {
4466 error = sodelayed_copy_list(so, msgarray,
4467 uiocnt, &free_list, &delayed_copy_len);
4468 }
4469
4470 if (error == 0) {
4471 error = uiomove(mtod(m, caddr_t), (int)len,
4472 auio);
4473 }
4474 socket_lock(so, 0);
4475 if (error) {
4476 goto release;
4477 }
4478
4479 m->m_data += len;
4480 m->m_len -= len;
4481 so->so_rcv.sb_cc -= len;
4482 flags |= MSG_RCVMORE;
4483 } else {
4484 (void) sbdroprecord(&so->so_rcv);
4485 nextrecord = so->so_rcv.sb_mb;
4486 m = NULL;
4487 flags |= MSG_TRUNC;
4488 }
4489 }
4490
4491 if (m == NULL) {
4492 so->so_rcv.sb_mb = nextrecord;
4493 /*
4494 * First part is an inline SB_EMPTY_FIXUP(). Second
4495 * part makes sure sb_lastrecord is up-to-date if
4496 * there is still data in the socket buffer.
4497 */
4498 if (so->so_rcv.sb_mb == NULL) {
4499 so->so_rcv.sb_mbtail = NULL;
4500 so->so_rcv.sb_lastrecord = NULL;
4501 } else if (nextrecord->m_nextpkt == NULL) {
4502 so->so_rcv.sb_lastrecord = nextrecord;
4503 }
4504 SB_MB_CHECK(&so->so_rcv);
4505 }
4506 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4507 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4508
4509 /*
4510 * We can continue to the next packet as long as:
4511 * - We haven't exhausted the uio array
4512 * - There was no error
4513 * - A packet was not truncated
4514 * - We can still receive more data
4515 */
4516 if (npkts < uiocnt && error == 0 &&
4517 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4518 (so->so_state & SS_CANTRCVMORE) == 0) {
4519 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4520 sblocked = 0;
4521
4522 goto next;
4523 }
4524 if (flagsp != NULL) {
4525 *flagsp |= flags;
4526 }
4527
4528 release:
4529 /*
4530 * pru_rcvd may cause more data to be received if the socket lock
4531 * is dropped so we set MSG_HAVEMORE now based on what we know.
4532 * That way the caller won't be surprised if it receives less data
4533 * than requested.
4534 */
4535 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4536 flags |= MSG_HAVEMORE;
4537 }
4538
4539 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4540 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4541 }
4542
4543 if (sblocked) {
4544 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4545 } else {
4546 socket_unlock(so, 1);
4547 }
4548
4549 if (delayed_copy_len) {
4550 error = sodelayed_copy_list(so, msgarray, uiocnt,
4551 &free_list, &delayed_copy_len);
4552 }
4553 out:
4554 /*
4555 * Amortize the cost of freeing the mbufs
4556 */
4557 if (free_list != NULL) {
4558 m_freem_list(free_list);
4559 }
4560 if (free_others != NULL) {
4561 m_freem_list(free_others);
4562 }
4563
4564 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4565 0, 0, 0, 0);
4566 return error;
4567 }
4568
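/*
 * A minimal userland sketch of what soreceive_list() batches up in a single
 * call: one datagram per element, stopping when the receive buffer runs dry
 * on a non-blocking socket.  The helper name drain_datagrams() and the fixed
 * buffer size are assumptions for illustration only.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

static int
drain_datagrams(int s)
{
	char buf[2048];
	int npkts = 0;
	ssize_t n;

	for (;;) {
		/* each recv() consumes exactly one datagram (PR_ATOMIC) */
		n = recv(s, buf, sizeof(buf), MSG_DONTWAIT);
		if (n < 0) {
			if (errno == EWOULDBLOCK || errno == EAGAIN)
				break;	/* no more queued datagrams */
			return -1;
		}
		npkts++;
	}
	return npkts;
}
#endif
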
4569 /*
4570 * Returns: 0 Success
4571 * EINVAL
4572 * ENOTCONN
4573 * <pru_shutdown>:EINVAL
4574 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4575 * <pru_shutdown>:ENOBUFS[TCP]
4576 * <pru_shutdown>:EMSGSIZE[TCP]
4577 * <pru_shutdown>:EHOSTUNREACH[TCP]
4578 * <pru_shutdown>:ENETUNREACH[TCP]
4579 * <pru_shutdown>:ENETDOWN[TCP]
4580 * <pru_shutdown>:ENOMEM[TCP]
4581 * <pru_shutdown>:EACCES[TCP]
4582 * <pru_shutdown>:EMSGSIZE[TCP]
4583 * <pru_shutdown>:ENOBUFS[TCP]
4584 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4585 * <pru_shutdown>:??? [other protocol families]
4586 */
4587 int
4588 soshutdown(struct socket *so, int how)
4589 {
4590 int error;
4591
4592 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4593
4594 switch (how) {
4595 case SHUT_RD:
4596 case SHUT_WR:
4597 case SHUT_RDWR:
4598 socket_lock(so, 1);
4599 if ((so->so_state &
4600 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4601 error = ENOTCONN;
4602 } else {
4603 error = soshutdownlock(so, how);
4604 }
4605 socket_unlock(so, 1);
4606 break;
4607 default:
4608 error = EINVAL;
4609 break;
4610 }
4611
4612 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4613
4614 return error;
4615 }
4616
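/*
 * A small userland sketch of the soshutdown() entry point above: SHUT_WR
 * half-closes the send side while the read side stays open until the peer
 * closes, and ENOTCONN is returned if the socket was never connected.
 * The helper name send_eof_and_drain() is an assumption for illustration.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/socket.h>
#include <unistd.h>

static int
send_eof_and_drain(int s)
{
	char buf[512];

	if (shutdown(s, SHUT_WR) == -1)		/* may fail with ENOTCONN */
		return -1;
	/* keep reading until the peer closes its side */
	while (read(s, buf, sizeof(buf)) > 0)
		;
	return 0;
}
#endif
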
4617 int
4618 soshutdownlock_final(struct socket *so, int how)
4619 {
4620 struct protosw *pr = so->so_proto;
4621 int error = 0;
4622
4623 sflt_notify(so, sock_evt_shutdown, &how);
4624
4625 if (how != SHUT_WR) {
4626 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4627 /* read already shut down */
4628 error = ENOTCONN;
4629 goto done;
4630 }
4631 sorflush(so);
4632 postevent(so, 0, EV_RCLOSED);
4633 }
4634 if (how != SHUT_RD) {
4635 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4636 /* write already shut down */
4637 error = ENOTCONN;
4638 goto done;
4639 }
4640 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4641 postevent(so, 0, EV_WCLOSED);
4642 }
4643 done:
4644 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4645 return error;
4646 }
4647
4648 int
4649 soshutdownlock(struct socket *so, int how)
4650 {
4651 int error = 0;
4652
4653 #if CONTENT_FILTER
4654 /*
4655 * A content filter may delay the actual shutdown until it
4656 * has processed the pending data
4657 */
4658 if (so->so_flags & SOF_CONTENT_FILTER) {
4659 error = cfil_sock_shutdown(so, &how);
4660 if (error == EJUSTRETURN) {
4661 error = 0;
4662 goto done;
4663 } else if (error != 0) {
4664 goto done;
4665 }
4666 }
4667 #endif /* CONTENT_FILTER */
4668
4669 error = soshutdownlock_final(so, how);
4670
4671 done:
4672 return error;
4673 }
4674
4675 void
4676 sowflush(struct socket *so)
4677 {
4678 struct sockbuf *sb = &so->so_snd;
4679
4680 /*
4681 * Obtain lock on the socket buffer (SB_LOCK). This is required
4682 * to prevent the socket buffer from being unexpectedly altered
4683 * while it is used by another thread in socket send/receive.
4684 *
4685 * sblock() must not fail here, hence the assertion.
4686 */
4687 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4688 VERIFY(sb->sb_flags & SB_LOCK);
4689
4690 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4691 sb->sb_flags |= SB_DROP;
4692 sb->sb_upcall = NULL;
4693 sb->sb_upcallarg = NULL;
4694
4695 sbunlock(sb, TRUE); /* keep socket locked */
4696
4697 selthreadclear(&sb->sb_sel);
4698 sbrelease(sb);
4699 }
4700
4701 void
4702 sorflush(struct socket *so)
4703 {
4704 struct sockbuf *sb = &so->so_rcv;
4705 struct protosw *pr = so->so_proto;
4706 struct sockbuf asb;
4707 #ifdef notyet
4708 lck_mtx_t *mutex_held;
4709 /*
4710 * XXX: This code is currently commented out, because we may get here
4711 * as part of sofreelastref(), and at that time, pr_getlock() may no
4712 * longer be able to return us the lock; this will be fixed in future.
4713 */
4714 if (so->so_proto->pr_getlock != NULL) {
4715 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4716 } else {
4717 mutex_held = so->so_proto->pr_domain->dom_mtx;
4718 }
4719
4720 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4721 #endif /* notyet */
4722
4723 sflt_notify(so, sock_evt_flush_read, NULL);
4724
4725 socantrcvmore(so);
4726
4727 /*
4728 * Obtain lock on the socket buffer (SB_LOCK). This is required
4729 * to prevent the socket buffer from being unexpectedly altered
4730 * while it is used by another thread in socket send/receive.
4731 *
4732 * sblock() must not fail here, hence the assertion.
4733 */
4734 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4735 VERIFY(sb->sb_flags & SB_LOCK);
4736
4737 /*
4738 * Copy only the relevant fields from "sb" to "asb" which we
4739 * need for sbrelease() to function. In particular, skip
4740 * sb_sel as it contains the wait queue linkage, which would
4741 * wreak havoc if we were to issue selthreadclear() on "asb".
4742 * Make sure to not carry over SB_LOCK in "asb", as we need
4743 * to acquire it later as part of sbrelease().
4744 */
4745 bzero(&asb, sizeof(asb));
4746 asb.sb_cc = sb->sb_cc;
4747 asb.sb_hiwat = sb->sb_hiwat;
4748 asb.sb_mbcnt = sb->sb_mbcnt;
4749 asb.sb_mbmax = sb->sb_mbmax;
4750 asb.sb_ctl = sb->sb_ctl;
4751 asb.sb_lowat = sb->sb_lowat;
4752 asb.sb_mb = sb->sb_mb;
4753 asb.sb_mbtail = sb->sb_mbtail;
4754 asb.sb_lastrecord = sb->sb_lastrecord;
4755 asb.sb_so = sb->sb_so;
4756 asb.sb_flags = sb->sb_flags;
4757 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4758 asb.sb_flags |= SB_DROP;
4759
4760 /*
4761 * Ideally we'd bzero() these and preserve the ones we need;
4762 * but to do that we'd need to shuffle things around in the
4763 * sockbuf, and we can't do it now because there are KEXTS
4764 * that are directly referring to the socket structure.
4765 *
4766 * Setting SB_DROP acts as a barrier to prevent further appends.
4767 * Clearing SB_SEL is done for selthreadclear() below.
4768 */
4769 sb->sb_cc = 0;
4770 sb->sb_hiwat = 0;
4771 sb->sb_mbcnt = 0;
4772 sb->sb_mbmax = 0;
4773 sb->sb_ctl = 0;
4774 sb->sb_lowat = 0;
4775 sb->sb_mb = NULL;
4776 sb->sb_mbtail = NULL;
4777 sb->sb_lastrecord = NULL;
4778 sb->sb_timeo.tv_sec = 0;
4779 sb->sb_timeo.tv_usec = 0;
4780 sb->sb_upcall = NULL;
4781 sb->sb_upcallarg = NULL;
4782 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4783 sb->sb_flags |= SB_DROP;
4784
4785 sbunlock(sb, TRUE); /* keep socket locked */
4786
4787 /*
4788 * Note that selthreadclear() is called on the original "sb" and
4789 * not the local "asb" because of the way wait queue linkage is
4790 * implemented. Given that selwakeup() may be triggered, SB_SEL
4791 * should no longer be set (cleared above.)
4792 */
4793 selthreadclear(&sb->sb_sel);
4794
4795 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4796 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4797 }
4798
4799 sbrelease(&asb);
4800 }
4801
4802 /*
4803 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4804 * an additional variant to handle the case where the option value needs
4805 * to be some kind of integer, but not a specific size.
4806 * In addition to their use here, these functions are also called by the
4807 * protocol-level pr_ctloutput() routines.
4808 *
4809 * Returns: 0 Success
4810 * EINVAL
4811 * copyin:EFAULT
4812 */
4813 int
4814 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4815 {
4816 size_t valsize;
4817
4818 /*
4819 * If the user gives us more than we wanted, we ignore it,
4820 * but if we don't get the minimum length the caller
4821 * wants, we return EINVAL. On success, sopt->sopt_valsize
4822 * is set to however much we actually retrieved.
4823 */
4824 if ((valsize = sopt->sopt_valsize) < minlen) {
4825 return EINVAL;
4826 }
4827 if (valsize > len) {
4828 sopt->sopt_valsize = valsize = len;
4829 }
4830
4831 if (sopt->sopt_p != kernproc) {
4832 return copyin(sopt->sopt_val, buf, valsize);
4833 }
4834
4835 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4836 return 0;
4837 }
4838
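/*
 * A minimal sketch of how a protocol-level pr_ctloutput() handler typically
 * consumes sooptcopyin(): copy the value into a fixed-size integer, insisting
 * on at least sizeof(int).  The option handler name my_proto_setopt() is
 * hypothetical; only sooptcopyin() itself is a real entry point.
 */
#if 0	/* example only; never compiled */
static int
my_proto_setopt(struct socket *so, struct sockopt *sopt)
{
	int optval;
	int error;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error != 0)
		return error;	/* EINVAL if too short, EFAULT on copyin */
	/* ... apply optval to the protocol control block of 'so' ... */
	return 0;
}
#endif
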
4839 /*
4840 * sooptcopyin_timeval
4841 * Copy in a timeval value into tv_p, and take into account whether the
4842 * calling process is 64-bit or 32-bit. Moved the sanity checking
4843 * code here so that we can verify the 64-bit tv_sec value before we lose
4844 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4845 */
4846 static int
4847 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4848 {
4849 int error;
4850
4851 if (proc_is64bit(sopt->sopt_p)) {
4852 struct user64_timeval tv64;
4853
4854 if (sopt->sopt_valsize < sizeof(tv64)) {
4855 return EINVAL;
4856 }
4857
4858 sopt->sopt_valsize = sizeof(tv64);
4859 if (sopt->sopt_p != kernproc) {
4860 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4861 if (error != 0) {
4862 return error;
4863 }
4864 } else {
4865 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4866 sizeof(tv64));
4867 }
4868 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4869 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4870 return EDOM;
4871 }
4872
4873 tv_p->tv_sec = tv64.tv_sec;
4874 tv_p->tv_usec = tv64.tv_usec;
4875 } else {
4876 struct user32_timeval tv32;
4877
4878 if (sopt->sopt_valsize < sizeof(tv32)) {
4879 return EINVAL;
4880 }
4881
4882 sopt->sopt_valsize = sizeof(tv32);
4883 if (sopt->sopt_p != kernproc) {
4884 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4885 if (error != 0) {
4886 return error;
4887 }
4888 } else {
4889 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4890 sizeof(tv32));
4891 }
4892 #ifndef __LP64__
4893 /*
4894 * K64todo "comparison is always false due to
4895 * limited range of data type"
4896 */
4897 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4898 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4899 return EDOM;
4900 }
4901 #endif
4902 tv_p->tv_sec = tv32.tv_sec;
4903 tv_p->tv_usec = tv32.tv_usec;
4904 }
4905 return 0;
4906 }
4907
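/*
 * A small userland sketch of the timeval layout that sooptcopyin_timeval()
 * parses: SO_RCVTIMEO takes a struct timeval whose tv_sec must be
 * non-negative and whose tv_usec must be in [0, 1000000), otherwise the
 * call fails with EDOM.  The five-second value is an arbitrary choice for
 * illustration.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int s)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

	/* a zero timeval restores blocking behaviour */
	return setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif
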
4908 int
4909 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
4910 {
4911 kauth_cred_t cred = NULL;
4912 proc_t ep = PROC_NULL;
4913 uid_t uid;
4914 int error = 0;
4915
4916 if (so->so_flags & SOF_DELEGATED) {
4917 ep = proc_find(so->e_pid);
4918 if (ep) {
4919 cred = kauth_cred_proc_ref(ep);
4920 }
4921 }
4922
4923 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4924
4925 /* uid is 0 for root */
4926 if (uid != 0 || !allow_root) {
4927 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4928 }
4929 if (cred) {
4930 kauth_cred_unref(&cred);
4931 }
4932 if (ep != PROC_NULL) {
4933 proc_rele(ep);
4934 }
4935
4936 return error;
4937 }
4938
4939 /*
4940 * Returns: 0 Success
4941 * EINVAL
4942 * ENOPROTOOPT
4943 * ENOBUFS
4944 * EDOM
4945 * sooptcopyin:EINVAL
4946 * sooptcopyin:EFAULT
4947 * sooptcopyin_timeval:EINVAL
4948 * sooptcopyin_timeval:EFAULT
4949 * sooptcopyin_timeval:EDOM
4950 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4951 * <pr_ctloutput>:???
4952 * sflt_attach_private:??? [whatever a filter author chooses]
4953 * <sf_setoption>:??? [whatever a filter author chooses]
4954 *
4955 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
4956 * all <sf_setoption> returns depend on what the filter author causes
4957 * their filter to return.
4958 */
4959 int
4960 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4961 {
4962 int error, optval;
4963 struct linger l;
4964 struct timeval tv;
4965 #if CONFIG_MACF_SOCKET
4966 struct mac extmac;
4967 #endif /* MAC_SOCKET */
4968
4969 if (sopt->sopt_dir != SOPT_SET) {
4970 sopt->sopt_dir = SOPT_SET;
4971 }
4972
4973 if (dolock) {
4974 socket_lock(so, 1);
4975 }
4976
4977 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4978 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4979 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4980 /* the socket has been shutdown, no more sockopt's */
4981 error = EINVAL;
4982 goto out;
4983 }
4984
4985 error = sflt_setsockopt(so, sopt);
4986 if (error != 0) {
4987 if (error == EJUSTRETURN) {
4988 error = 0;
4989 }
4990 goto out;
4991 }
4992
4993 if (sopt->sopt_level != SOL_SOCKET) {
4994 if (so->so_proto != NULL &&
4995 so->so_proto->pr_ctloutput != NULL) {
4996 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4997 goto out;
4998 }
4999 error = ENOPROTOOPT;
5000 } else {
5001 /*
5002 * Allow socket-level (SOL_SOCKET) options to be filtered by
5003 * the protocol layer, if needed. A zero value returned from
5004 * the handler means use default socket-level processing as
5005 * done by the rest of this routine. Otherwise, any other
5006 * return value indicates that the option is unsupported.
5007 */
5008 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5009 pru_socheckopt(so, sopt)) != 0) {
5010 goto out;
5011 }
5012
5013 error = 0;
5014 switch (sopt->sopt_name) {
5015 case SO_LINGER:
5016 case SO_LINGER_SEC:
5017 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5018 if (error != 0) {
5019 goto out;
5020 }
5021
5022 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5023 l.l_linger : l.l_linger * hz;
5024 if (l.l_onoff != 0) {
5025 so->so_options |= SO_LINGER;
5026 } else {
5027 so->so_options &= ~SO_LINGER;
5028 }
5029 break;
5030
5031 case SO_DEBUG:
5032 case SO_KEEPALIVE:
5033 case SO_DONTROUTE:
5034 case SO_USELOOPBACK:
5035 case SO_BROADCAST:
5036 case SO_REUSEADDR:
5037 case SO_REUSEPORT:
5038 case SO_OOBINLINE:
5039 case SO_TIMESTAMP:
5040 case SO_TIMESTAMP_MONOTONIC:
5041 case SO_TIMESTAMP_CONTINUOUS:
5042 case SO_DONTTRUNC:
5043 case SO_WANTMORE:
5044 case SO_WANTOOBFLAG:
5045 case SO_NOWAKEFROMSLEEP:
5046 case SO_NOAPNFALLBK:
5047 error = sooptcopyin(sopt, &optval, sizeof(optval),
5048 sizeof(optval));
5049 if (error != 0) {
5050 goto out;
5051 }
5052 if (optval) {
5053 so->so_options |= sopt->sopt_name;
5054 } else {
5055 so->so_options &= ~sopt->sopt_name;
5056 }
5057 break;
5058
5059 case SO_SNDBUF:
5060 case SO_RCVBUF:
5061 case SO_SNDLOWAT:
5062 case SO_RCVLOWAT:
5063 error = sooptcopyin(sopt, &optval, sizeof(optval),
5064 sizeof(optval));
5065 if (error != 0) {
5066 goto out;
5067 }
5068
5069 /*
5070 * Values < 1 make no sense for any of these
5071 * options, so disallow them.
5072 */
5073 if (optval < 1) {
5074 error = EINVAL;
5075 goto out;
5076 }
5077
5078 switch (sopt->sopt_name) {
5079 case SO_SNDBUF:
5080 case SO_RCVBUF: {
5081 struct sockbuf *sb =
5082 (sopt->sopt_name == SO_SNDBUF) ?
5083 &so->so_snd : &so->so_rcv;
5084 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5085 error = ENOBUFS;
5086 goto out;
5087 }
5088 sb->sb_flags |= SB_USRSIZE;
5089 sb->sb_flags &= ~SB_AUTOSIZE;
5090 sb->sb_idealsize = (u_int32_t)optval;
5091 break;
5092 }
5093 /*
5094 * Make sure the low-water is never greater than
5095 * the high-water.
5096 */
5097 case SO_SNDLOWAT: {
5098 int space = sbspace(&so->so_snd);
5099 u_int32_t hiwat = so->so_snd.sb_hiwat;
5100
5101 if (so->so_snd.sb_flags & SB_UNIX) {
5102 struct unpcb *unp =
5103 (struct unpcb *)(so->so_pcb);
5104 if (unp != NULL &&
5105 unp->unp_conn != NULL) {
5106 hiwat += unp->unp_conn->unp_cc;
5107 }
5108 }
5109
5110 so->so_snd.sb_lowat =
5111 (optval > hiwat) ?
5112 hiwat : optval;
5113
5114 if (space >= so->so_snd.sb_lowat) {
5115 sowwakeup(so);
5116 }
5117 break;
5118 }
5119 case SO_RCVLOWAT: {
5120 int64_t data_len;
5121 so->so_rcv.sb_lowat =
5122 (optval > so->so_rcv.sb_hiwat) ?
5123 so->so_rcv.sb_hiwat : optval;
5124 data_len = so->so_rcv.sb_cc
5125 - so->so_rcv.sb_ctl;
5126 if (data_len >= so->so_rcv.sb_lowat) {
5127 sorwakeup(so);
5128 }
5129 break;
5130 }
5131 }
5132 break;
5133
5134 case SO_SNDTIMEO:
5135 case SO_RCVTIMEO:
5136 error = sooptcopyin_timeval(sopt, &tv);
5137 if (error != 0) {
5138 goto out;
5139 }
5140
5141 switch (sopt->sopt_name) {
5142 case SO_SNDTIMEO:
5143 so->so_snd.sb_timeo = tv;
5144 break;
5145 case SO_RCVTIMEO:
5146 so->so_rcv.sb_timeo = tv;
5147 break;
5148 }
5149 break;
5150
5151 case SO_NKE: {
5152 struct so_nke nke;
5153
5154 error = sooptcopyin(sopt, &nke, sizeof(nke),
5155 sizeof(nke));
5156 if (error != 0) {
5157 goto out;
5158 }
5159
5160 error = sflt_attach_internal(so, nke.nke_handle);
5161 break;
5162 }
5163
5164 case SO_NOSIGPIPE:
5165 error = sooptcopyin(sopt, &optval, sizeof(optval),
5166 sizeof(optval));
5167 if (error != 0) {
5168 goto out;
5169 }
5170 if (optval != 0) {
5171 so->so_flags |= SOF_NOSIGPIPE;
5172 } else {
5173 so->so_flags &= ~SOF_NOSIGPIPE;
5174 }
5175 break;
5176
5177 case SO_NOADDRERR:
5178 error = sooptcopyin(sopt, &optval, sizeof(optval),
5179 sizeof(optval));
5180 if (error != 0) {
5181 goto out;
5182 }
5183 if (optval != 0) {
5184 so->so_flags |= SOF_NOADDRAVAIL;
5185 } else {
5186 so->so_flags &= ~SOF_NOADDRAVAIL;
5187 }
5188 break;
5189
5190 case SO_REUSESHAREUID:
5191 error = sooptcopyin(sopt, &optval, sizeof(optval),
5192 sizeof(optval));
5193 if (error != 0) {
5194 goto out;
5195 }
5196 if (optval != 0) {
5197 so->so_flags |= SOF_REUSESHAREUID;
5198 } else {
5199 so->so_flags &= ~SOF_REUSESHAREUID;
5200 }
5201 break;
5202
5203 case SO_NOTIFYCONFLICT:
5204 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5205 error = EPERM;
5206 goto out;
5207 }
5208 error = sooptcopyin(sopt, &optval, sizeof(optval),
5209 sizeof(optval));
5210 if (error != 0) {
5211 goto out;
5212 }
5213 if (optval != 0) {
5214 so->so_flags |= SOF_NOTIFYCONFLICT;
5215 } else {
5216 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5217 }
5218 break;
5219
5220 case SO_RESTRICTIONS:
5221 error = sooptcopyin(sopt, &optval, sizeof(optval),
5222 sizeof(optval));
5223 if (error != 0) {
5224 goto out;
5225 }
5226
5227 error = so_set_restrictions(so, optval);
5228 break;
5229
5230 case SO_AWDL_UNRESTRICTED:
5231 if (SOCK_DOM(so) != PF_INET &&
5232 SOCK_DOM(so) != PF_INET6) {
5233 error = EOPNOTSUPP;
5234 goto out;
5235 }
5236 error = sooptcopyin(sopt, &optval, sizeof(optval),
5237 sizeof(optval));
5238 if (error != 0) {
5239 goto out;
5240 }
5241 if (optval != 0) {
5242 error = soopt_cred_check(so,
5243 PRIV_NET_RESTRICTED_AWDL, false);
5244 if (error == 0) {
5245 inp_set_awdl_unrestricted(
5246 sotoinpcb(so));
5247 }
5248 } else {
5249 inp_clear_awdl_unrestricted(sotoinpcb(so));
5250 }
5251 break;
5252 case SO_INTCOPROC_ALLOW:
5253 if (SOCK_DOM(so) != PF_INET6) {
5254 error = EOPNOTSUPP;
5255 goto out;
5256 }
5257 error = sooptcopyin(sopt, &optval, sizeof(optval),
5258 sizeof(optval));
5259 if (error != 0) {
5260 goto out;
5261 }
5262 if (optval != 0 &&
5263 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5264 error = soopt_cred_check(so,
5265 PRIV_NET_RESTRICTED_INTCOPROC, false);
5266 if (error == 0) {
5267 inp_set_intcoproc_allowed(
5268 sotoinpcb(so));
5269 }
5270 } else if (optval == 0) {
5271 inp_clear_intcoproc_allowed(sotoinpcb(so));
5272 }
5273 break;
5274
5275 case SO_LABEL:
5276 #if CONFIG_MACF_SOCKET
5277 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5278 sizeof(extmac))) != 0) {
5279 goto out;
5280 }
5281
5282 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5283 so, &extmac);
5284 #else
5285 error = EOPNOTSUPP;
5286 #endif /* MAC_SOCKET */
5287 break;
5288
5289 case SO_UPCALLCLOSEWAIT:
5290 error = sooptcopyin(sopt, &optval, sizeof(optval),
5291 sizeof(optval));
5292 if (error != 0) {
5293 goto out;
5294 }
5295 if (optval != 0) {
5296 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5297 } else {
5298 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5299 }
5300 break;
5301
5302 case SO_RANDOMPORT:
5303 error = sooptcopyin(sopt, &optval, sizeof(optval),
5304 sizeof(optval));
5305 if (error != 0) {
5306 goto out;
5307 }
5308 if (optval != 0) {
5309 so->so_flags |= SOF_BINDRANDOMPORT;
5310 } else {
5311 so->so_flags &= ~SOF_BINDRANDOMPORT;
5312 }
5313 break;
5314
5315 case SO_NP_EXTENSIONS: {
5316 struct so_np_extensions sonpx;
5317
5318 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5319 sizeof(sonpx));
5320 if (error != 0) {
5321 goto out;
5322 }
5323 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5324 error = EINVAL;
5325 goto out;
5326 }
5327 /*
5328 * Only one bit defined for now
5329 */
5330 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5331 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5332 so->so_flags |= SOF_NPX_SETOPTSHUT;
5333 } else {
5334 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5335 }
5336 }
5337 break;
5338 }
5339
5340 case SO_TRAFFIC_CLASS: {
5341 error = sooptcopyin(sopt, &optval, sizeof(optval),
5342 sizeof(optval));
5343 if (error != 0) {
5344 goto out;
5345 }
5346 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5347 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5348 error = so_set_net_service_type(so, netsvc);
5349 goto out;
5350 }
5351 error = so_set_traffic_class(so, optval);
5352 if (error != 0) {
5353 goto out;
5354 }
5355 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5356 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5357 break;
5358 }
5359
5360 case SO_RECV_TRAFFIC_CLASS: {
5361 error = sooptcopyin(sopt, &optval, sizeof(optval),
5362 sizeof(optval));
5363 if (error != 0) {
5364 goto out;
5365 }
5366 if (optval == 0) {
5367 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5368 } else {
5369 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5370 }
5371 break;
5372 }
5373
5374 #if (DEVELOPMENT || DEBUG)
5375 case SO_TRAFFIC_CLASS_DBG: {
5376 struct so_tcdbg so_tcdbg;
5377
5378 error = sooptcopyin(sopt, &so_tcdbg,
5379 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5380 if (error != 0) {
5381 goto out;
5382 }
5383 error = so_set_tcdbg(so, &so_tcdbg);
5384 if (error != 0) {
5385 goto out;
5386 }
5387 break;
5388 }
5389 #endif /* (DEVELOPMENT || DEBUG) */
5390
5391 case SO_PRIVILEGED_TRAFFIC_CLASS:
5392 error = priv_check_cred(kauth_cred_get(),
5393 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5394 if (error != 0) {
5395 goto out;
5396 }
5397 error = sooptcopyin(sopt, &optval, sizeof(optval),
5398 sizeof(optval));
5399 if (error != 0) {
5400 goto out;
5401 }
5402 if (optval == 0) {
5403 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5404 } else {
5405 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5406 }
5407 break;
5408
5409 #if (DEVELOPMENT || DEBUG)
5410 case SO_DEFUNCTIT:
5411 error = sosetdefunct(current_proc(), so, 0, FALSE);
5412 if (error == 0) {
5413 error = sodefunct(current_proc(), so, 0);
5414 }
5415
5416 break;
5417 #endif /* (DEVELOPMENT || DEBUG) */
5418
5419 case SO_DEFUNCTOK:
5420 error = sooptcopyin(sopt, &optval, sizeof(optval),
5421 sizeof(optval));
5422 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5423 if (error == 0) {
5424 error = EBADF;
5425 }
5426 goto out;
5427 }
5428 /*
5429 * Any process can set SO_DEFUNCTOK (clear
5430 * SOF_NODEFUNCT), but only root can clear
5431 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5432 */
5433 if (optval == 0 &&
5434 kauth_cred_issuser(kauth_cred_get()) == 0) {
5435 error = EPERM;
5436 goto out;
5437 }
5438 if (optval) {
5439 so->so_flags &= ~SOF_NODEFUNCT;
5440 } else {
5441 so->so_flags |= SOF_NODEFUNCT;
5442 }
5443
5444 if (SOCK_DOM(so) == PF_INET ||
5445 SOCK_DOM(so) == PF_INET6) {
5446 char s[MAX_IPv6_STR_LEN];
5447 char d[MAX_IPv6_STR_LEN];
5448 struct inpcb *inp = sotoinpcb(so);
5449
5450 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5451 "[%s %s:%d -> %s:%d] is now marked "
5452 "as %seligible for "
5453 "defunct\n", __func__, proc_selfpid(),
5454 proc_best_name(current_proc()),
5455 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5456 (SOCK_TYPE(so) == SOCK_STREAM) ?
5457 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5458 ((SOCK_DOM(so) == PF_INET) ?
5459 (void *)&inp->inp_laddr.s_addr :
5460 (void *)&inp->in6p_laddr), s, sizeof(s)),
5461 ntohs(inp->in6p_lport),
5462 inet_ntop(SOCK_DOM(so),
5463 (SOCK_DOM(so) == PF_INET) ?
5464 (void *)&inp->inp_faddr.s_addr :
5465 (void *)&inp->in6p_faddr, d, sizeof(d)),
5466 ntohs(inp->in6p_fport),
5467 (so->so_flags & SOF_NODEFUNCT) ?
5468 "not " : "");
5469 } else {
5470 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5471 "is now marked as %seligible for "
5472 "defunct\n",
5473 __func__, proc_selfpid(),
5474 proc_best_name(current_proc()),
5475 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5476 SOCK_DOM(so), SOCK_TYPE(so),
5477 (so->so_flags & SOF_NODEFUNCT) ?
5478 "not " : "");
5479 }
5480 break;
5481
5482 case SO_ISDEFUNCT:
5483 /* This option is not settable */
5484 error = EINVAL;
5485 break;
5486
5487 case SO_OPPORTUNISTIC:
5488 error = sooptcopyin(sopt, &optval, sizeof(optval),
5489 sizeof(optval));
5490 if (error == 0) {
5491 error = so_set_opportunistic(so, optval);
5492 }
5493 break;
5494
5495 case SO_FLUSH:
5496 /* This option is handled by lower layer(s) */
5497 error = 0;
5498 break;
5499
5500 case SO_RECV_ANYIF:
5501 error = sooptcopyin(sopt, &optval, sizeof(optval),
5502 sizeof(optval));
5503 if (error == 0) {
5504 error = so_set_recv_anyif(so, optval);
5505 }
5506 break;
5507
5508 case SO_TRAFFIC_MGT_BACKGROUND: {
5509 /* This option is handled by lower layer(s) */
5510 error = 0;
5511 break;
5512 }
5513
5514 #if FLOW_DIVERT
5515 case SO_FLOW_DIVERT_TOKEN:
5516 error = flow_divert_token_set(so, sopt);
5517 break;
5518 #endif /* FLOW_DIVERT */
5519
5520
5521 case SO_DELEGATED:
5522 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5523 sizeof(optval))) != 0) {
5524 break;
5525 }
5526
5527 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5528 break;
5529
5530 case SO_DELEGATED_UUID: {
5531 uuid_t euuid;
5532
5533 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5534 sizeof(euuid))) != 0) {
5535 break;
5536 }
5537
5538 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5539 break;
5540 }
5541
5542 #if NECP
5543 case SO_NECP_ATTRIBUTES:
5544 error = necp_set_socket_attributes(so, sopt);
5545 break;
5546
5547 case SO_NECP_CLIENTUUID:
5548 if (SOCK_DOM(so) == PF_MULTIPATH) {
5549 /* Handled by MPTCP itself */
5550 break;
5551 }
5552
5553 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5554 error = EINVAL;
5555 goto out;
5556 }
5557
5558 struct inpcb *inp = sotoinpcb(so);
5559 if (!uuid_is_null(inp->necp_client_uuid)) {
5560 // Clear out the old client UUID if present
5561 necp_inpcb_remove_cb(inp);
5562 }
5563
5564 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5565 sizeof(uuid_t), sizeof(uuid_t));
5566 if (error != 0) {
5567 goto out;
5568 }
5569
5570 if (uuid_is_null(inp->necp_client_uuid)) {
5571 error = EINVAL;
5572 goto out;
5573 }
5574
5575 error = necp_client_register_socket_flow(so->last_pid,
5576 inp->necp_client_uuid, inp);
5577 if (error != 0) {
5578 uuid_clear(inp->necp_client_uuid);
5579 goto out;
5580 }
5581
5582 if (inp->inp_lport != 0) {
5583 // There is a bound local port, so this is not
5584 // a fresh socket. Assign to the client.
5585 necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5586 }
5587
5588 break;
5589 #endif /* NECP */
5590
5591 case SO_EXTENDED_BK_IDLE:
5592 error = sooptcopyin(sopt, &optval, sizeof(optval),
5593 sizeof(optval));
5594 if (error == 0) {
5595 error = so_set_extended_bk_idle(so, optval);
5596 }
5597 break;
5598
5599 case SO_MARK_CELLFALLBACK:
5600 error = sooptcopyin(sopt, &optval, sizeof(optval),
5601 sizeof(optval));
5602 if (error != 0) {
5603 goto out;
5604 }
5605 if (optval < 0) {
5606 error = EINVAL;
5607 goto out;
5608 }
5609 if (optval == 0) {
5610 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5611 } else {
5612 so->so_flags1 |= SOF1_CELLFALLBACK;
5613 }
5614 break;
5615
5616 case SO_NET_SERVICE_TYPE: {
5617 error = sooptcopyin(sopt, &optval, sizeof(optval),
5618 sizeof(optval));
5619 if (error != 0) {
5620 goto out;
5621 }
5622 error = so_set_net_service_type(so, optval);
5623 break;
5624 }
5625
5626 case SO_QOSMARKING_POLICY_OVERRIDE:
5627 error = priv_check_cred(kauth_cred_get(),
5628 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5629 if (error != 0) {
5630 goto out;
5631 }
5632 error = sooptcopyin(sopt, &optval, sizeof(optval),
5633 sizeof(optval));
5634 if (error != 0) {
5635 goto out;
5636 }
5637 if (optval == 0) {
5638 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5639 } else {
5640 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5641 }
5642 break;
5643
5644 default:
5645 error = ENOPROTOOPT;
5646 break;
5647 }
5648 if (error == 0 && so->so_proto != NULL &&
5649 so->so_proto->pr_ctloutput != NULL) {
5650 (void) so->so_proto->pr_ctloutput(so, sopt);
5651 }
5652 }
5653 out:
5654 if (dolock) {
5655 socket_unlock(so, 1);
5656 }
5657 return error;
5658 }
5659
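/*
 * A brief userland sketch of a few of the socket-level options handled by
 * sosetoptlock() above.  SO_NOSIGPIPE suppresses SIGPIPE in favour of an
 * EPIPE error, SO_LINGER_SEC takes its l_linger value in seconds (converted
 * to ticks internally), and SO_RCVBUF rejects values below 1 with EINVAL.
 * The helper name and the particular values chosen are illustrative only.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/socket.h>

static int
tune_socket(int s)
{
	int one = 1;
	int rcvbuf = 128 * 1024;
	struct linger lg = { .l_onoff = 1, .l_linger = 2 };	/* 2 seconds */

	if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == -1)
		return -1;
	if (setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &lg, sizeof(lg)) == -1)
		return -1;
	/* fails with ENOBUFS if sbreserve() cannot grow the buffer */
	return setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf));
}
#endif
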
5660 /* Helper routines for getsockopt */
5661 int
5662 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5663 {
5664 int error;
5665 size_t valsize;
5666
5667 error = 0;
5668
5669 /*
5670 * Documented get behavior is that we always return a value,
5671 * possibly truncated to fit in the user's buffer.
5672 * Traditional behavior is that we always tell the user
5673 * precisely how much we copied, rather than something useful
5674 * like the total amount we had available for her.
5675 * Note that this interface is not idempotent; the entire answer must be
5676 * generated ahead of time.
5677 */
5678 valsize = min(len, sopt->sopt_valsize);
5679 sopt->sopt_valsize = valsize;
5680 if (sopt->sopt_val != USER_ADDR_NULL) {
5681 if (sopt->sopt_p != kernproc) {
5682 error = copyout(buf, sopt->sopt_val, valsize);
5683 } else {
5684 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5685 }
5686 }
5687 return error;
5688 }
5689
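/*
 * A minimal sketch of the matching "get" side: a hypothetical protocol
 * handler returning an integer with sooptcopyout().  As the comment above
 * notes, the value is silently truncated to the caller's buffer size and
 * sopt_valsize is updated to what was actually copied.  The handler name
 * my_proto_getopt() is an assumption.
 */
#if 0	/* example only; never compiled */
static int
my_proto_getopt(struct socket *so, struct sockopt *sopt)
{
	int optval = 0;

	/* ... read optval from the protocol control block of 'so' ... */
	return sooptcopyout(sopt, &optval, sizeof(optval));
}
#endif
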
5690 static int
5691 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5692 {
5693 int error;
5694 size_t len;
5695 struct user64_timeval tv64 = {};
5696 struct user32_timeval tv32 = {};
5697 const void * val;
5698 size_t valsize;
5699
5700 error = 0;
5701 if (proc_is64bit(sopt->sopt_p)) {
5702 len = sizeof(tv64);
5703 tv64.tv_sec = tv_p->tv_sec;
5704 tv64.tv_usec = tv_p->tv_usec;
5705 val = &tv64;
5706 } else {
5707 len = sizeof(tv32);
5708 tv32.tv_sec = tv_p->tv_sec;
5709 tv32.tv_usec = tv_p->tv_usec;
5710 val = &tv32;
5711 }
5712 valsize = min(len, sopt->sopt_valsize);
5713 sopt->sopt_valsize = valsize;
5714 if (sopt->sopt_val != USER_ADDR_NULL) {
5715 if (sopt->sopt_p != kernproc) {
5716 error = copyout(val, sopt->sopt_val, valsize);
5717 } else {
5718 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5719 }
5720 }
5721 return error;
5722 }
5723
5724 /*
5725 * Return: 0 Success
5726 * ENOPROTOOPT
5727 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5728 * <pr_ctloutput>:???
5729 * <sf_getoption>:???
5730 */
5731 int
5732 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5733 {
5734 int error, optval;
5735 struct linger l;
5736 struct timeval tv;
5737 #if CONFIG_MACF_SOCKET
5738 struct mac extmac;
5739 #endif /* MAC_SOCKET */
5740
5741 if (sopt->sopt_dir != SOPT_GET) {
5742 sopt->sopt_dir = SOPT_GET;
5743 }
5744
5745 if (dolock) {
5746 socket_lock(so, 1);
5747 }
5748
5749 error = sflt_getsockopt(so, sopt);
5750 if (error != 0) {
5751 if (error == EJUSTRETURN) {
5752 error = 0;
5753 }
5754 goto out;
5755 }
5756
5757 if (sopt->sopt_level != SOL_SOCKET) {
5758 if (so->so_proto != NULL &&
5759 so->so_proto->pr_ctloutput != NULL) {
5760 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5761 goto out;
5762 }
5763 error = ENOPROTOOPT;
5764 } else {
5765 /*
5766 * Allow socket-level (SOL_SOCKET) options to be filtered by
5767 * the protocol layer, if needed. A zero value returned from
5768 * the handler means use default socket-level processing as
5769 * done by the rest of this routine. Otherwise, any other
5770 * return value indicates that the option is unsupported.
5771 */
5772 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5773 pru_socheckopt(so, sopt)) != 0) {
5774 goto out;
5775 }
5776
5777 error = 0;
5778 switch (sopt->sopt_name) {
5779 case SO_LINGER:
5780 case SO_LINGER_SEC:
5781 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5782 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5783 so->so_linger : so->so_linger / hz;
5784 error = sooptcopyout(sopt, &l, sizeof(l));
5785 break;
5786
5787 case SO_USELOOPBACK:
5788 case SO_DONTROUTE:
5789 case SO_DEBUG:
5790 case SO_KEEPALIVE:
5791 case SO_REUSEADDR:
5792 case SO_REUSEPORT:
5793 case SO_BROADCAST:
5794 case SO_OOBINLINE:
5795 case SO_TIMESTAMP:
5796 case SO_TIMESTAMP_MONOTONIC:
5797 case SO_TIMESTAMP_CONTINUOUS:
5798 case SO_DONTTRUNC:
5799 case SO_WANTMORE:
5800 case SO_WANTOOBFLAG:
5801 case SO_NOWAKEFROMSLEEP:
5802 case SO_NOAPNFALLBK:
5803 optval = so->so_options & sopt->sopt_name;
5804 integer:
5805 error = sooptcopyout(sopt, &optval, sizeof(optval));
5806 break;
5807
5808 case SO_TYPE:
5809 optval = so->so_type;
5810 goto integer;
5811
5812 case SO_NREAD:
5813 if (so->so_proto->pr_flags & PR_ATOMIC) {
5814 int pkt_total;
5815 struct mbuf *m1;
5816
5817 pkt_total = 0;
5818 m1 = so->so_rcv.sb_mb;
5819 while (m1 != NULL) {
5820 if (m1->m_type == MT_DATA ||
5821 m1->m_type == MT_HEADER ||
5822 m1->m_type == MT_OOBDATA) {
5823 pkt_total += m1->m_len;
5824 }
5825 m1 = m1->m_next;
5826 }
5827 optval = pkt_total;
5828 } else {
5829 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5830 }
5831 goto integer;
5832
5833 case SO_NUMRCVPKT:
5834 if (so->so_proto->pr_flags & PR_ATOMIC) {
5835 int cnt = 0;
5836 struct mbuf *m1;
5837
5838 m1 = so->so_rcv.sb_mb;
5839 while (m1 != NULL) {
5840 if (m1->m_type == MT_DATA ||
5841 m1->m_type == MT_HEADER ||
5842 m1->m_type == MT_OOBDATA) {
5843 cnt += 1;
5844 }
5845 m1 = m1->m_nextpkt;
5846 }
5847 optval = cnt;
5848 goto integer;
5849 } else {
5850 error = EINVAL;
5851 break;
5852 }
5853
5854 case SO_NWRITE:
5855 optval = so->so_snd.sb_cc;
5856 goto integer;
5857
5858 case SO_ERROR:
5859 optval = so->so_error;
5860 so->so_error = 0;
5861 goto integer;
5862
5863 case SO_SNDBUF: {
5864 u_int32_t hiwat = so->so_snd.sb_hiwat;
5865
5866 if (so->so_snd.sb_flags & SB_UNIX) {
5867 struct unpcb *unp =
5868 (struct unpcb *)(so->so_pcb);
5869 if (unp != NULL && unp->unp_conn != NULL) {
5870 hiwat += unp->unp_conn->unp_cc;
5871 }
5872 }
5873
5874 optval = hiwat;
5875 goto integer;
5876 }
5877 case SO_RCVBUF:
5878 optval = so->so_rcv.sb_hiwat;
5879 goto integer;
5880
5881 case SO_SNDLOWAT:
5882 optval = so->so_snd.sb_lowat;
5883 goto integer;
5884
5885 case SO_RCVLOWAT:
5886 optval = so->so_rcv.sb_lowat;
5887 goto integer;
5888
5889 case SO_SNDTIMEO:
5890 case SO_RCVTIMEO:
5891 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5892 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5893
5894 error = sooptcopyout_timeval(sopt, &tv);
5895 break;
5896
5897 case SO_NOSIGPIPE:
5898 optval = (so->so_flags & SOF_NOSIGPIPE);
5899 goto integer;
5900
5901 case SO_NOADDRERR:
5902 optval = (so->so_flags & SOF_NOADDRAVAIL);
5903 goto integer;
5904
5905 case SO_REUSESHAREUID:
5906 optval = (so->so_flags & SOF_REUSESHAREUID);
5907 goto integer;
5908
5909
5910 case SO_NOTIFYCONFLICT:
5911 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5912 goto integer;
5913
5914 case SO_RESTRICTIONS:
5915 optval = so_get_restrictions(so);
5916 goto integer;
5917
5918 case SO_AWDL_UNRESTRICTED:
5919 if (SOCK_DOM(so) == PF_INET ||
5920 SOCK_DOM(so) == PF_INET6) {
5921 optval = inp_get_awdl_unrestricted(
5922 sotoinpcb(so));
5923 goto integer;
5924 } else {
5925 error = EOPNOTSUPP;
5926 }
5927 break;
5928
5929 case SO_INTCOPROC_ALLOW:
5930 if (SOCK_DOM(so) == PF_INET6) {
5931 optval = inp_get_intcoproc_allowed(
5932 sotoinpcb(so));
5933 goto integer;
5934 } else {
5935 error = EOPNOTSUPP;
5936 }
5937 break;
5938
5939 case SO_LABEL:
5940 #if CONFIG_MACF_SOCKET
5941 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5942 sizeof(extmac))) != 0 ||
5943 (error = mac_socket_label_get(proc_ucred(
5944 sopt->sopt_p), so, &extmac)) != 0) {
5945 break;
5946 }
5947
5948 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
5949 #else
5950 error = EOPNOTSUPP;
5951 #endif /* MAC_SOCKET */
5952 break;
5953
5954 case SO_PEERLABEL:
5955 #if CONFIG_MACF_SOCKET
5956 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5957 sizeof(extmac))) != 0 ||
5958 (error = mac_socketpeer_label_get(proc_ucred(
5959 sopt->sopt_p), so, &extmac)) != 0) {
5960 break;
5961 }
5962
5963 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
5964 #else
5965 error = EOPNOTSUPP;
5966 #endif /* MAC_SOCKET */
5967 break;
5968
5969 #ifdef __APPLE_API_PRIVATE
5970 case SO_UPCALLCLOSEWAIT:
5971 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5972 goto integer;
5973 #endif
5974 case SO_RANDOMPORT:
5975 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5976 goto integer;
5977
5978 case SO_NP_EXTENSIONS: {
5979 struct so_np_extensions sonpx = {};
5980
5981 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5982 SONPX_SETOPTSHUT : 0;
5983 sonpx.npx_mask = SONPX_MASK_VALID;
5984
5985 error = sooptcopyout(sopt, &sonpx,
5986 sizeof(struct so_np_extensions));
5987 break;
5988 }
5989
5990 case SO_TRAFFIC_CLASS:
5991 optval = so->so_traffic_class;
5992 goto integer;
5993
5994 case SO_RECV_TRAFFIC_CLASS:
5995 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5996 goto integer;
5997
5998 case SO_TRAFFIC_CLASS_STATS:
5999 error = sooptcopyout(sopt, &so->so_tc_stats,
6000 sizeof(so->so_tc_stats));
6001 break;
6002
6003 #if (DEVELOPMENT || DEBUG)
6004 case SO_TRAFFIC_CLASS_DBG:
6005 error = sogetopt_tcdbg(so, sopt);
6006 break;
6007 #endif /* (DEVELOPMENT || DEBUG) */
6008
6009 case SO_PRIVILEGED_TRAFFIC_CLASS:
6010 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6011 goto integer;
6012
6013 case SO_DEFUNCTOK:
6014 optval = !(so->so_flags & SOF_NODEFUNCT);
6015 goto integer;
6016
6017 case SO_ISDEFUNCT:
6018 optval = (so->so_flags & SOF_DEFUNCT);
6019 goto integer;
6020
6021 case SO_OPPORTUNISTIC:
6022 optval = so_get_opportunistic(so);
6023 goto integer;
6024
6025 case SO_FLUSH:
6026 /* This option is not gettable */
6027 error = EINVAL;
6028 break;
6029
6030 case SO_RECV_ANYIF:
6031 optval = so_get_recv_anyif(so);
6032 goto integer;
6033
6034 case SO_TRAFFIC_MGT_BACKGROUND:
6035 /* This option is handled by lower layer(s) */
6036 if (so->so_proto != NULL &&
6037 so->so_proto->pr_ctloutput != NULL) {
6038 (void) so->so_proto->pr_ctloutput(so, sopt);
6039 }
6040 break;
6041
6042 #if FLOW_DIVERT
6043 case SO_FLOW_DIVERT_TOKEN:
6044 error = flow_divert_token_get(so, sopt);
6045 break;
6046 #endif /* FLOW_DIVERT */
6047
6048 #if NECP
6049 case SO_NECP_ATTRIBUTES:
6050 error = necp_get_socket_attributes(so, sopt);
6051 break;
6052
6053 case SO_NECP_CLIENTUUID:
6054 {
6055 uuid_t *ncu;
6056
6057 if (SOCK_DOM(so) == PF_MULTIPATH) {
6058 ncu = &mpsotomppcb(so)->necp_client_uuid;
6059 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6060 ncu = &sotoinpcb(so)->necp_client_uuid;
6061 } else {
6062 error = EINVAL;
6063 goto out;
6064 }
6065
6066 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6067 break;
6068 }
6069 #endif /* NECP */
6070
6071 #if CONTENT_FILTER
6072 case SO_CFIL_SOCK_ID: {
6073 cfil_sock_id_t sock_id;
6074
6075 sock_id = cfil_sock_id_from_socket(so);
6076
6077 error = sooptcopyout(sopt, &sock_id,
6078 sizeof(cfil_sock_id_t));
6079 break;
6080 }
6081 #endif /* CONTENT_FILTER */
6082
6083 case SO_EXTENDED_BK_IDLE:
6084 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6085 goto integer;
6086 case SO_MARK_CELLFALLBACK:
6087 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6088 ? 1 : 0;
6089 goto integer;
6090 case SO_NET_SERVICE_TYPE: {
6091 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6092 optval = so->so_netsvctype;
6093 } else {
6094 optval = NET_SERVICE_TYPE_BE;
6095 }
6096 goto integer;
6097 }
6098 case SO_NETSVC_MARKING_LEVEL:
6099 optval = so_get_netsvc_marking_level(so);
6100 goto integer;
6101
6102 default:
6103 error = ENOPROTOOPT;
6104 break;
6105 }
6106 }
6107 out:
6108 if (dolock) {
6109 socket_unlock(so, 1);
6110 }
6111 return error;
6112 }
6113
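/*
 * A small userland sketch of two of the read-only options served by
 * sogetoptlock() above: SO_NREAD reports the bytes available to read (the
 * size of the next datagram on a PR_ATOMIC socket), and SO_ERROR returns
 * and clears any pending error.  The helper name is illustrative only.
 */
#if 0	/* example only; never compiled into the kernel */
#include <sys/socket.h>

static void
query_socket(int s, int *nread, int *soerr)
{
	socklen_t len = sizeof(int);

	(void) getsockopt(s, SOL_SOCKET, SO_NREAD, nread, &len);
	len = sizeof(int);
	(void) getsockopt(s, SOL_SOCKET, SO_ERROR, soerr, &len);	/* clears so_error */
}
#endif
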
6114 /*
6115 * The size limit on our soopt_getm is different from that on FreeBSD.
6116 * We limit the size of options to MCLBYTES. This will have to change
6117 * if we need to define options that need more space than MCLBYTES.
6118 */
6119 int
6120 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6121 {
6122 struct mbuf *m, *m_prev;
6123 int sopt_size = sopt->sopt_valsize;
6124 int how;
6125
6126 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6127 return EMSGSIZE;
6128 }
6129
6130 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6131 MGET(m, how, MT_DATA);
6132 if (m == NULL) {
6133 return ENOBUFS;
6134 }
6135 if (sopt_size > MLEN) {
6136 MCLGET(m, how);
6137 if ((m->m_flags & M_EXT) == 0) {
6138 m_free(m);
6139 return ENOBUFS;
6140 }
6141 m->m_len = min(MCLBYTES, sopt_size);
6142 } else {
6143 m->m_len = min(MLEN, sopt_size);
6144 }
6145 sopt_size -= m->m_len;
6146 *mp = m;
6147 m_prev = m;
6148
6149 while (sopt_size > 0) {
6150 MGET(m, how, MT_DATA);
6151 if (m == NULL) {
6152 m_freem(*mp);
6153 return ENOBUFS;
6154 }
6155 if (sopt_size > MLEN) {
6156 MCLGET(m, how);
6157 if ((m->m_flags & M_EXT) == 0) {
6158 m_freem(*mp);
6159 m_freem(m);
6160 return ENOBUFS;
6161 }
6162 m->m_len = min(MCLBYTES, sopt_size);
6163 } else {
6164 m->m_len = min(MLEN, sopt_size);
6165 }
6166 sopt_size -= m->m_len;
6167 m_prev->m_next = m;
6168 m_prev = m;
6169 }
6170 return 0;
6171 }
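
/*
 * Illustrative sketch (not part of this file): a protocol-level ctloutput
 * handler would typically pair soopt_getm() with soopt_mcopyin() to pull
 * variable-length option data into an mbuf chain, as the IPv6 option code
 * does. The handler name below is hypothetical and error handling is
 * abbreviated.
 *
 *	static int
 *	example_opt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
 *	{
 *		int error;
 *
 *		error = soopt_getm(sopt, mp);	  // size check + chain alloc
 *		if (error != 0)
 *			return (error);
 *		error = soopt_mcopyin(sopt, *mp); // frees the chain on error
 *		if (error != 0)
 *			return (error);
 *		return (0);	// caller consumes and later m_freem()s *mp
 *	}
 */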
6172
6173 /* copyin sopt data into mbuf chain */
6174 int
6175 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6176 {
6177 struct mbuf *m0 = m;
6178
6179 if (sopt->sopt_val == USER_ADDR_NULL) {
6180 return 0;
6181 }
6182 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6183 if (sopt->sopt_p != kernproc) {
6184 int error;
6185
6186 error = copyin(sopt->sopt_val, mtod(m, char *),
6187 m->m_len);
6188 if (error != 0) {
6189 m_freem(m0);
6190 return error;
6191 }
6192 } else {
6193 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6194 mtod(m, char *), m->m_len);
6195 }
6196 sopt->sopt_valsize -= m->m_len;
6197 sopt->sopt_val += m->m_len;
6198 m = m->m_next;
6199 }
6200 /* should have been allocated with enough space at ip6_sooptmcopyin() */
6201 if (m != NULL) {
6202 panic("soopt_mcopyin");
6203 /* NOTREACHED */
6204 }
6205 return 0;
6206 }
6207
6208 /* copyout mbuf chain data into soopt */
6209 int
6210 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6211 {
6212 struct mbuf *m0 = m;
6213 size_t valsize = 0;
6214
6215 if (sopt->sopt_val == USER_ADDR_NULL) {
6216 return 0;
6217 }
6218 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6219 if (sopt->sopt_p != kernproc) {
6220 int error;
6221
6222 error = copyout(mtod(m, char *), sopt->sopt_val,
6223 m->m_len);
6224 if (error != 0) {
6225 m_freem(m0);
6226 return error;
6227 }
6228 } else {
6229 bcopy(mtod(m, char *),
6230 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6231 }
6232 sopt->sopt_valsize -= m->m_len;
6233 sopt->sopt_val += m->m_len;
6234 valsize += m->m_len;
6235 m = m->m_next;
6236 }
6237 if (m != NULL) {
6238 /* a large enough soopt buffer should be provided from user-land */
6239 m_freem(m0);
6240 return EINVAL;
6241 }
6242 sopt->sopt_valsize = valsize;
6243 return 0;
6244 }
6245
6246 void
6247 sohasoutofband(struct socket *so)
6248 {
6249 if (so->so_pgid < 0) {
6250 gsignal(-so->so_pgid, SIGURG);
6251 } else if (so->so_pgid > 0) {
6252 proc_signal(so->so_pgid, SIGURG);
6253 }
6254 selwakeup(&so->so_rcv.sb_sel);
6255 if (so->so_rcv.sb_flags & SB_KNOTE) {
6256 KNOTE(&so->so_rcv.sb_sel.si_note,
6257 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6258 }
6259 }
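
/*
 * Illustrative user-space counterpart (assumed, not part of this file):
 * a process that has claimed socket ownership with fcntl(F_SETOWN) is
 * signalled by sohasoutofband() above when out-of-band data arrives and
 * can then pull the urgent byte with MSG_OOB. The handler name is
 * hypothetical.
 *
 *	signal(SIGURG, on_urgent_data);
 *	fcntl(sock_fd, F_SETOWN, getpid());
 *	...
 *	char oob;
 *	(void) recv(sock_fd, &oob, 1, MSG_OOB);
 */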
6260
6261 int
6262 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6263 {
6264 #pragma unused(cred)
6265 struct proc *p = current_proc();
6266 int revents = 0;
6267
6268 socket_lock(so, 1);
6269 so_update_last_owner_locked(so, PROC_NULL);
6270 so_update_policy(so);
6271
6272 if (events & (POLLIN | POLLRDNORM)) {
6273 if (soreadable(so)) {
6274 revents |= events & (POLLIN | POLLRDNORM);
6275 }
6276 }
6277
6278 if (events & (POLLOUT | POLLWRNORM)) {
6279 if (sowriteable(so)) {
6280 revents |= events & (POLLOUT | POLLWRNORM);
6281 }
6282 }
6283
6284 if (events & (POLLPRI | POLLRDBAND)) {
6285 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6286 revents |= events & (POLLPRI | POLLRDBAND);
6287 }
6288 }
6289
6290 if (revents == 0) {
6291 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6292 /*
6293 * Darwin sets the flag first,
6294 * BSD calls selrecord first
6295 */
6296 so->so_rcv.sb_flags |= SB_SEL;
6297 selrecord(p, &so->so_rcv.sb_sel, wql);
6298 }
6299
6300 if (events & (POLLOUT | POLLWRNORM)) {
6301 /*
6302 * Darwin sets the flag first,
6303 * BSD calls selrecord first
6304 */
6305 so->so_snd.sb_flags |= SB_SEL;
6306 selrecord(p, &so->so_snd.sb_sel, wql);
6307 }
6308 }
6309
6310 socket_unlock(so, 1);
6311 return revents;
6312 }
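
/*
 * Illustrative user-space counterpart (assumed, not part of this file):
 * poll(2) on a socket descriptor funnels into sopoll() above, so POLLIN
 * is reported when the socket is readable and POLLPRI when the socket
 * is at the out-of-band mark.
 *
 *	struct pollfd pfd = { .fd = sock_fd, .events = POLLIN | POLLPRI };
 *	int n = poll(&pfd, 1, 1000);	// wait up to one second
 */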
6313
6314 int
6315 soo_kqfilter(struct fileproc *fp, struct knote *kn,
6316 struct kevent_internal_s *kev, vfs_context_t ctx)
6317 {
6318 #pragma unused(fp)
6319 #if !CONFIG_MACF_SOCKET
6320 #pragma unused(ctx)
6321 #endif /* !CONFIG_MACF_SOCKET */
6322 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6323 int result;
6324
6325 socket_lock(so, 1);
6326 so_update_last_owner_locked(so, PROC_NULL);
6327 so_update_policy(so);
6328
6329 #if CONFIG_MACF_SOCKET
6330 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6331 kn, so) != 0) {
6332 socket_unlock(so, 1);
6333 kn->kn_flags = EV_ERROR;
6334 kn->kn_data = EPERM;
6335 return 0;
6336 }
6337 #endif /* CONFIG_MACF_SOCKET */
6338
6339 switch (kn->kn_filter) {
6340 case EVFILT_READ:
6341 kn->kn_filtid = EVFILTID_SOREAD;
6342 break;
6343 case EVFILT_WRITE:
6344 kn->kn_filtid = EVFILTID_SOWRITE;
6345 break;
6346 case EVFILT_SOCK:
6347 kn->kn_filtid = EVFILTID_SCK;
6348 break;
6349 case EVFILT_EXCEPT:
6350 kn->kn_filtid = EVFILTID_SOEXCEPT;
6351 break;
6352 default:
6353 socket_unlock(so, 1);
6354 kn->kn_flags = EV_ERROR;
6355 kn->kn_data = EINVAL;
6356 return 0;
6357 }
6358
6359 /*
6360 * call the appropriate sub-filter attach
6361 * with the socket still locked
6362 */
6363 result = knote_fops(kn)->f_attach(kn, kev);
6364
6365 socket_unlock(so, 1);
6366
6367 return result;
6368 }
6369
6370 static int
6371 filt_soread_common(struct knote *kn, struct socket *so)
6372 {
6373 if (so->so_options & SO_ACCEPTCONN) {
6374 int is_not_empty;
6375
6376 /*
6377 * Radar 6615193: handle the listen case dynamically for the
6378 * kqueue read filter. This allows listen() to be called after
6379 * registering the kqueue EVFILT_READ.
6380 */
6381
6382 kn->kn_data = so->so_qlen;
6383 is_not_empty = !TAILQ_EMPTY(&so->so_comp);
6384
6385 return is_not_empty;
6386 }
6387
6388 /* socket isn't a listener */
6389 /*
6390 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6391 * the bytes of protocol data. We therefore exclude any
6392 * control bytes.
6393 */
6394 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6395
6396 if (kn->kn_sfflags & NOTE_OOB) {
6397 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6398 kn->kn_fflags |= NOTE_OOB;
6399 kn->kn_data -= so->so_oobmark;
6400 return 1;
6401 }
6402 }
6403
6404 if ((so->so_state & SS_CANTRCVMORE)
6405 #if CONTENT_FILTER
6406 && cfil_sock_data_pending(&so->so_rcv) == 0
6407 #endif /* CONTENT_FILTER */
6408 ) {
6409 kn->kn_flags |= EV_EOF;
6410 kn->kn_fflags = so->so_error;
6411 return 1;
6412 }
6413
6414 if (so->so_error) { /* temporary udp error */
6415 return 1;
6416 }
6417
6418 int64_t lowwat = so->so_rcv.sb_lowat;
6419 /*
6420 * Ensure that when NOTE_LOWAT is used, the derived
6421 * low water mark is bounded by socket's rcv buf's
6422 * high and low water mark values.
6423 */
6424 if (kn->kn_sfflags & NOTE_LOWAT) {
6425 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6426 lowwat = so->so_rcv.sb_hiwat;
6427 } else if (kn->kn_sdata > lowwat) {
6428 lowwat = kn->kn_sdata;
6429 }
6430 }
6431
6432 /*
6433 * The order below is important. Since NOTE_LOWAT
6434 * overrides sb_lowat, check for NOTE_LOWAT case
6435 * first.
6436 */
6437 if (kn->kn_sfflags & NOTE_LOWAT) {
6438 return kn->kn_data >= lowwat;
6439 }
6440
6441 return so->so_rcv.sb_cc >= lowwat;
6442 }
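
/*
 * Illustrative user-space counterpart (assumed, not part of this file):
 * registering EVFILT_READ with NOTE_LOWAT asks filt_soread_common()
 * above to fire only once the requested number of bytes of protocol
 * data is buffered; the requested value is clamped to the receive
 * buffer's high-water mark.
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 1024, NULL);
 *	(void) kevent(kq_fd, &kev, 1, NULL, 0, NULL);
 */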
6443
6444 static int
6445 filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6446 {
6447 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6448
6449 /* socket locked */
6450
6451 /*
6452 * If the caller explicitly asked for OOB results (e.g. poll())
6453 * from EVFILT_READ, then save that off in the hookid field
6454 * and reserve the kn_flags EV_OOBAND bit for output only.
6455 */
6456 if (kn->kn_filter == EVFILT_READ &&
6457 kn->kn_flags & EV_OOBAND) {
6458 kn->kn_flags &= ~EV_OOBAND;
6459 kn->kn_hookid = EV_OOBAND;
6460 } else {
6461 kn->kn_hookid = 0;
6462 }
6463 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6464 so->so_rcv.sb_flags |= SB_KNOTE;
6465 }
6466
6467 /* indicate if the event has already fired */
6468 return filt_soread_common(kn, so);
6469 }
6470
6471 static void
6472 filt_sordetach(struct knote *kn)
6473 {
6474 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6475
6476 socket_lock(so, 1);
6477 if (so->so_rcv.sb_flags & SB_KNOTE) {
6478 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6479 so->so_rcv.sb_flags &= ~SB_KNOTE;
6480 }
6481 }
6482 socket_unlock(so, 1);
6483 }
6484
6485 /*ARGSUSED*/
6486 static int
6487 filt_soread(struct knote *kn, long hint)
6488 {
6489 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6490 int retval;
6491
6492 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6493 socket_lock(so, 1);
6494 }
6495
6496 retval = filt_soread_common(kn, so);
6497
6498 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6499 socket_unlock(so, 1);
6500 }
6501
6502 return retval;
6503 }
6504
6505 static int
6506 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6507 {
6508 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6509 int retval;
6510
6511 socket_lock(so, 1);
6512
6513 /* save off the new input fflags and data */
6514 kn->kn_sfflags = kev->fflags;
6515 kn->kn_sdata = kev->data;
6516
6517 /* determine if changes result in fired events */
6518 retval = filt_soread_common(kn, so);
6519
6520 socket_unlock(so, 1);
6521
6522 return retval;
6523 }
6524
6525 static int
6526 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6527 {
6528 #pragma unused(data)
6529 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6530 int retval;
6531
6532 socket_lock(so, 1);
6533 retval = filt_soread_common(kn, so);
6534 if (retval) {
6535 *kev = kn->kn_kevent;
6536 if (kn->kn_flags & EV_CLEAR) {
6537 kn->kn_fflags = 0;
6538 kn->kn_data = 0;
6539 }
6540 }
6541 socket_unlock(so, 1);
6542
6543 return retval;
6544 }
6545
6546 int
6547 so_wait_for_if_feedback(struct socket *so)
6548 {
6549 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6550 (so->so_state & SS_ISCONNECTED)) {
6551 struct inpcb *inp = sotoinpcb(so);
6552 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6553 return 1;
6554 }
6555 }
6556 return 0;
6557 }
6558
6559 static int
6560 filt_sowrite_common(struct knote *kn, struct socket *so)
6561 {
6562 int ret = 0;
6563
6564 kn->kn_data = sbspace(&so->so_snd);
6565 if (so->so_state & SS_CANTSENDMORE) {
6566 kn->kn_flags |= EV_EOF;
6567 kn->kn_fflags = so->so_error;
6568 return 1;
6569 }
6570 if (so->so_error) { /* temporary udp error */
6571 return 1;
6572 }
6573 if (!socanwrite(so)) {
6574 return 0;
6575 }
6576 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6577 return 1;
6578 }
6579 int64_t lowwat = so->so_snd.sb_lowat;
6580 if (kn->kn_sfflags & NOTE_LOWAT) {
6581 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6582 lowwat = so->so_snd.sb_hiwat;
6583 } else if (kn->kn_sdata > lowwat) {
6584 lowwat = kn->kn_sdata;
6585 }
6586 }
6587 if (kn->kn_data >= lowwat) {
6588 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6589 #if (DEBUG || DEVELOPMENT)
6590 && so_notsent_lowat_check == 1
6591 #endif /* DEBUG || DEVELOPMENT */
6592 ) {
6593 if ((SOCK_DOM(so) == PF_INET ||
6594 SOCK_DOM(so) == PF_INET6) &&
6595 so->so_type == SOCK_STREAM) {
6596 ret = tcp_notsent_lowat_check(so);
6597 }
6598 #if MPTCP
6599 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6600 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6601 ret = mptcp_notsent_lowat_check(so);
6602 }
6603 #endif /* MPTCP */
6604 else {
6605 return 1;
6606 }
6607 } else {
6608 ret = 1;
6609 }
6610 }
6611 if (so_wait_for_if_feedback(so)) {
6612 ret = 0;
6613 }
6614 return ret;
6615 }
6616
6617 static int
6618 filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6619 {
6620 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6621
6622 /* socket locked */
6623 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6624 so->so_snd.sb_flags |= SB_KNOTE;
6625 }
6626
6627 /* determine if the event has already fired */
6628 return filt_sowrite_common(kn, so);
6629 }
6630
6631 static void
6632 filt_sowdetach(struct knote *kn)
6633 {
6634 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6635 socket_lock(so, 1);
6636
6637 if (so->so_snd.sb_flags & SB_KNOTE) {
6638 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6639 so->so_snd.sb_flags &= ~SB_KNOTE;
6640 }
6641 }
6642 socket_unlock(so, 1);
6643 }
6644
6645 /*ARGSUSED*/
6646 static int
6647 filt_sowrite(struct knote *kn, long hint)
6648 {
6649 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6650 int ret;
6651
6652 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6653 socket_lock(so, 1);
6654 }
6655
6656 ret = filt_sowrite_common(kn, so);
6657
6658 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6659 socket_unlock(so, 1);
6660 }
6661
6662 return ret;
6663 }
6664
6665 static int
6666 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6667 {
6668 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6669 int ret;
6670
6671 socket_lock(so, 1);
6672
6673 /* save off the new input fflags and data */
6674 kn->kn_sfflags = kev->fflags;
6675 kn->kn_sdata = kev->data;
6676
6677 /* determine if these changes result in a triggered event */
6678 ret = filt_sowrite_common(kn, so);
6679
6680 socket_unlock(so, 1);
6681
6682 return ret;
6683 }
6684
6685 static int
6686 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6687 {
6688 #pragma unused(data)
6689 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6690 int ret;
6691
6692 socket_lock(so, 1);
6693 ret = filt_sowrite_common(kn, so);
6694 if (ret) {
6695 *kev = kn->kn_kevent;
6696 if (kn->kn_flags & EV_CLEAR) {
6697 kn->kn_fflags = 0;
6698 kn->kn_data = 0;
6699 }
6700 }
6701 socket_unlock(so, 1);
6702 return ret;
6703 }
6704
6705 static int
6706 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6707 {
6708 int ret = 0;
6709 uint32_t level_trigger = 0;
6710
6711 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6712 kn->kn_fflags |= NOTE_CONNRESET;
6713 }
6714 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6715 kn->kn_fflags |= NOTE_TIMEOUT;
6716 }
6717 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6718 kn->kn_fflags |= NOTE_NOSRCADDR;
6719 }
6720 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6721 kn->kn_fflags |= NOTE_IFDENIED;
6722 }
6723 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6724 kn->kn_fflags |= NOTE_KEEPALIVE;
6725 }
6726 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6727 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6728 }
6729 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6730 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6731 }
6732 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6733 (so->so_state & SS_ISCONNECTED)) {
6734 kn->kn_fflags |= NOTE_CONNECTED;
6735 level_trigger |= NOTE_CONNECTED;
6736 }
6737 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6738 (so->so_state & SS_ISDISCONNECTED)) {
6739 kn->kn_fflags |= NOTE_DISCONNECTED;
6740 level_trigger |= NOTE_DISCONNECTED;
6741 }
6742 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6743 if (so->so_proto != NULL &&
6744 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6745 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6746 }
6747 }
6748
6749 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6750 tcp_notify_ack_active(so)) {
6751 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6752 }
6753
6754 if ((so->so_state & SS_CANTRCVMORE)
6755 #if CONTENT_FILTER
6756 && cfil_sock_data_pending(&so->so_rcv) == 0
6757 #endif /* CONTENT_FILTER */
6758 ) {
6759 kn->kn_fflags |= NOTE_READCLOSED;
6760 level_trigger |= NOTE_READCLOSED;
6761 }
6762
6763 if (so->so_state & SS_CANTSENDMORE) {
6764 kn->kn_fflags |= NOTE_WRITECLOSED;
6765 level_trigger |= NOTE_WRITECLOSED;
6766 }
6767
6768 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6769 (so->so_flags & SOF_SUSPENDED)) {
6770 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6771
6772 /* If resume event was delivered before, reset it */
6773 kn->kn_hookid &= ~NOTE_RESUME;
6774
6775 kn->kn_fflags |= NOTE_SUSPEND;
6776 level_trigger |= NOTE_SUSPEND;
6777 }
6778
6779 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6780 (so->so_flags & SOF_SUSPENDED) == 0) {
6781 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6782
6783 /* If suspend event was delivered before, reset it */
6784 kn->kn_hookid &= ~NOTE_SUSPEND;
6785
6786 kn->kn_fflags |= NOTE_RESUME;
6787 level_trigger |= NOTE_RESUME;
6788 }
6789
6790 if (so->so_error != 0) {
6791 ret = 1;
6792 kn->kn_data = so->so_error;
6793 kn->kn_flags |= EV_EOF;
6794 } else {
6795 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6796 }
6797
6798 /* Reset any events that are not requested on this knote */
6799 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6800 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6801
6802 /* Find the level triggered events that are already delivered */
6803 level_trigger &= kn->kn_hookid;
6804 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6805
6806 /* Do not deliver level triggered events more than once */
6807 if ((kn->kn_fflags & ~level_trigger) != 0) {
6808 ret = 1;
6809 }
6810
6811 return ret;
6812 }
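
/*
 * Illustrative user-space counterpart (assumed, not part of this file):
 * a client interested in the socket-level events decoded above would
 * register EVFILT_SOCK with the note flags it cares about; only the
 * requested fflags are reported back, and level-triggered events such
 * as NOTE_CONNECTED are delivered once per state change.
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock_fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_READCLOSED, 0, NULL);
 *	(void) kevent(kq_fd, &kev, 1, NULL, 0, NULL);
 */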
6813
6814 static int
6815 filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6816 {
6817 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6818
6819 /* socket locked */
6820 kn->kn_hookid = 0;
6821 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6822 so->so_flags |= SOF_KNOTE;
6823 }
6824
6825 /* determine if the event has already fired */
6826 return filt_sockev_common(kn, so, 0);
6827 }
6828
6829 static void
6830 filt_sockdetach(struct knote *kn)
6831 {
6832 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6833 socket_lock(so, 1);
6834
6835 if ((so->so_flags & SOF_KNOTE) != 0) {
6836 if (KNOTE_DETACH(&so->so_klist, kn)) {
6837 so->so_flags &= ~SOF_KNOTE;
6838 }
6839 }
6840 socket_unlock(so, 1);
6841 }
6842
6843 static int
6844 filt_sockev(struct knote *kn, long hint)
6845 {
6846 int ret = 0, locked = 0;
6847 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6848 long ev_hint = (hint & SO_FILT_HINT_EV);
6849
6850 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6851 socket_lock(so, 1);
6852 locked = 1;
6853 }
6854
6855 ret = filt_sockev_common(kn, so, ev_hint);
6856
6857 if (locked) {
6858 socket_unlock(so, 1);
6859 }
6860
6861 return ret;
6862 }
6863
6864
6865
6866 /*
6867 * filt_socktouch - update event state
6868 */
6869 static int
6870 filt_socktouch(
6871 struct knote *kn,
6872 struct kevent_internal_s *kev)
6873 {
6874 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6875 uint32_t changed_flags;
6876 int ret;
6877
6878 socket_lock(so, 1);
6879
6880 /* save off the [result] data and fflags */
6881 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6882
6883 /* save off the new input fflags and data */
6884 kn->kn_sfflags = kev->fflags;
6885 kn->kn_sdata = kev->data;
6886
6887 /* restrict the current results to the (smaller?) set of new interest */
6888 /*
6889 * For compatibility with previous implementations, we leave kn_fflags
6890 * as they were before.
6891 */
6892 //kn->kn_fflags &= kev->fflags;
6893
6894 /*
6895 * Since we keep track of events that have already been
6896 * delivered, the state related to any of those events that
6897 * are no longer requested can be reset
6898 */
6899 kn->kn_hookid &=
6900 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6901
6902 /* determine if we have events to deliver */
6903 ret = filt_sockev_common(kn, so, 0);
6904
6905 socket_unlock(so, 1);
6906
6907 return ret;
6908 }
6909
6910 /*
6911 * filt_sockprocess - query event fired state and return data
6912 */
6913 static int
6914 filt_sockprocess(
6915 struct knote *kn,
6916 struct filt_process_s *data,
6917 struct kevent_internal_s *kev)
6918 {
6919 #pragma unused(data)
6920
6921 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6922 int ret = 0;
6923
6924 socket_lock(so, 1);
6925
6926 ret = filt_sockev_common(kn, so, 0);
6927 if (ret) {
6928 *kev = kn->kn_kevent;
6929
6930 /*
6931 * Store the state of the events being delivered. This
6932 * state can be used to deliver level triggered events
6933 * at least once and still avoid waking up the application
6934 * multiple times as long as the event is active.
6935 */
6936 if (kn->kn_fflags != 0) {
6937 kn->kn_hookid |= (kn->kn_fflags &
6938 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6939 }
6940
6941 /*
6942 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6943 * only one of them, and remember which one was delivered
6944 * last
6945 */
6946 if (kn->kn_fflags & NOTE_SUSPEND) {
6947 kn->kn_hookid &= ~NOTE_RESUME;
6948 }
6949 if (kn->kn_fflags & NOTE_RESUME) {
6950 kn->kn_hookid &= ~NOTE_SUSPEND;
6951 }
6952
6953 if (kn->kn_flags & EV_CLEAR) {
6954 kn->kn_data = 0;
6955 kn->kn_fflags = 0;
6956 }
6957 }
6958
6959 socket_unlock(so, 1);
6960
6961 return ret;
6962 }
6963
6964 void
6965 get_sockev_state(struct socket *so, u_int32_t *statep)
6966 {
6967 u_int32_t state = *(statep);
6968
6969 /*
6970 * If the state variable is already used by a previous event,
6971 * reset it.
6972 */
6973 if (state != 0) {
6974 return;
6975 }
6976
6977 if (so->so_state & SS_ISCONNECTED) {
6978 state |= SOCKEV_CONNECTED;
6979 } else {
6980 state &= ~(SOCKEV_CONNECTED);
6981 }
6982 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6983 *(statep) = state;
6984 }
6985
6986 #define SO_LOCK_HISTORY_STR_LEN \
6987 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6988
6989 __private_extern__ const char *
6990 solockhistory_nr(struct socket *so)
6991 {
6992 size_t n = 0;
6993 int i;
6994 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6995
6996 bzero(lock_history_str, sizeof(lock_history_str));
6997 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6998 n += snprintf(lock_history_str + n,
6999 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7000 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7001 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7002 }
7003 return lock_history_str;
7004 }
7005
7006 void
7007 socket_lock(struct socket *so, int refcount)
7008 {
7009 void *lr_saved;
7010
7011 lr_saved = __builtin_return_address(0);
7012
7013 if (so->so_proto->pr_lock) {
7014 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7015 } else {
7016 #ifdef MORE_LOCKING_DEBUG
7017 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7018 LCK_MTX_ASSERT_NOTOWNED);
7019 #endif
7020 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7021 if (refcount) {
7022 so->so_usecount++;
7023 }
7024 so->lock_lr[so->next_lock_lr] = lr_saved;
7025 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7026 }
7027 }
7028
7029 void
7030 socket_lock_assert_owned(struct socket *so)
7031 {
7032 lck_mtx_t *mutex_held;
7033
7034 if (so->so_proto->pr_getlock != NULL) {
7035 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7036 } else {
7037 mutex_held = so->so_proto->pr_domain->dom_mtx;
7038 }
7039
7040 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7041 }
7042
7043 int
7044 socket_try_lock(struct socket *so)
7045 {
7046 lck_mtx_t *mtx;
7047
7048 if (so->so_proto->pr_getlock != NULL) {
7049 mtx = (*so->so_proto->pr_getlock)(so, 0);
7050 } else {
7051 mtx = so->so_proto->pr_domain->dom_mtx;
7052 }
7053
7054 return lck_mtx_try_lock(mtx);
7055 }
7056
7057 void
7058 socket_unlock(struct socket *so, int refcount)
7059 {
7060 void *lr_saved;
7061 lck_mtx_t *mutex_held;
7062
7063 lr_saved = __builtin_return_address(0);
7064
7065 if (so->so_proto == NULL) {
7066 panic("%s: null so_proto so=%p\n", __func__, so);
7067 /* NOTREACHED */
7068 }
7069
7070 if (so && so->so_proto->pr_unlock) {
7071 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7072 } else {
7073 mutex_held = so->so_proto->pr_domain->dom_mtx;
7074 #ifdef MORE_LOCKING_DEBUG
7075 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7076 #endif
7077 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7078 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7079
7080 if (refcount) {
7081 if (so->so_usecount <= 0) {
7082 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7083 "lrh=%s", __func__, so->so_usecount, so,
7084 SOCK_DOM(so), so->so_type,
7085 SOCK_PROTO(so), solockhistory_nr(so));
7086 /* NOTREACHED */
7087 }
7088
7089 so->so_usecount--;
7090 if (so->so_usecount == 0) {
7091 sofreelastref(so, 1);
7092 }
7093 }
7094 lck_mtx_unlock(mutex_held);
7095 }
7096 }
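
/*
 * Minimal sketch of the lock/refcount pattern used throughout this
 * file (the helper name is hypothetical): take the lock together with
 * a use count, operate on the socket, then drop both; the last unlock
 * of a dead socket releases it via sofreelastref().
 *
 *	static void
 *	example_with_socket(struct socket *so)
 *	{
 *		socket_lock(so, 1);	// lock and bump so_usecount
 *		// ... operate on the locked socket ...
 *		socket_unlock(so, 1);	// drop so_usecount, unlock
 *	}
 */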
7097
7098 /* Called with socket locked, will unlock socket */
7099 void
7100 sofree(struct socket *so)
7101 {
7102 lck_mtx_t *mutex_held;
7103
7104 if (so->so_proto->pr_getlock != NULL) {
7105 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7106 } else {
7107 mutex_held = so->so_proto->pr_domain->dom_mtx;
7108 }
7109 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7110
7111 sofreelastref(so, 0);
7112 }
7113
7114 void
7115 soreference(struct socket *so)
7116 {
7117 socket_lock(so, 1); /* locks & takes one reference on the socket */
7118 socket_unlock(so, 0); /* unlock only */
7119 }
7120
7121 void
7122 sodereference(struct socket *so)
7123 {
7124 socket_lock(so, 0);
7125 socket_unlock(so, 1);
7126 }
7127
7128 /*
7129 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7130 * possibility of using jumbo clusters. The caller must hold
7131 * the socket lock.
7132 */
7133 void
7134 somultipages(struct socket *so, boolean_t set)
7135 {
7136 if (set) {
7137 so->so_flags |= SOF_MULTIPAGES;
7138 } else {
7139 so->so_flags &= ~SOF_MULTIPAGES;
7140 }
7141 }
7142
7143 void
7144 soif2kcl(struct socket *so, boolean_t set)
7145 {
7146 if (set) {
7147 so->so_flags1 |= SOF1_IF_2KCL;
7148 } else {
7149 so->so_flags1 &= ~SOF1_IF_2KCL;
7150 }
7151 }
7152
7153 int
7154 so_isdstlocal(struct socket *so)
7155 {
7156 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7157
7158 if (SOCK_DOM(so) == PF_INET) {
7159 return inaddr_local(inp->inp_faddr);
7160 } else if (SOCK_DOM(so) == PF_INET6) {
7161 return in6addr_local(&inp->in6p_faddr);
7162 }
7163
7164 return 0;
7165 }
7166
7167 int
7168 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7169 {
7170 struct sockbuf *rcv, *snd;
7171 int err = 0, defunct;
7172
7173 rcv = &so->so_rcv;
7174 snd = &so->so_snd;
7175
7176 defunct = (so->so_flags & SOF_DEFUNCT);
7177 if (defunct) {
7178 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7179 panic("%s: SB_DROP not set", __func__);
7180 /* NOTREACHED */
7181 }
7182 goto done;
7183 }
7184
7185 if (so->so_flags & SOF_NODEFUNCT) {
7186 if (noforce) {
7187 err = EOPNOTSUPP;
7188 if (p != PROC_NULL) {
7189 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7190 "name %s level %d) so 0x%llx [%d,%d] "
7191 "is not eligible for defunct "
7192 "(%d)\n", __func__, proc_selfpid(),
7193 proc_best_name(current_proc()), proc_pid(p),
7194 proc_best_name(p), level,
7195 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7196 SOCK_DOM(so), SOCK_TYPE(so), err);
7197 }
7198 return err;
7199 }
7200 so->so_flags &= ~SOF_NODEFUNCT;
7201 if (p != PROC_NULL) {
7202 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7203 "name %s level %d) so 0x%llx [%d,%d] "
7204 "defunct by force "
7205 "(%d)\n", __func__, proc_selfpid(),
7206 proc_best_name(current_proc()), proc_pid(p),
7207 proc_best_name(p), level,
7208 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7209 SOCK_DOM(so), SOCK_TYPE(so), err);
7210 }
7211 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7212 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7213 struct ifnet *ifp = inp->inp_last_outifp;
7214
7215 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7216 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7217 } else if (so->so_flags & SOF_DELEGATED) {
7218 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7219 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7220 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7221 } else if (noforce && p != PROC_NULL) {
7222 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7223
7224 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7225 so->so_extended_bk_start = net_uptime();
7226 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7227
7228 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7229
7230 err = EOPNOTSUPP;
7231 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7232 "name %s level %d) so 0x%llx [%d,%d] "
7233 "extend bk idle "
7234 "(%d)\n", __func__, proc_selfpid(),
7235 proc_best_name(current_proc()), proc_pid(p),
7236 proc_best_name(p), level,
7237 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7238 SOCK_DOM(so), SOCK_TYPE(so), err);
7239 return err;
7240 } else {
7241 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7242 }
7243 }
7244
7245 so->so_flags |= SOF_DEFUNCT;
7246
7247 /* Prevent further data from being appended to the socket buffers */
7248 snd->sb_flags |= SB_DROP;
7249 rcv->sb_flags |= SB_DROP;
7250
7251 /* Flush any existing data in the socket buffers */
7252 if (rcv->sb_cc != 0) {
7253 rcv->sb_flags &= ~SB_SEL;
7254 selthreadclear(&rcv->sb_sel);
7255 sbrelease(rcv);
7256 }
7257 if (snd->sb_cc != 0) {
7258 snd->sb_flags &= ~SB_SEL;
7259 selthreadclear(&snd->sb_sel);
7260 sbrelease(snd);
7261 }
7262
7263 done:
7264 if (p != PROC_NULL) {
7265 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7266 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7267 proc_selfpid(), proc_best_name(current_proc()),
7268 proc_pid(p), proc_best_name(p), level,
7269 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7270 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7271 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7272 " extbkidle" : "");
7273 }
7274 return err;
7275 }
7276
7277 int
7278 sodefunct(struct proc *p, struct socket *so, int level)
7279 {
7280 struct sockbuf *rcv, *snd;
7281
7282 if (!(so->so_flags & SOF_DEFUNCT)) {
7283 panic("%s improperly called", __func__);
7284 /* NOTREACHED */
7285 }
7286 if (so->so_state & SS_DEFUNCT) {
7287 goto done;
7288 }
7289
7290 rcv = &so->so_rcv;
7291 snd = &so->so_snd;
7292
7293 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7294 char s[MAX_IPv6_STR_LEN];
7295 char d[MAX_IPv6_STR_LEN];
7296 struct inpcb *inp = sotoinpcb(so);
7297
7298 if (p != PROC_NULL) {
7299 SODEFUNCTLOG(
7300 "%s[%d, %s]: (target pid %d name %s level %d) "
7301 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7302 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7303 " snd_fl 0x%x]\n", __func__,
7304 proc_selfpid(), proc_best_name(current_proc()),
7305 proc_pid(p), proc_best_name(p), level,
7306 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7307 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7308 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7309 (void *)&inp->inp_laddr.s_addr :
7310 (void *)&inp->in6p_laddr),
7311 s, sizeof(s)), ntohs(inp->in6p_lport),
7312 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7313 (void *)&inp->inp_faddr.s_addr :
7314 (void *)&inp->in6p_faddr,
7315 d, sizeof(d)), ntohs(inp->in6p_fport),
7316 (uint32_t)rcv->sb_sel.si_flags,
7317 (uint32_t)snd->sb_sel.si_flags,
7318 rcv->sb_flags, snd->sb_flags);
7319 }
7320 } else if (p != PROC_NULL) {
7321 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7322 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7323 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7324 proc_selfpid(), proc_best_name(current_proc()),
7325 proc_pid(p), proc_best_name(p), level,
7326 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7327 SOCK_DOM(so), SOCK_TYPE(so),
7328 (uint32_t)rcv->sb_sel.si_flags,
7329 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7330 snd->sb_flags);
7331 }
7332
7333 /*
7334 * Unwedge threads blocked on sbwait() and sb_lock().
7335 */
7336 sbwakeup(rcv);
7337 sbwakeup(snd);
7338
7339 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7340 if (rcv->sb_flags & SB_LOCK) {
7341 sbunlock(rcv, TRUE); /* keep socket locked */
7342 }
7343 if (snd->sb_flags & SB_LOCK) {
7344 sbunlock(snd, TRUE); /* keep socket locked */
7345 }
7346 /*
7347 * Flush the buffers and disconnect. We explicitly call shutdown
7348 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7349 * states are set for the socket. This would also flush out data
7350 * hanging off the receive list of this socket.
7351 */
7352 (void) soshutdownlock_final(so, SHUT_RD);
7353 (void) soshutdownlock_final(so, SHUT_WR);
7354 (void) sodisconnectlocked(so);
7355
7356 /*
7357 * Explicitly handle connectionless-protocol disconnection
7358 * and release any remaining data in the socket buffers.
7359 */
7360 if (!(so->so_state & SS_ISDISCONNECTED)) {
7361 (void) soisdisconnected(so);
7362 }
7363
7364 if (so->so_error == 0) {
7365 so->so_error = EBADF;
7366 }
7367
7368 if (rcv->sb_cc != 0) {
7369 rcv->sb_flags &= ~SB_SEL;
7370 selthreadclear(&rcv->sb_sel);
7371 sbrelease(rcv);
7372 }
7373 if (snd->sb_cc != 0) {
7374 snd->sb_flags &= ~SB_SEL;
7375 selthreadclear(&snd->sb_sel);
7376 sbrelease(snd);
7377 }
7378 so->so_state |= SS_DEFUNCT;
7379 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7380
7381 done:
7382 return 0;
7383 }
7384
7385 int
7386 soresume(struct proc *p, struct socket *so, int locked)
7387 {
7388 if (locked == 0) {
7389 socket_lock(so, 1);
7390 }
7391
7392 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7393 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7394 "[%d,%d] resumed from bk idle\n",
7395 __func__, proc_selfpid(), proc_best_name(current_proc()),
7396 proc_pid(p), proc_best_name(p),
7397 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7398 SOCK_DOM(so), SOCK_TYPE(so));
7399
7400 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7401 so->so_extended_bk_start = 0;
7402 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7403
7404 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7405 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7406 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7407 }
7408 if (locked == 0) {
7409 socket_unlock(so, 1);
7410 }
7411
7412 return 0;
7413 }
7414
7415 /*
7416 * Does not attempt to account for sockets that are delegated from
7417 * the current process
7418 */
7419 int
7420 so_set_extended_bk_idle(struct socket *so, int optval)
7421 {
7422 int error = 0;
7423
7424 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7425 SOCK_PROTO(so) != IPPROTO_TCP) {
7426 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7427 error = EOPNOTSUPP;
7428 } else if (optval == 0) {
7429 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7430
7431 soresume(current_proc(), so, 1);
7432 } else {
7433 struct proc *p = current_proc();
7434 int i;
7435 struct filedesc *fdp;
7436 int count = 0;
7437
7438 /*
7439 * Unlock the socket to avoid a lock ordering issue with
7440 * the proc fd table lock
7441 */
7442 socket_unlock(so, 0);
7443
7444 proc_fdlock(p);
7445
7446 fdp = p->p_fd;
7447 for (i = 0; i < fdp->fd_nfiles; i++) {
7448 struct fileproc *fp = fdp->fd_ofiles[i];
7449 struct socket *so2;
7450
7451 if (fp == NULL ||
7452 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7453 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7454 continue;
7455 }
7456
7457 so2 = (struct socket *)fp->f_fglob->fg_data;
7458 if (so != so2 &&
7459 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7460 count++;
7461 }
7462 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7463 break;
7464 }
7465 }
7466 proc_fdunlock(p);
7467
7468 socket_lock(so, 0);
7469
7470 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7471 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7472 error = EBUSY;
7473 } else if (so->so_flags & SOF_DELEGATED) {
7474 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7475 error = EBUSY;
7476 } else {
7477 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7478 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7479 }
7480 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7481 "%s marked for extended bk idle\n",
7482 __func__, proc_selfpid(), proc_best_name(current_proc()),
7483 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7484 SOCK_DOM(so), SOCK_TYPE(so),
7485 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7486 "is" : "not");
7487 }
7488
7489 return error;
7490 }
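
/*
 * Illustrative user-space sketch (assumed, not part of this file): an
 * eligible TCP socket opts into extended background idle time with the
 * SO_EXTENDED_BK_IDLE socket option, which lands in
 * so_set_extended_bk_idle() above; passing 0 clears the request.
 *
 *	int one = 1;
 *	(void) setsockopt(sock_fd, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
 *	    &one, sizeof(one));
 */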
7491
7492 static void
7493 so_stop_extended_bk_idle(struct socket *so)
7494 {
7495 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7496 so->so_extended_bk_start = 0;
7497
7498 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7499 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7500 /*
7501 * Force defunct
7502 */
7503 sosetdefunct(current_proc(), so,
7504 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7505 if (so->so_flags & SOF_DEFUNCT) {
7506 sodefunct(current_proc(), so,
7507 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7508 }
7509 }
7510
7511 void
7512 so_drain_extended_bk_idle(struct socket *so)
7513 {
7514 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7515 /*
7516 * Only penalize sockets that have outstanding data
7517 */
7518 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7519 so_stop_extended_bk_idle(so);
7520
7521 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7522 }
7523 }
7524 }
7525
7526 /*
7527 * Return value tells whether the socket is still in extended background idle
7528 */
7529 int
7530 so_check_extended_bk_idle_time(struct socket *so)
7531 {
7532 int ret = 1;
7533
7534 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7535 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7536 __func__, proc_selfpid(), proc_best_name(current_proc()),
7537 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7538 SOCK_DOM(so), SOCK_TYPE(so));
7539 if (net_uptime() - so->so_extended_bk_start >
7540 soextbkidlestat.so_xbkidle_time) {
7541 so_stop_extended_bk_idle(so);
7542
7543 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7544
7545 ret = 0;
7546 } else {
7547 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7548
7549 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7550 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7551 }
7552 }
7553
7554 return ret;
7555 }
7556
7557 void
7558 resume_proc_sockets(proc_t p)
7559 {
7560 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7561 struct filedesc *fdp;
7562 int i;
7563
7564 proc_fdlock(p);
7565 fdp = p->p_fd;
7566 for (i = 0; i < fdp->fd_nfiles; i++) {
7567 struct fileproc *fp;
7568 struct socket *so;
7569
7570 fp = fdp->fd_ofiles[i];
7571 if (fp == NULL ||
7572 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7573 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7574 continue;
7575 }
7576
7577 so = (struct socket *)fp->f_fglob->fg_data;
7578 (void) soresume(p, so, 0);
7579 }
7580 proc_fdunlock(p);
7581
7582 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7583 }
7584 }
7585
7586 __private_extern__ int
7587 so_set_recv_anyif(struct socket *so, int optval)
7588 {
7589 int ret = 0;
7590
7591 #if INET6
7592 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7593 #else
7594 if (SOCK_DOM(so) == PF_INET) {
7595 #endif /* !INET6 */
7596 if (optval) {
7597 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7598 } else {
7599 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7600 }
7601 }
7602
7603
7604 return ret;
7605 }
7606
7607 __private_extern__ int
7608 so_get_recv_anyif(struct socket *so)
7609 {
7610 int ret = 0;
7611
7612 #if INET6
7613 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7614 #else
7615 if (SOCK_DOM(so) == PF_INET) {
7616 #endif /* !INET6 */
7617 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7618 }
7619
7620 return ret;
7621 }
7622
7623 int
7624 so_set_restrictions(struct socket *so, uint32_t vals)
7625 {
7626 int nocell_old, nocell_new;
7627 int noexpensive_old, noexpensive_new;
7628
7629 /*
7630 * Deny-type restrictions are trapdoors; once set they cannot be
7631 * unset for the lifetime of the socket. This allows them to be
7632 * issued by a framework on behalf of the application without
7633 * having to worry that they can be undone.
7634 *
7635 * Note here that socket-level restrictions override any protocol-
7636 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
7637 * socket restriction issued on the socket has a higher precedence
7638 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7639 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7640 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7641 */
7642 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7643 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7644 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7645 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7646 SO_RESTRICT_DENY_EXPENSIVE));
7647 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7648 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7649
7650 /* we can only set, not clear restrictions */
7651 if ((nocell_new - nocell_old) == 0 &&
7652 (noexpensive_new - noexpensive_old) == 0) {
7653 return 0;
7654 }
7655 #if INET6
7656 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7657 #else
7658 if (SOCK_DOM(so) == PF_INET) {
7659 #endif /* !INET6 */
7660 if (nocell_new - nocell_old != 0) {
7661 /*
7662 * if deny cellular is now set, do what's needed
7663 * for INPCB
7664 */
7665 inp_set_nocellular(sotoinpcb(so));
7666 }
7667 if (noexpensive_new - noexpensive_old != 0) {
7668 inp_set_noexpensive(sotoinpcb(so));
7669 }
7670 }
7671
7672 if (SOCK_DOM(so) == PF_MULTIPATH) {
7673 mptcp_set_restrictions(so);
7674 }
7675
7676 return 0;
7677 }
7678
7679 uint32_t
7680 so_get_restrictions(struct socket *so)
7681 {
7682 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7683 SO_RESTRICT_DENY_OUT |
7684 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7685 }
7686
7687 int
7688 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7689 {
7690 struct proc *ep = PROC_NULL;
7691 int error = 0;
7692
7693 /* pid 0 is reserved for kernel */
7694 if (epid == 0) {
7695 error = EINVAL;
7696 goto done;
7697 }
7698
7699 /*
7700 * If this is an in-kernel socket, prevent its delegate
7701 * association from changing unless the socket option is
7702 * coming from within the kernel itself.
7703 */
7704 if (so->last_pid == 0 && p != kernproc) {
7705 error = EACCES;
7706 goto done;
7707 }
7708
7709 /*
7710 * If this is issued by a process that's recorded as the
7711 * real owner of the socket, or if the pid is the same as
7712 * the process's own pid, then proceed. Otherwise ensure
7713 * that the issuing process has the necessary privileges.
7714 */
7715 if (epid != so->last_pid || epid != proc_pid(p)) {
7716 if ((error = priv_check_cred(kauth_cred_get(),
7717 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7718 error = EACCES;
7719 goto done;
7720 }
7721 }
7722
7723 /* Find the process that corresponds to the effective pid */
7724 if ((ep = proc_find(epid)) == PROC_NULL) {
7725 error = ESRCH;
7726 goto done;
7727 }
7728
7729 /*
7730 * If a process tries to delegate the socket to itself, then
7731 * there's really nothing to do; treat it as a way for the
7732 * delegate association to be cleared. Note that we check
7733 * the passed-in proc rather than calling proc_selfpid(),
7734 * as we need to check the process issuing the socket option
7735 * which could be kernproc. Given that we don't allow 0 for
7736 * effective pid, it means that a delegated in-kernel socket
7737 * stays delegated during its lifetime (which is probably OK.)
7738 */
7739 if (epid == proc_pid(p)) {
7740 so->so_flags &= ~SOF_DELEGATED;
7741 so->e_upid = 0;
7742 so->e_pid = 0;
7743 uuid_clear(so->e_uuid);
7744 } else {
7745 so->so_flags |= SOF_DELEGATED;
7746 so->e_upid = proc_uniqueid(ep);
7747 so->e_pid = proc_pid(ep);
7748 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7749 }
7750 done:
7751 if (error == 0 && net_io_policy_log) {
7752 uuid_string_t buf;
7753
7754 uuid_unparse(so->e_uuid, buf);
7755 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7756 "euuid %s%s\n", __func__, proc_name_address(p),
7757 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7758 SOCK_DOM(so), SOCK_TYPE(so),
7759 so->e_pid, proc_name_address(ep), buf,
7760 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7761 } else if (error != 0 && net_io_policy_log) {
7762 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7763 "ERROR (%d)\n", __func__, proc_name_address(p),
7764 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7765 SOCK_DOM(so), SOCK_TYPE(so),
7766 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7767 proc_name_address(ep), error);
7768 }
7769
7770 /* Update this socket's policy upon success */
7771 if (error == 0) {
7772 so->so_policy_gencnt *= -1;
7773 so_update_policy(so);
7774 #if NECP
7775 so_update_necp_policy(so, NULL, NULL);
7776 #endif /* NECP */
7777 }
7778
7779 if (ep != PROC_NULL) {
7780 proc_rele(ep);
7781 }
7782
7783 return error;
7784 }
7785
7786 int
7787 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7788 {
7789 uuid_string_t buf;
7790 uuid_t uuid;
7791 int error = 0;
7792
7793 /* UUID must not be all-zeroes (reserved for kernel) */
7794 if (uuid_is_null(euuid)) {
7795 error = EINVAL;
7796 goto done;
7797 }
7798
7799 /*
7800 * If this is an in-kernel socket, prevent its delegate
7801 * association from changing unless the socket option is
7802 * coming from within the kernel itself.
7803 */
7804 if (so->last_pid == 0 && p != kernproc) {
7805 error = EACCES;
7806 goto done;
7807 }
7808
7809 /* Get the UUID of the issuing process */
7810 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7811
7812 /*
7813 * If this is issued by a process that's recorded as the
7814 * real owner of the socket, or if the uuid is the same as
7815 * the process's own uuid, then proceed. Otherwise ensure
7816 * that the issuing process has the necessary privileges.
7817 */
7818 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7819 uuid_compare(euuid, uuid) != 0) {
7820 if ((error = priv_check_cred(kauth_cred_get(),
7821 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7822 error = EACCES;
7823 goto done;
7824 }
7825 }
7826
7827 /*
7828 * If a process tries to delegate the socket to itself, then
7829 * there's really nothing to do; treat it as a way for the
7830 * delegate association to be cleared. Note that we check
7831 * the uuid of the passed-in proc rather than that of the
7832 * current process, as we need to check the process issuing
7833 * the socket option which could be kernproc itself. Given
7834 * that we don't allow 0 for effective uuid, it means that
7835 * a delegated in-kernel socket stays delegated during its
7836 * lifetime (which is okay.)
7837 */
7838 if (uuid_compare(euuid, uuid) == 0) {
7839 so->so_flags &= ~SOF_DELEGATED;
7840 so->e_upid = 0;
7841 so->e_pid = 0;
7842 uuid_clear(so->e_uuid);
7843 } else {
7844 so->so_flags |= SOF_DELEGATED;
7845 /*
7846 * Unlike so_set_effective_pid(), we only have the UUID
7847 * here and the process ID is not known. Inherit the
7848 * real {pid,upid} of the socket.
7849 */
7850 so->e_upid = so->last_upid;
7851 so->e_pid = so->last_pid;
7852 uuid_copy(so->e_uuid, euuid);
7853 }
7854
7855 done:
7856 if (error == 0 && net_io_policy_log) {
7857 uuid_unparse(so->e_uuid, buf);
7858 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7859 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7860 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7861 SOCK_TYPE(so), so->e_pid, buf,
7862 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7863 } else if (error != 0 && net_io_policy_log) {
7864 uuid_unparse(euuid, buf);
7865 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7866 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7867 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7868 SOCK_TYPE(so), buf, error);
7869 }
7870
7871 /* Update this socket's policy upon success */
7872 if (error == 0) {
7873 so->so_policy_gencnt *= -1;
7874 so_update_policy(so);
7875 #if NECP
7876 so_update_necp_policy(so, NULL, NULL);
7877 #endif /* NECP */
7878 }
7879
7880 return error;
7881 }
7882
7883 void
7884 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7885 uint32_t ev_datalen)
7886 {
7887 struct kev_msg ev_msg;
7888
7889 /*
7890 * A netpolicy event always starts with a netpolicy_event_data
7891 * structure, but the caller can provide a longer event
7892 * structure to post, depending on the event code.
7893 */
7894 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
7895
7896 bzero(&ev_msg, sizeof(ev_msg));
7897 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7898 ev_msg.kev_class = KEV_NETWORK_CLASS;
7899 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7900 ev_msg.event_code = ev_code;
7901
7902 ev_msg.dv[0].data_ptr = ev_data;
7903 ev_msg.dv[0].data_length = ev_datalen;
7904
7905 kev_post_msg(&ev_msg);
7906 }
7907
7908 void
7909 socket_post_kev_msg(uint32_t ev_code,
7910 struct kev_socket_event_data *ev_data,
7911 uint32_t ev_datalen)
7912 {
7913 struct kev_msg ev_msg;
7914
7915 bzero(&ev_msg, sizeof(ev_msg));
7916 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7917 ev_msg.kev_class = KEV_NETWORK_CLASS;
7918 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7919 ev_msg.event_code = ev_code;
7920
7921 ev_msg.dv[0].data_ptr = ev_data;
7922 ev_msg.dv[0].data_length = ev_datalen;
7923
7924 kev_post_msg(&ev_msg);
7925 }
7926
7927 void
7928 socket_post_kev_msg_closed(struct socket *so)
7929 {
7930 struct kev_socket_closed ev;
7931 struct sockaddr *socksa = NULL, *peersa = NULL;
7932 int err;
7933 bzero(&ev, sizeof(ev));
7934 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7935 if (err == 0) {
7936 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7937 &peersa);
7938 if (err == 0) {
7939 memcpy(&ev.ev_data.kev_sockname, socksa,
7940 min(socksa->sa_len,
7941 sizeof(ev.ev_data.kev_sockname)));
7942 memcpy(&ev.ev_data.kev_peername, peersa,
7943 min(peersa->sa_len,
7944 sizeof(ev.ev_data.kev_peername)));
7945 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7946 &ev.ev_data, sizeof(ev));
7947 }
7948 }
7949 if (socksa != NULL) {
7950 FREE(socksa, M_SONAME);
7951 }
7952 if (peersa != NULL) {
7953 FREE(peersa, M_SONAME);
7954 }
7955 }