[apple/xnu.git] / bsd / kern / uipc_socket.c (xnu-6153.121.1)
1 /*
2 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif /* MAC */
129
130 #if MULTIPATH
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
134
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
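/*
 * For example (illustrative): ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16;
 * the alignment 'b' is assumed to be a power of two.
 */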
136
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
139 #else
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
141 #endif
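/*
 * In other words (illustrative): DEBUG and DEVELOPMENT kernels log the raw
 * pointer value, while release kernels pass it through VM_KERNEL_ADDRPERM()
 * so that kernel addresses are obfuscated before they reach logs.
 */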
142
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
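/*
 * Userland view of the two knobs above (an illustrative sketch; the
 * "kern.ipc.*" names are derived from the SYSCTL_INT(_kern_ipc, ...)
 * declarations and are read with the standard sysctlbyname(3) interface):
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int jcl = 0, ignore = 0;
	size_t len = sizeof(int);

	(void) sysctlbyname("kern.ipc.sosendjcl", &jcl, &len, NULL, 0);
	len = sizeof(int);
	(void) sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
	    &ignore, &len, NULL, 0);
	printf("sosendjcl=%d sosendjcl_ignore_capab=%d\n", jcl, ignore);
	return 0;
}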
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in a header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process.
346 * Set to zero to disable further setting of the option.
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof(socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof(struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477 lck_mtx_unlock(so_cache_mtx);
478
479 if (waitok) {
480 *so = (struct socket *)zalloc(so_cache_zone);
481 } else {
482 *so = (struct socket *)zalloc_noblock(so_cache_zone);
483 }
484
485 if (*so == NULL) {
486 return;
487 }
488
489 bzero((caddr_t)*so, sizeof(struct socket));
490
491 /*
492 * Define offsets for extra structures into our
493 * single block of memory. Align extra structures
494 * on longword boundaries.
495 */
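/*
 * Illustrative layout of the cached element carved up here (padding is
 * whatever ALIGN() adds; the two storage sizes come from
 * get_inpcb_str_size() and get_tcp_str_size(), matching
 * so_cache_zone_element_size computed in socketinit()):
 *
 *	+---------------+-----+---------------+-----+-----------------+
 *	| struct socket | pad | inpcb storage | pad | tcp pcb storage |
 *	+---------------+-----+---------------+-----+-----------------+
 *	^ *so                 ^ so_saved_pcb        ^ inp_saved_ppcb
 */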
496
497 offset = (uintptr_t)*so;
498 offset += sizeof(struct socket);
499
500 offset = ALIGN(offset);
501
502 (*so)->so_saved_pcb = (caddr_t)offset;
503 offset += get_inpcb_str_size();
504
505 offset = ALIGN(offset);
506
507 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
508 (caddr_t)offset;
509 }
510
511 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
512 }
513
514 static void
515 cached_sock_free(struct socket *so)
516 {
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count) {
526 so_cache_hw = cached_sock_count;
527 }
528
529 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
530
531 so->cache_timestamp = so_cache_time;
532 lck_mtx_unlock(so_cache_mtx);
533 }
534 }
535
536 void
537 so_update_last_owner_locked(struct socket *so, proc_t self)
538 {
539 if (so->last_pid != 0) {
540 /*
541 * last_pid and last_upid should remain zero for sockets
542 * created using sock_socket. The check above achieves that
543 */
544 if (self == PROC_NULL) {
545 self = current_proc();
546 }
547
548 if (so->last_upid != proc_uniqueid(self) ||
549 so->last_pid != proc_pid(self)) {
550 so->last_upid = proc_uniqueid(self);
551 so->last_pid = proc_pid(self);
552 proc_getexecutableuuid(self, so->last_uuid,
553 sizeof(so->last_uuid));
554 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
555 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
556 }
557 }
558 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
559 }
560 }
561
562 void
563 so_update_policy(struct socket *so)
564 {
565 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
566 (void) inp_update_policy(sotoinpcb(so));
567 }
568 }
569
570 #if NECP
571 static void
572 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
573 struct sockaddr *override_remote_addr)
574 {
575 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
576 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
577 override_remote_addr, 0);
578 }
579 }
580 #endif /* NECP */
581
582 boolean_t
583 so_cache_timer(void)
584 {
585 struct socket *p;
586 int n_freed = 0;
587 boolean_t rc = FALSE;
588
589 lck_mtx_lock(so_cache_mtx);
590 so_cache_timeouts++;
591 so_cache_time = net_uptime();
592
593 while (!STAILQ_EMPTY(&so_cache_head)) {
594 VERIFY(cached_sock_count > 0);
595 p = STAILQ_FIRST(&so_cache_head);
596 if ((so_cache_time - p->cache_timestamp) <
597 SO_CACHE_TIME_LIMIT) {
598 break;
599 }
600
601 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
602 --cached_sock_count;
603
604 zfree(so_cache_zone, p);
605
606 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
607 so_cache_max_freed++;
608 break;
609 }
610 }
611
612 /* Schedule again if there is more to cleanup */
613 if (!STAILQ_EMPTY(&so_cache_head)) {
614 rc = TRUE;
615 }
616
617 lck_mtx_unlock(so_cache_mtx);
618 return rc;
619 }
620
621 /*
622 * Get a socket structure from our zone, and initialize it.
623 * We don't implement `waitok' yet (see comments in uipc_domain.c).
624 * Note that it would probably be better to allocate socket
625 * and PCB at the same time, but I'm not convinced that all
626 * the protocols can be easily modified to do this.
627 */
628 struct socket *
629 soalloc(int waitok, int dom, int type)
630 {
631 struct socket *so;
632
633 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
634 cached_sock_alloc(&so, waitok);
635 } else {
636 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
637 M_WAITOK);
638 if (so != NULL) {
639 bzero(so, sizeof(*so));
640 }
641 }
642 if (so != NULL) {
643 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
644 so->so_zone = socket_zone;
645
646 /*
647 * Increment the socket allocation statistics
648 */
649 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
650
651 #if CONFIG_MACF_SOCKET
652 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
653 if (mac_socket_label_init(so, !waitok) != 0) {
654 sodealloc(so);
655 return NULL;
656 }
657 #endif /* MAC_SOCKET */
658 }
659
660 return so;
661 }
662
663 int
664 socreate_internal(int dom, struct socket **aso, int type, int proto,
665 struct proc *p, uint32_t flags, struct proc *ep)
666 {
667 struct protosw *prp;
668 struct socket *so;
669 int error = 0;
670 #if defined(XNU_TARGET_OS_OSX)
671 pid_t rpid = -1;
672 #endif
673
674 #if TCPDEBUG
675 extern int tcpconsdebug;
676 #endif
677
678 VERIFY(aso != NULL);
679 *aso = NULL;
680
681 if (proto != 0) {
682 prp = pffindproto(dom, proto, type);
683 } else {
684 prp = pffindtype(dom, type);
685 }
686
687 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
688 if (pffinddomain(dom) == NULL) {
689 return EAFNOSUPPORT;
690 }
691 if (proto != 0) {
692 if (pffindprotonotype(dom, proto) != NULL) {
693 return EPROTOTYPE;
694 }
695 }
696 return EPROTONOSUPPORT;
697 }
698 if (prp->pr_type != type) {
699 return EPROTOTYPE;
700 }
701 so = soalloc(1, dom, type);
702 if (so == NULL) {
703 return ENOBUFS;
704 }
705
706 switch (dom) {
707 case PF_LOCAL:
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
709 break;
710 case PF_INET:
711 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
712 if (type == SOCK_STREAM) {
713 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
714 } else {
715 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
716 }
717 break;
718 case PF_ROUTE:
719 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
720 break;
721 case PF_NDRV:
722 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
723 break;
724 case PF_KEY:
725 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
726 break;
727 case PF_INET6:
728 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
729 if (type == SOCK_STREAM) {
730 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
731 } else {
732 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
733 }
734 break;
735 case PF_SYSTEM:
736 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
737 break;
738 case PF_MULTIPATH:
739 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
740 break;
741 default:
742 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
743 break;
744 }
745
746 if (flags & SOCF_MPTCP) {
747 so->so_state |= SS_NBIO;
748 }
749
750 TAILQ_INIT(&so->so_incomp);
751 TAILQ_INIT(&so->so_comp);
752 so->so_type = type;
753 so->last_upid = proc_uniqueid(p);
754 so->last_pid = proc_pid(p);
755 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
756 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
757
758 if (ep != PROC_NULL && ep != p) {
759 so->e_upid = proc_uniqueid(ep);
760 so->e_pid = proc_pid(ep);
761 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
762 so->so_flags |= SOF_DELEGATED;
763 #if defined(XNU_TARGET_OS_OSX)
764 if (ep->p_responsible_pid != so->e_pid) {
765 rpid = ep->p_responsible_pid;
766 }
767 #endif
768 }
769
770 #if defined(XNU_TARGET_OS_OSX)
771 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
772 rpid = p->p_responsible_pid;
773 }
774
775 so->so_rpid = -1;
776 uuid_clear(so->so_ruuid);
777 if (rpid >= 0) {
778 proc_t rp = proc_find(rpid);
779 if (rp != PROC_NULL) {
780 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
781 so->so_rpid = rpid;
782 proc_rele(rp);
783 }
784 }
785 #endif
786
787 so->so_cred = kauth_cred_proc_ref(p);
788 if (!suser(kauth_cred_get(), NULL)) {
789 so->so_state |= SS_PRIV;
790 }
791
792 so->so_proto = prp;
793 so->so_rcv.sb_flags |= SB_RECV;
794 so->so_rcv.sb_so = so->so_snd.sb_so = so;
795 so->next_lock_lr = 0;
796 so->next_unlock_lr = 0;
797
798 #if CONFIG_MACF_SOCKET
799 mac_socket_label_associate(kauth_cred_get(), so);
800 #endif /* MAC_SOCKET */
801
802 /*
803 * Attachment will create the per-pcb lock if necessary and
804 * increase the refcount for creation; make sure this is done before
805 * the socket is inserted in the lists.
806 */
807 so->so_usecount++;
808
809 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
810 if (error != 0) {
811 /*
812 * Warning:
813 * If so_pcb is not zero, the socket will be leaked,
814 * so protocol attachment handler must be coded carefuly
815 */
816 so->so_state |= SS_NOFDREF;
817 VERIFY(so->so_usecount > 0);
818 so->so_usecount--;
819 sofreelastref(so, 1); /* will deallocate the socket */
820 return error;
821 }
822
823 /*
824 * Note: needs so_pcb to be set after pru_attach
825 */
826 if (prp->pr_update_last_owner != NULL) {
827 (*prp->pr_update_last_owner)(so, p, ep);
828 }
829
830 atomic_add_32(&prp->pr_domain->dom_refs, 1);
831 TAILQ_INIT(&so->so_evlist);
832
833 /* Attach socket filters for this protocol */
834 sflt_initsock(so);
835 #if TCPDEBUG
836 if (tcpconsdebug == 2) {
837 so->so_options |= SO_DEBUG;
838 }
839 #endif
840 so_set_default_traffic_class(so);
841
842 /*
843 * If this thread or task is marked to create backgrounded sockets,
844 * mark the socket as background.
845 */
846 if (!(flags & SOCF_MPTCP) &&
847 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
848 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
849 so->so_background_thread = current_thread();
850 }
851
852 switch (dom) {
853 /*
854 * Don't mark Unix domain or system
855 * eligible for defunct by default.
856 */
857 case PF_LOCAL:
858 case PF_SYSTEM:
859 so->so_flags |= SOF_NODEFUNCT;
860 break;
861 default:
862 break;
863 }
864
865 /*
866 * Entitlements can't be checked at socket creation time except if the
867 * application requested a feature guarded by a privilege (c.f., socket
868 * delegation).
869 * The priv(9) and the Sandboxing APIs are designed with the idea that
870 * a privilege check should only be triggered by a userland request.
871 * A privilege check at socket creation time is time consuming and
872 * could trigger many authorisation error messages from the security
873 * APIs.
874 */
875
876 *aso = so;
877
878 return 0;
879 }
880
881 /*
882 * Returns: 0 Success
883 * EAFNOSUPPORT
884 * EPROTOTYPE
885 * EPROTONOSUPPORT
886 * ENOBUFS
887 * <pru_attach>:ENOBUFS[AF_UNIX]
888 * <pru_attach>:ENOBUFS[TCP]
889 * <pru_attach>:ENOMEM[TCP]
890 * <pru_attach>:??? [other protocol families, IPSEC]
891 */
892 int
893 socreate(int dom, struct socket **aso, int type, int proto)
894 {
895 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
896 PROC_NULL);
897 }
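/*
 * Minimal in-kernel usage sketch for the creation path above (illustrative
 * only and not built; kernel clients normally go through the sock_socket()
 * KPI rather than calling socreate() directly):
 */
#if 0 /* example, not compiled */
static int
example_create_tcp_socket(struct socket **sop)
{
	/* PF_INET + SOCK_STREAM takes the cached_sock_alloc() fast path */
	int error = socreate(PF_INET, sop, SOCK_STREAM, IPPROTO_TCP);

	if (error == 0) {
		/* ... use the socket, then release the reference ... */
		(void) soclose(*sop);
		*sop = NULL;
	}
	return error;
}
#endif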
898
899 int
900 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
901 {
902 int error = 0;
903 struct proc *ep = PROC_NULL;
904
905 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
906 error = ESRCH;
907 goto done;
908 }
909
910 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
911
912 /*
913 * It might not be wise to hold the proc reference when calling
914 * socreate_internal since it calls soalloc with M_WAITOK
915 */
916 done:
917 if (ep != PROC_NULL) {
918 proc_rele(ep);
919 }
920
921 return error;
922 }
923
924 /*
925 * Returns: 0 Success
926 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
927 * <pru_bind>:EAFNOSUPPORT Address family not supported
928 * <pru_bind>:EADDRNOTAVAIL Address not available.
929 * <pru_bind>:EINVAL Invalid argument
930 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
931 * <pru_bind>:EACCES Permission denied
932 * <pru_bind>:EADDRINUSE Address in use
933 * <pru_bind>:EAGAIN Resource unavailable, try again
934 * <pru_bind>:EPERM Operation not permitted
935 * <pru_bind>:???
936 * <sf_bind>:???
937 *
938 * Notes: It's not possible to fully enumerate the return codes above,
939 * since socket filter authors and protocol family authors may
940 * not choose to limit their error returns to those listed, even
941 * though this may result in some software operating incorrectly.
942 *
943 * The error codes which are enumerated above are those known to
944 * be returned by the tcp_usr_bind function supplied.
945 */
946 int
947 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
948 {
949 struct proc *p = current_proc();
950 int error = 0;
951
952 if (dolock) {
953 socket_lock(so, 1);
954 }
955
956 so_update_last_owner_locked(so, p);
957 so_update_policy(so);
958
959 #if NECP
960 so_update_necp_policy(so, nam, NULL);
961 #endif /* NECP */
962
963 /*
964 * If this is a bind request on a socket that has been marked
965 * as inactive, reject it now before we go any further.
966 */
967 if (so->so_flags & SOF_DEFUNCT) {
968 error = EINVAL;
969 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
970 __func__, proc_pid(p), proc_best_name(p),
971 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
972 SOCK_DOM(so), SOCK_TYPE(so), error);
973 goto out;
974 }
975
976 /* Socket filter */
977 error = sflt_bind(so, nam);
978
979 if (error == 0) {
980 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
981 }
982 out:
983 if (dolock) {
984 socket_unlock(so, 1);
985 }
986
987 if (error == EJUSTRETURN) {
988 error = 0;
989 }
990
991 return error;
992 }
993
994 void
995 sodealloc(struct socket *so)
996 {
997 kauth_cred_unref(&so->so_cred);
998
999 /* Remove any filters */
1000 sflt_termsock(so);
1001
1002 #if CONTENT_FILTER
1003 cfil_sock_detach(so);
1004 #endif /* CONTENT_FILTER */
1005
1006 /* Delete the state allocated for msg queues on a socket */
1007 if (so->so_flags & SOF_ENABLE_MSGS) {
1008 FREE(so->so_msg_state, M_TEMP);
1009 so->so_msg_state = NULL;
1010 }
1011 VERIFY(so->so_msg_state == NULL);
1012
1013 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
1014
1015 #if CONFIG_MACF_SOCKET
1016 mac_socket_label_destroy(so);
1017 #endif /* MAC_SOCKET */
1018
1019 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1020 cached_sock_free(so);
1021 } else {
1022 FREE_ZONE(so, sizeof(*so), so->so_zone);
1023 }
1024 }
1025
1026 /*
1027 * Returns: 0 Success
1028 * EINVAL
1029 * EOPNOTSUPP
1030 * <pru_listen>:EINVAL[AF_UNIX]
1031 * <pru_listen>:EINVAL[TCP]
1032 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
1033 * <pru_listen>:EINVAL[TCP] Invalid argument
1034 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
1035 * <pru_listen>:EACCES[TCP] Permission denied
1036 * <pru_listen>:EADDRINUSE[TCP] Address in use
1037 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
1038 * <pru_listen>:EPERM[TCP] Operation not permitted
1039 * <sf_listen>:???
1040 *
1041 * Notes: Other <pru_listen> returns depend on the protocol family; all
1042 * <sf_listen> returns depend on what the filter author causes
1043 * their filter to return.
1044 */
1045 int
1046 solisten(struct socket *so, int backlog)
1047 {
1048 struct proc *p = current_proc();
1049 int error = 0;
1050
1051 socket_lock(so, 1);
1052
1053 so_update_last_owner_locked(so, p);
1054 so_update_policy(so);
1055
1056 #if NECP
1057 so_update_necp_policy(so, NULL, NULL);
1058 #endif /* NECP */
1059
1060 if (so->so_proto == NULL) {
1061 error = EINVAL;
1062 goto out;
1063 }
1064 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1065 error = EOPNOTSUPP;
1066 goto out;
1067 }
1068
1069 /*
1070 * If the listen request is made on a socket that is not fully
1071 * disconnected, or on a socket that has been marked as inactive,
1072 * reject the request now.
1073 */
1074 if ((so->so_state &
1075 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1076 (so->so_flags & SOF_DEFUNCT)) {
1077 error = EINVAL;
1078 if (so->so_flags & SOF_DEFUNCT) {
1079 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1080 "(%d)\n", __func__, proc_pid(p),
1081 proc_best_name(p),
1082 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1083 SOCK_DOM(so), SOCK_TYPE(so), error);
1084 }
1085 goto out;
1086 }
1087
1088 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1089 error = EPERM;
1090 goto out;
1091 }
1092
1093 error = sflt_listen(so);
1094 if (error == 0) {
1095 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1096 }
1097
1098 if (error) {
1099 if (error == EJUSTRETURN) {
1100 error = 0;
1101 }
1102 goto out;
1103 }
1104
1105 if (TAILQ_EMPTY(&so->so_comp)) {
1106 so->so_options |= SO_ACCEPTCONN;
1107 }
1108 /*
1109 * POSIX: The implementation may have an upper limit on the length of
1110 * the listen queue, either global or per accepting socket. If backlog
1111 * exceeds this limit, the length of the listen queue is set to the
1112 * limit.
1113 *
1114 * If listen() is called with a backlog argument value that is less
1115 * than 0, the function behaves as if it had been called with a backlog
1116 * argument value of 0.
1117 *
1118 * A backlog argument of 0 may allow the socket to accept connections,
1119 * in which case the length of the listen queue may be set to an
1120 * implementation-defined minimum value.
1121 */
1122 if (backlog <= 0 || backlog > somaxconn) {
1123 backlog = somaxconn;
1124 }
1125
1126 so->so_qlimit = backlog;
1127 out:
1128 socket_unlock(so, 1);
1129 return error;
1130 }
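/*
 * Userland view of the backlog clamping above (illustrative; as coded, a
 * zero, negative or oversized backlog is replaced with somaxconn, which is
 * tunable through the kern.ipc.somaxconn sysctl declared earlier in this
 * file). Assumes fd is a bound, connection-oriented socket.
 */
#include <sys/socket.h>

static int
example_listen_clamped(int fd)
{
	/* both calls leave so_qlimit == somaxconn inside solisten() */
	if (listen(fd, -1) != 0) {
		return -1;
	}
	return listen(fd, 1000000);
}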
1131
1132 /*
1133 * The "accept list lock" protects the fields related to the listener queues
1134 * because we can unlock a socket to respect the lock ordering between
1135 * the listener socket and its client sockets. The lock ordering is to
1136 * acquire the client socket before the listener socket.
1137 *
1138 * The accept list lock serializes access to the following fields:
1139 * - of the listener socket:
1140 * - so_comp
1141 * - so_incomp
1142 * - so_qlen
1143 * - so_inqlen
1144 * - of client sockets that are in so_comp or so_incomp:
1145 * - so_head
1146 * - so_list
1147 *
1148 * As one can see, the accept list lock protects the consistency of the
1149 * linkage of the client sockets.
1150 *
1151 * Note that those fields may be read without holding the accept list lock
1152 * for a preflight provided the accept list lock is taken when committing
1153 * to take an action based on the result of the preflight. The preflight
1154 * saves the cost of doing the unlock/lock dance.
1155 */
1156 void
1157 so_acquire_accept_list(struct socket *head, struct socket *so)
1158 {
1159 lck_mtx_t *mutex_held;
1160
1161 if (head->so_proto->pr_getlock == NULL) {
1162 return;
1163 }
1164 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1165 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1166
1167 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1168 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1169 return;
1170 }
1171 if (so != NULL) {
1172 socket_unlock(so, 0);
1173 }
1174 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1175 so_accept_list_waits += 1;
1176 msleep((caddr_t)&head->so_incomp, mutex_held,
1177 PSOCK | PCATCH, __func__, NULL);
1178 }
1179 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1180 if (so != NULL) {
1181 socket_unlock(head, 0);
1182 socket_lock(so, 0);
1183 socket_lock(head, 0);
1184 }
1185 }
1186
1187 void
1188 so_release_accept_list(struct socket *head)
1189 {
1190 if (head->so_proto->pr_getlock != NULL) {
1191 lck_mtx_t *mutex_held;
1192
1193 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1194 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1195
1196 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1197 wakeup((caddr_t)&head->so_incomp);
1198 }
1199 }
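/*
 * Illustrative sketch of the preflight pattern described in the comment
 * above so_acquire_accept_list() (not code from this file; assumes the
 * caller already holds the listener's socket lock, as that routine
 * requires):
 */
#if 0 /* example, not compiled */
static struct socket *
example_dequeue_completed(struct socket *head)
{
	struct socket *so = NULL;

	/* preflight: cheap check without taking the accept list lock */
	if (TAILQ_EMPTY(&head->so_comp)) {
		return NULL;
	}

	/* commit: take the accept list lock before touching the linkage */
	so_acquire_accept_list(head, NULL);
	if (!TAILQ_EMPTY(&head->so_comp)) {
		so = TAILQ_FIRST(&head->so_comp);
		TAILQ_REMOVE(&head->so_comp, so, so_list);
		so->so_head = NULL;
		so->so_state &= ~SS_COMP;
		head->so_qlen--;
	}
	so_release_accept_list(head);
	return so;
}
#endif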
1200
1201 void
1202 sofreelastref(struct socket *so, int dealloc)
1203 {
1204 struct socket *head = so->so_head;
1205
1206 /* Assume socket is locked */
1207
1208 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1209 selthreadclear(&so->so_snd.sb_sel);
1210 selthreadclear(&so->so_rcv.sb_sel);
1211 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1212 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1213 so->so_event = sonullevent;
1214 return;
1215 }
1216 if (head != NULL) {
1217 /*
1218 * Need to lock the listener when the protocol has
1219 * per socket locks
1220 */
1221 if (head->so_proto->pr_getlock != NULL) {
1222 socket_lock(head, 1);
1223 so_acquire_accept_list(head, so);
1224 }
1225 if (so->so_state & SS_INCOMP) {
1226 so->so_state &= ~SS_INCOMP;
1227 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1228 head->so_incqlen--;
1229 head->so_qlen--;
1230 so->so_head = NULL;
1231
1232 if (head->so_proto->pr_getlock != NULL) {
1233 so_release_accept_list(head);
1234 socket_unlock(head, 1);
1235 }
1236 } else if (so->so_state & SS_COMP) {
1237 if (head->so_proto->pr_getlock != NULL) {
1238 so_release_accept_list(head);
1239 socket_unlock(head, 1);
1240 }
1241 /*
1242 * We must not decommission a socket that's
1243 * on the accept(2) queue. If we do, then
1244 * accept(2) may hang after select(2) indicated
1245 * that the listening socket was ready.
1246 */
1247 selthreadclear(&so->so_snd.sb_sel);
1248 selthreadclear(&so->so_rcv.sb_sel);
1249 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1250 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1251 so->so_event = sonullevent;
1252 return;
1253 } else {
1254 if (head->so_proto->pr_getlock != NULL) {
1255 so_release_accept_list(head);
1256 socket_unlock(head, 1);
1257 }
1258 printf("sofree: not queued\n");
1259 }
1260 }
1261 sowflush(so);
1262 sorflush(so);
1263
1264 #if FLOW_DIVERT
1265 if (so->so_flags & SOF_FLOW_DIVERT) {
1266 flow_divert_detach(so);
1267 }
1268 #endif /* FLOW_DIVERT */
1269
1270 /* 3932268: disable upcall */
1271 so->so_rcv.sb_flags &= ~SB_UPCALL;
1272 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1273 so->so_event = sonullevent;
1274
1275 if (dealloc) {
1276 sodealloc(so);
1277 }
1278 }
1279
1280 void
1281 soclose_wait_locked(struct socket *so)
1282 {
1283 lck_mtx_t *mutex_held;
1284
1285 if (so->so_proto->pr_getlock != NULL) {
1286 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1287 } else {
1288 mutex_held = so->so_proto->pr_domain->dom_mtx;
1289 }
1290 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1291
1292 /*
1293 * Double check here and return if there's no outstanding upcall;
1294 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1295 */
1296 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1297 return;
1298 }
1299 so->so_rcv.sb_flags &= ~SB_UPCALL;
1300 so->so_snd.sb_flags &= ~SB_UPCALL;
1301 so->so_flags |= SOF_CLOSEWAIT;
1302
1303 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1304 "soclose_wait_locked", NULL);
1305 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1306 so->so_flags &= ~SOF_CLOSEWAIT;
1307 }
1308
1309 /*
1310 * Close a socket on last file table reference removal.
1311 * Initiate disconnect if connected.
1312 * Free socket when disconnect complete.
1313 */
1314 int
1315 soclose_locked(struct socket *so)
1316 {
1317 int error = 0;
1318 struct timespec ts;
1319
1320 if (so->so_usecount == 0) {
1321 panic("soclose: so=%p refcount=0\n", so);
1322 /* NOTREACHED */
1323 }
1324
1325 sflt_notify(so, sock_evt_closing, NULL);
1326
1327 if (so->so_upcallusecount) {
1328 soclose_wait_locked(so);
1329 }
1330
1331 #if CONTENT_FILTER
1332 /*
1333 * We have to wait until the content filters are done
1334 */
1335 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1336 cfil_sock_close_wait(so);
1337 cfil_sock_is_closed(so);
1338 cfil_sock_detach(so);
1339 }
1340 #endif /* CONTENT_FILTER */
1341
1342 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1343 soresume(current_proc(), so, 1);
1344 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1345 }
1346
1347 if ((so->so_options & SO_ACCEPTCONN)) {
1348 struct socket *sp, *sonext;
1349 int persocklock = 0;
1350 int incomp_overflow_only;
1351
1352 /*
1353 * We do not want new connections to be added
1354 * to the connection queues.
1355 */
1356 so->so_options &= ~SO_ACCEPTCONN;
1357
1358 /*
1359 * We can drop the lock on the listener once
1360 * we've acquired the incoming list
1361 */
1362 if (so->so_proto->pr_getlock != NULL) {
1363 persocklock = 1;
1364 so_acquire_accept_list(so, NULL);
1365 socket_unlock(so, 0);
1366 }
1367 again:
1368 incomp_overflow_only = 1;
1369
1370 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1371 /*
1372 * Radar 5350314
1373 * skip sockets thrown away by tcpdropdropablreq;
1374 * they will get cleaned up by the garbage collection.
1375 * otherwise, remove the incomp socket from the queue
1376 * and let soabort trigger the appropriate cleanup.
1377 */
1378 if (sp->so_flags & SOF_OVERFLOW) {
1379 continue;
1380 }
1381
1382 if (persocklock != 0) {
1383 socket_lock(sp, 1);
1384 }
1385
1386 /*
1387 * Radar 27945981
1388 * The extra reference taken for the list ensures the
1389 * validity of the socket pointer when we perform the
1390 * unlock of the head above.
1391 */
1392 if (sp->so_state & SS_INCOMP) {
1393 sp->so_state &= ~SS_INCOMP;
1394 sp->so_head = NULL;
1395 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1396 so->so_incqlen--;
1397 so->so_qlen--;
1398
1399 (void) soabort(sp);
1400 } else {
1401 panic("%s sp %p in so_incomp but !SS_INCOMP",
1402 __func__, sp);
1403 }
1404
1405 if (persocklock != 0) {
1406 socket_unlock(sp, 1);
1407 }
1408 }
1409
1410 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1411 /* Dequeue from so_comp since sofree() won't do it */
1412 if (persocklock != 0) {
1413 socket_lock(sp, 1);
1414 }
1415
1416 if (sp->so_state & SS_COMP) {
1417 sp->so_state &= ~SS_COMP;
1418 sp->so_head = NULL;
1419 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1420 so->so_qlen--;
1421
1422 (void) soabort(sp);
1423 } else {
1424 panic("%s sp %p in so_comp but !SS_COMP",
1425 __func__, sp);
1426 }
1427
1428 if (persocklock) {
1429 socket_unlock(sp, 1);
1430 }
1431 }
1432
1433 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1434 #if (DEBUG | DEVELOPMENT)
1435 panic("%s head %p so_incomp not empty\n", __func__, so);
1436 #endif /* (DEVELOPMENT || DEBUG) */
1437
1438 goto again;
1439 }
1440
1441 if (!TAILQ_EMPTY(&so->so_comp)) {
1442 #if (DEBUG | DEVELOPMENT)
1443 panic("%s head %p so_comp not empty\n", __func__, so);
1444 #endif /* (DEVELOPMENT || DEBUG) */
1445
1446 goto again;
1447 }
1448
1449 if (persocklock) {
1450 socket_lock(so, 0);
1451 so_release_accept_list(so);
1452 }
1453 }
1454 if (so->so_pcb == NULL) {
1455 /* 3915887: mark the socket as ready for dealloc */
1456 so->so_flags |= SOF_PCBCLEARING;
1457 goto discard;
1458 }
1459 if (so->so_state & SS_ISCONNECTED) {
1460 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1461 error = sodisconnectlocked(so);
1462 if (error) {
1463 goto drop;
1464 }
1465 }
1466 if (so->so_options & SO_LINGER) {
1467 lck_mtx_t *mutex_held;
1468
1469 if ((so->so_state & SS_ISDISCONNECTING) &&
1470 (so->so_state & SS_NBIO)) {
1471 goto drop;
1472 }
1473 if (so->so_proto->pr_getlock != NULL) {
1474 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1475 } else {
1476 mutex_held = so->so_proto->pr_domain->dom_mtx;
1477 }
1478 while (so->so_state & SS_ISCONNECTED) {
1479 ts.tv_sec = (so->so_linger / 100);
1480 ts.tv_nsec = (so->so_linger % 100) *
1481 NSEC_PER_USEC * 1000 * 10;
1482 error = msleep((caddr_t)&so->so_timeo,
1483 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1484 if (error) {
1485 /*
1486 * It's OK when the timer fires;
1487 * don't report an error.
1488 */
1489 if (error == EWOULDBLOCK) {
1490 error = 0;
1491 }
1492 break;
1493 }
1494 }
1495 }
1496 }
1497 drop:
1498 if (so->so_usecount == 0) {
1499 panic("soclose: usecount is zero so=%p\n", so);
1500 /* NOTREACHED */
1501 }
1502 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1503 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1504 if (error == 0) {
1505 error = error2;
1506 }
1507 }
1508 if (so->so_usecount <= 0) {
1509 panic("soclose: usecount is zero so=%p\n", so);
1510 /* NOTREACHED */
1511 }
1512 discard:
1513 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1514 (so->so_state & SS_NOFDREF)) {
1515 panic("soclose: NOFDREF");
1516 /* NOTREACHED */
1517 }
1518 so->so_state |= SS_NOFDREF;
1519
1520 if ((so->so_flags & SOF_KNOTE) != 0) {
1521 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1522 }
1523
1524 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1525 evsofree(so);
1526
1527 VERIFY(so->so_usecount > 0);
1528 so->so_usecount--;
1529 sofree(so);
1530 return error;
1531 }
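/*
 * Userland illustration of the SO_LINGER path in soclose_locked() above (a
 * sketch; SO_LINGER_SEC is the Darwin variant taking l_linger in seconds,
 * while plain SO_LINGER appears to be stored in clock ticks, consistent
 * with the division of so_linger by 100 when building the msleep timeout).
 */
#include <sys/socket.h>
#include <unistd.h>

static void
example_linger_close(int fd)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };

	/* block in close() for up to 5 seconds while unsent data drains */
	(void) setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
	(void) close(fd);
}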
1532
1533 int
1534 soclose(struct socket *so)
1535 {
1536 int error = 0;
1537 socket_lock(so, 1);
1538
1539 if (so->so_retaincnt == 0) {
1540 error = soclose_locked(so);
1541 } else {
1542 /*
1543 * if the FD is going away but the socket is
1544 * retained in the kernel, remove its reference
1545 */
1546 so->so_usecount--;
1547 if (so->so_usecount < 2) {
1548 panic("soclose: retaincnt non null and so=%p "
1549 "usecount=%d\n", so, so->so_usecount);
1550 }
1551 }
1552 socket_unlock(so, 1);
1553 return error;
1554 }
1555
1556 /*
1557 * Must be called at splnet...
1558 */
1559 /* Should already be locked */
1560 int
1561 soabort(struct socket *so)
1562 {
1563 int error;
1564
1565 #ifdef MORE_LOCKING_DEBUG
1566 lck_mtx_t *mutex_held;
1567
1568 if (so->so_proto->pr_getlock != NULL) {
1569 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1570 } else {
1571 mutex_held = so->so_proto->pr_domain->dom_mtx;
1572 }
1573 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1574 #endif
1575
1576 if ((so->so_flags & SOF_ABORTED) == 0) {
1577 so->so_flags |= SOF_ABORTED;
1578 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1579 if (error) {
1580 sofree(so);
1581 return error;
1582 }
1583 }
1584 return 0;
1585 }
1586
1587 int
1588 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1589 {
1590 int error;
1591
1592 if (dolock) {
1593 socket_lock(so, 1);
1594 }
1595
1596 so_update_last_owner_locked(so, PROC_NULL);
1597 so_update_policy(so);
1598 #if NECP
1599 so_update_necp_policy(so, NULL, NULL);
1600 #endif /* NECP */
1601
1602 if ((so->so_state & SS_NOFDREF) == 0) {
1603 panic("soaccept: !NOFDREF");
1604 }
1605 so->so_state &= ~SS_NOFDREF;
1606 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1607
1608 if (dolock) {
1609 socket_unlock(so, 1);
1610 }
1611 return error;
1612 }
1613
1614 int
1615 soaccept(struct socket *so, struct sockaddr **nam)
1616 {
1617 return soacceptlock(so, nam, 1);
1618 }
1619
1620 int
1621 soacceptfilter(struct socket *so, struct socket *head)
1622 {
1623 struct sockaddr *local = NULL, *remote = NULL;
1624 int error = 0;
1625
1626 /*
1627 * Hold the lock even if this socket has not been made visible
1628 * to the filter(s). For sockets with global locks, this protects
1629 * against the head or peer going away
1630 */
1631 socket_lock(so, 1);
1632 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1633 sogetaddr_locked(so, &local, 0) != 0) {
1634 so->so_state &= ~SS_NOFDREF;
1635 socket_unlock(so, 1);
1636 soclose(so);
1637 /* Out of resources; try it again next time */
1638 error = ECONNABORTED;
1639 goto done;
1640 }
1641
1642 error = sflt_accept(head, so, local, remote);
1643
1644 /*
1645 * If we get EJUSTRETURN from one of the filters, mark this socket
1646 * as inactive and return it anyway. This newly accepted socket
1647 * will be disconnected later before we hand it off to the caller.
1648 */
1649 if (error == EJUSTRETURN) {
1650 error = 0;
1651 (void) sosetdefunct(current_proc(), so,
1652 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1653 }
1654
1655 if (error != 0) {
1656 /*
1657 * This may seem like a duplication to the above error
1658 * handling part when we return ECONNABORTED, except
1659 * the following is done while holding the lock since
1660 * the socket has been exposed to the filter(s) earlier.
1661 */
1662 so->so_state &= ~SS_NOFDREF;
1663 socket_unlock(so, 1);
1664 soclose(so);
1665 /* Propagate socket filter's error code to the caller */
1666 } else {
1667 socket_unlock(so, 1);
1668 }
1669 done:
1670 /* Callee checks for NULL pointer */
1671 sock_freeaddr(remote);
1672 sock_freeaddr(local);
1673 return error;
1674 }
1675
1676 /*
1677 * Returns: 0 Success
1678 * EOPNOTSUPP Operation not supported on socket
1679 * EISCONN Socket is connected
1680 * <pru_connect>:EADDRNOTAVAIL Address not available.
1681 * <pru_connect>:EINVAL Invalid argument
1682 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1683 * <pru_connect>:EACCES Permission denied
1684 * <pru_connect>:EADDRINUSE Address in use
1685 * <pru_connect>:EAGAIN Resource unavailable, try again
1686 * <pru_connect>:EPERM Operation not permitted
1687 * <sf_connect_out>:??? [anything a filter writer might set]
1688 */
1689 int
1690 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1691 {
1692 int error;
1693 struct proc *p = current_proc();
1694
1695 if (dolock) {
1696 socket_lock(so, 1);
1697 }
1698
1699 so_update_last_owner_locked(so, p);
1700 so_update_policy(so);
1701
1702 #if NECP
1703 so_update_necp_policy(so, NULL, nam);
1704 #endif /* NECP */
1705
1706 /*
1707 * If this is a listening socket or if this is a previously-accepted
1708 * socket that has been marked as inactive, reject the connect request.
1709 */
1710 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1711 error = EOPNOTSUPP;
1712 if (so->so_flags & SOF_DEFUNCT) {
1713 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1714 "(%d)\n", __func__, proc_pid(p),
1715 proc_best_name(p),
1716 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1717 SOCK_DOM(so), SOCK_TYPE(so), error);
1718 }
1719 if (dolock) {
1720 socket_unlock(so, 1);
1721 }
1722 return error;
1723 }
1724
1725 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1726 if (dolock) {
1727 socket_unlock(so, 1);
1728 }
1729 return EPERM;
1730 }
1731
1732 /*
1733 * If protocol is connection-based, can only connect once.
1734 * Otherwise, if connected, try to disconnect first.
1735 * This allows user to disconnect by connecting to, e.g.,
1736 * a null address.
1737 */
1738 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1739 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1740 (error = sodisconnectlocked(so)))) {
1741 error = EISCONN;
1742 } else {
1743 /*
1744 * Run connect filter before calling protocol:
1745 * - non-blocking connect returns before completion;
1746 */
1747 error = sflt_connectout(so, nam);
1748 if (error != 0) {
1749 if (error == EJUSTRETURN) {
1750 error = 0;
1751 }
1752 } else {
1753 error = (*so->so_proto->pr_usrreqs->pru_connect)
1754 (so, nam, p);
1755 if (error != 0) {
1756 so->so_state &= ~SS_ISCONNECTING;
1757 }
1758 }
1759 }
1760 if (dolock) {
1761 socket_unlock(so, 1);
1762 }
1763 return error;
1764 }
1765
1766 int
1767 soconnect(struct socket *so, struct sockaddr *nam)
1768 {
1769 return soconnectlock(so, nam, 1);
1770 }
1771
1772 /*
1773 * Returns: 0 Success
1774 * <pru_connect2>:EINVAL[AF_UNIX]
1775 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1776 * <pru_connect2>:??? [other protocol families]
1777 *
1778 * Notes: <pru_connect2> is not supported by [TCP].
1779 */
1780 int
1781 soconnect2(struct socket *so1, struct socket *so2)
1782 {
1783 int error;
1784
1785 socket_lock(so1, 1);
1786 if (so2->so_proto->pr_lock) {
1787 socket_lock(so2, 1);
1788 }
1789
1790 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1791
1792 socket_unlock(so1, 1);
1793 if (so2->so_proto->pr_lock) {
1794 socket_unlock(so2, 1);
1795 }
1796 return error;
1797 }
1798
1799 int
1800 soconnectxlocked(struct socket *so, struct sockaddr *src,
1801 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1802 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1803 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1804 {
1805 int error;
1806
1807 so_update_last_owner_locked(so, p);
1808 so_update_policy(so);
1809
1810 /*
1811 * If this is a listening socket or if this is a previously-accepted
1812 * socket that has been marked as inactive, reject the connect request.
1813 */
1814 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1815 error = EOPNOTSUPP;
1816 if (so->so_flags & SOF_DEFUNCT) {
1817 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1818 "(%d)\n", __func__, proc_pid(p),
1819 proc_best_name(p),
1820 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1821 SOCK_DOM(so), SOCK_TYPE(so), error);
1822 }
1823 return error;
1824 }
1825
1826 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1827 return EPERM;
1828 }
1829
1830 /*
1831 * If protocol is connection-based, can only connect once
1832 * unless PR_MULTICONN is set. Otherwise, if connected,
1833 * try to disconnect first. This allows user to disconnect
1834 * by connecting to, e.g., a null address.
1835 */
1836 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1837 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1838 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1839 (error = sodisconnectlocked(so)) != 0)) {
1840 error = EISCONN;
1841 } else {
1842 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1843 (flags & CONNECT_DATA_IDEMPOTENT)) {
1844 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1845
1846 if (flags & CONNECT_DATA_AUTHENTICATED) {
1847 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1848 }
1849 }
1850
1851 /*
1852 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1853 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1854 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1855 * Case 3 allows the user to combine write with connect even if they have
1856 * no use for TFO (such as regular TCP and UDP).
1857 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1858 */
1859 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1860 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1861 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1862 }
1863
1864 /*
1865 * If a user sets data idempotent but neither passes a uio nor
1866 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1867 * SOF1_DATA_IDEMPOTENT.
1868 */
1869 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1870 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1871 /* We should return EINVAL instead perhaps. */
1872 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1873 }
1874
1875 /*
1876 * Run connect filter before calling protocol:
1877 * - non-blocking connect returns before completion;
1878 */
1879 error = sflt_connectout(so, dst);
1880 if (error != 0) {
1881 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1882 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1883 if (error == EJUSTRETURN) {
1884 error = 0;
1885 }
1886 } else {
1887 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1888 (so, src, dst, p, ifscope, aid, pcid,
1889 flags, arg, arglen, auio, bytes_written);
1890 if (error != 0) {
1891 so->so_state &= ~SS_ISCONNECTING;
1892 if (error != EINPROGRESS) {
1893 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1894 }
1895 }
1896 }
1897 }
1898
1899 return error;
1900 }
1901
1902 int
1903 sodisconnectlocked(struct socket *so)
1904 {
1905 int error;
1906
1907 if ((so->so_state & SS_ISCONNECTED) == 0) {
1908 error = ENOTCONN;
1909 goto bad;
1910 }
1911 if (so->so_state & SS_ISDISCONNECTING) {
1912 error = EALREADY;
1913 goto bad;
1914 }
1915
1916 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1917 if (error == 0) {
1918 sflt_notify(so, sock_evt_disconnected, NULL);
1919 }
1920
1921 bad:
1922 return error;
1923 }
1924
1925 /* Locking version */
1926 int
1927 sodisconnect(struct socket *so)
1928 {
1929 int error;
1930
1931 socket_lock(so, 1);
1932 error = sodisconnectlocked(so);
1933 socket_unlock(so, 1);
1934 return error;
1935 }
1936
1937 int
1938 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1939 {
1940 int error;
1941
1942 /*
1943 * Call the protocol disconnectx handler; let it handle all
1944 * matters related to the connection state of this session.
1945 */
1946 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1947 if (error == 0) {
1948 /*
1949 * The event applies only for the session, not for
1950 * the disconnection of individual subflows.
1951 */
1952 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1953 sflt_notify(so, sock_evt_disconnected, NULL);
1954 }
1955 }
1956 return error;
1957 }
1958
1959 int
1960 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1961 {
1962 int error;
1963
1964 socket_lock(so, 1);
1965 error = sodisconnectxlocked(so, aid, cid);
1966 socket_unlock(so, 1);
1967 return error;
1968 }
1969
1970 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
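/*
 * Usage sketch: callers below pass the sendmsg/recvmsg flags through
 * SBLOCKWAIT() so that MSG_DONTWAIT turns the sockbuf lock acquisition
 * into a non-blocking attempt (SBL_WAIT is omitted), e.g.:
 *
 *	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 */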
1971
1972 /*
1973 * sosendcheck will lock the socket buffer if it isn't locked and
1974 * verify that there is space for the data being inserted.
1975 *
1976 * Returns: 0 Success
1977 * EPIPE
1978 * sblock:EWOULDBLOCK
1979 * sblock:EINTR
1980 * sbwait:EBADF
1981 * sbwait:EINTR
1982 * [so_error]:???
1983 */
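/*
 * Call sketch (mirrors the use in sosend() below); on return, *sblocked
 * tells the caller whether it now owns the send sockbuf lock and is
 * responsible for the matching sbunlock():
 *
 *	int sblocked = 0;
 *	error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *	    &sblocked, control);
 */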
1984 int
1985 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1986 int32_t clen, int32_t atomic, int flags, int *sblocked,
1987 struct mbuf *control)
1988 {
1989 int error = 0;
1990 int32_t space;
1991 int assumelock = 0;
1992
1993 restart:
1994 if (*sblocked == 0) {
1995 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1996 so->so_send_filt_thread != 0 &&
1997 so->so_send_filt_thread == current_thread()) {
1998 /*
1999 * We're being called recursively from a filter,
2000 * allow this to continue. Radar 4150520.
2001 * Don't set sblocked because we don't want
2002 * to perform an unlock later.
2003 */
2004 assumelock = 1;
2005 } else {
2006 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
2007 if (error) {
2008 if (so->so_flags & SOF_DEFUNCT) {
2009 goto defunct;
2010 }
2011 return error;
2012 }
2013 *sblocked = 1;
2014 }
2015 }
2016
2017 /*
2018 * If a send attempt is made on a socket that has been marked
2019 * as inactive (disconnected), reject the request.
2020 */
2021 if (so->so_flags & SOF_DEFUNCT) {
2022 defunct:
2023 error = EPIPE;
2024 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
2025 __func__, proc_selfpid(), proc_best_name(current_proc()),
2026 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2027 SOCK_DOM(so), SOCK_TYPE(so), error);
2028 return error;
2029 }
2030
2031 if (so->so_state & SS_CANTSENDMORE) {
2032 #if CONTENT_FILTER
2033 /*
2034 * Can re-inject data on half-closed connections
2035 */
2036 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2037 so->so_snd.sb_cfil_thread == current_thread() &&
2038 cfil_sock_data_pending(&so->so_snd) != 0) {
2039 CFIL_LOG(LOG_INFO,
2040 "so %llx ignore SS_CANTSENDMORE",
2041 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2042 } else
2043 #endif /* CONTENT_FILTER */
2044 return EPIPE;
2045 }
2046 if (so->so_error) {
2047 error = so->so_error;
2048 so->so_error = 0;
2049 return error;
2050 }
2051
2052 if ((so->so_state & SS_ISCONNECTED) == 0) {
2053 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2054 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2055 (resid != 0 || clen == 0) &&
2056 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2057 return ENOTCONN;
2058 }
2059 } else if (addr == 0) {
2060 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2061 ENOTCONN : EDESTADDRREQ;
2062 }
2063 }
2064
2065 if (so->so_flags & SOF_ENABLE_MSGS) {
2066 space = msgq_sbspace(so, control);
2067 } else {
2068 space = sbspace(&so->so_snd);
2069 }
2070
2071 if (flags & MSG_OOB) {
2072 space += 1024;
2073 }
2074 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2075 clen > so->so_snd.sb_hiwat) {
2076 return EMSGSIZE;
2077 }
2078
2079 if ((space < resid + clen &&
2080 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2081 space < clen)) ||
2082 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2083 /*
2084 * don't block the connectx call when there's more data
2085 * than can be copied.
2086 */
2087 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2088 if (space == 0) {
2089 return EWOULDBLOCK;
2090 }
2091 if (space < (int32_t)so->so_snd.sb_lowat) {
2092 return 0;
2093 }
2094 }
2095 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2096 assumelock) {
2097 return EWOULDBLOCK;
2098 }
2099 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2100 *sblocked = 0;
2101 error = sbwait(&so->so_snd);
2102 if (error) {
2103 if (so->so_flags & SOF_DEFUNCT) {
2104 goto defunct;
2105 }
2106 return error;
2107 }
2108 goto restart;
2109 }
2110 return 0;
2111 }
2112
2113 /*
2114 * Send on a socket.
2115 * If send must go all at once and message is larger than
2116 * send buffering, then hard error.
2117 * Lock against other senders.
2118 * If must go all at once and not enough room now, then
2119 * inform user that this would block and do nothing.
2120 * Otherwise, if nonblocking, send as much as possible.
2121 * The data to be sent is described by "uio" if nonzero,
2122 * otherwise by the mbuf chain "top" (which must be null
2123 * if uio is not). Data provided in mbuf chain must be small
2124 * enough to send all at once.
2125 *
2126 * Returns nonzero on error, timeout or signal; callers
2127 * must check for short counts if EINTR/ERESTART are returned.
2128 * Data and control buffers are freed on return.
2129 *
2130 * Returns: 0 Success
2131 * EOPNOTSUPP
2132 * EINVAL
2133 * ENOBUFS
2134 * uiomove:EFAULT
2135 * sosendcheck:EPIPE
2136 * sosendcheck:EWOULDBLOCK
2137 * sosendcheck:EINTR
2138 * sosendcheck:EBADF
2139 * sosendcheck:EINTR
2140 * sosendcheck:??? [value from so_error]
2141 * <pru_send>:ECONNRESET[TCP]
2142 * <pru_send>:EINVAL[TCP]
2143 * <pru_send>:ENOBUFS[TCP]
2144 * <pru_send>:EADDRINUSE[TCP]
2145 * <pru_send>:EADDRNOTAVAIL[TCP]
2146 * <pru_send>:EAFNOSUPPORT[TCP]
2147 * <pru_send>:EACCES[TCP]
2148 * <pru_send>:EAGAIN[TCP]
2149 * <pru_send>:EPERM[TCP]
2150 * <pru_send>:EMSGSIZE[TCP]
2151 * <pru_send>:EHOSTUNREACH[TCP]
2152 * <pru_send>:ENETUNREACH[TCP]
2153 * <pru_send>:ENETDOWN[TCP]
2154 * <pru_send>:ENOMEM[TCP]
2155 * <pru_send>:ENOBUFS[TCP]
2156 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2157 * <pru_send>:EINVAL[AF_UNIX]
2158 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2159 * <pru_send>:EPIPE[AF_UNIX]
2160 * <pru_send>:ENOTCONN[AF_UNIX]
2161 * <pru_send>:EISCONN[AF_UNIX]
2162 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2163 * <sf_data_out>:??? [whatever a filter author chooses]
2164 *
2165 * Notes: Other <pru_send> returns depend on the protocol family; all
2166 * <sf_data_out> returns depend on what the filter author causes
2167 * their filter to return.
2168 */
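/*
 * Call sketch: the data is passed either as a uio or as a prepackaged
 * mbuf chain in "top", never both, e.g. for the common uio case:
 *
 *	error = sosend(so, addr, uio, NULL, control, flags);
 */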
2169 int
2170 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2171 struct mbuf *top, struct mbuf *control, int flags)
2172 {
2173 struct mbuf **mp;
2174 struct mbuf *m, *freelist = NULL;
2175 user_ssize_t space, len, resid, orig_resid;
2176 int clen = 0, error, dontroute, mlen, sendflags;
2177 int atomic = sosendallatonce(so) || top;
2178 int sblocked = 0;
2179 struct proc *p = current_proc();
2180 struct mbuf *control_copy = NULL;
2181 uint16_t headroom = 0;
2182 boolean_t en_tracing = FALSE;
2183
2184 if (uio != NULL) {
2185 resid = uio_resid(uio);
2186 } else {
2187 resid = top->m_pkthdr.len;
2188 }
2189
2190 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2191 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2192
2193 socket_lock(so, 1);
2194
2195 /*
2196 * trace if tracing & network (vs. unix) sockets &
2197 * non-loopback
2198 */
2199 if (ENTR_SHOULDTRACE &&
2200 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2201 struct inpcb *inp = sotoinpcb(so);
2202 if (inp->inp_last_outifp != NULL &&
2203 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2204 en_tracing = TRUE;
2205 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2206 VM_KERNEL_ADDRPERM(so),
2207 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2208 (int64_t)resid);
2209 orig_resid = resid;
2210 }
2211 }
2212
2213 /*
2214 * Re-injection should not affect process accounting
2215 */
2216 if ((flags & MSG_SKIPCFIL) == 0) {
2217 so_update_last_owner_locked(so, p);
2218 so_update_policy(so);
2219
2220 #if NECP
2221 so_update_necp_policy(so, NULL, addr);
2222 #endif /* NECP */
2223 }
2224
2225 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2226 error = EOPNOTSUPP;
2227 goto out_locked;
2228 }
2229
2230 /*
2231 * In theory resid should be unsigned.
2232 * However, space must be signed, as it might be less than 0
2233 * if we over-committed, and we must use a signed comparison
2234 * of space and resid. On the other hand, a negative resid
2235 * causes us to loop sending 0-length segments to the protocol.
2236 *
2237 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2238 * But it will be used by sockets doing message delivery.
2239 *
2240 * Note: We limit resid to be a positive int value as we use
2241 * imin() to set bytes_to_copy -- radr://14558484
2242 */
2243 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2244 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2245 error = EINVAL;
2246 goto out_locked;
2247 }
2248
2249 dontroute = (flags & MSG_DONTROUTE) &&
2250 (so->so_options & SO_DONTROUTE) == 0 &&
2251 (so->so_proto->pr_flags & PR_ATOMIC);
2252 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2253
2254 if (control != NULL) {
2255 clen = control->m_len;
2256 }
2257
2258 if (soreserveheadroom != 0) {
2259 headroom = so->so_pktheadroom;
2260 }
2261
2262 do {
2263 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2264 &sblocked, control);
2265 if (error) {
2266 goto out_locked;
2267 }
2268
2269 mp = &top;
2270 if (so->so_flags & SOF_ENABLE_MSGS) {
2271 space = msgq_sbspace(so, control);
2272 } else {
2273 space = sbspace(&so->so_snd) - clen;
2274 }
2275 space += ((flags & MSG_OOB) ? 1024 : 0);
2276
2277 do {
2278 if (uio == NULL) {
2279 /*
2280 * Data is prepackaged in "top".
2281 */
2282 resid = 0;
2283 if (flags & MSG_EOR) {
2284 top->m_flags |= M_EOR;
2285 }
2286 } else {
2287 int chainlength;
2288 int bytes_to_copy;
2289 boolean_t jumbocl;
2290 boolean_t bigcl;
2291 int bytes_to_alloc;
2292
2293 bytes_to_copy = imin(resid, space);
2294
2295 bytes_to_alloc = bytes_to_copy;
2296 if (top == NULL) {
2297 bytes_to_alloc += headroom;
2298 }
2299
2300 if (sosendminchain > 0) {
2301 chainlength = 0;
2302 } else {
2303 chainlength = sosendmaxchain;
2304 }
2305
2306 /*
2307 * Use big 4 KB cluster when the outgoing interface
2308 * does not prefer 2 KB clusters
2309 */
2310 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2311 sosendbigcl_ignore_capab;
2312
2313 /*
2314 * Attempt to use larger than system page-size
2315 * clusters for large writes only if there is
2316 * a jumbo cluster pool and if the socket is
2317 * marked accordingly.
2318 */
2319 jumbocl = sosendjcl && njcl > 0 &&
2320 ((so->so_flags & SOF_MULTIPAGES) ||
2321 sosendjcl_ignore_capab) &&
2322 bigcl;
2323
2324 socket_unlock(so, 0);
2325
2326 do {
2327 int num_needed;
2328 int hdrs_needed = (top == NULL) ? 1 : 0;
2329
2330 /*
2331 * Try to maintain a local cache of mbuf
2332 * clusters needed to complete this
2333 * write; the list is further limited to
2334 * the number that are currently needed
2335 * to fill the socket. This mechanism
2336 * allows a large number of mbufs/
2337 * clusters to be grabbed under a single
2338 * mbuf lock... if we can't get any
2339 * clusters, then fall back to trying
2340 * for mbufs. If we fail early (or
2341 * miscalculate the number needed), make
2342 * sure to release any clusters we
2343 * haven't yet consumed.
2344 */
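/*
 * Sizing example (assuming M16KCLBYTES is 16 KB): a 40 KB
 * write yields 40 KB / 16 KB = 2 jumbo clusters, and the
 * 8 KB remainder is >= MINCLSIZE, so num_needed becomes 3.
 */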
2345 if (freelist == NULL &&
2346 bytes_to_alloc > MBIGCLBYTES &&
2347 jumbocl) {
2348 num_needed =
2349 bytes_to_alloc / M16KCLBYTES;
2350
2351 if ((bytes_to_alloc -
2352 (num_needed * M16KCLBYTES))
2353 >= MINCLSIZE) {
2354 num_needed++;
2355 }
2356
2357 freelist =
2358 m_getpackets_internal(
2359 (unsigned int *)&num_needed,
2360 hdrs_needed, M_WAIT, 0,
2361 M16KCLBYTES);
2362 /*
2363 * Fall back to 4K cluster size
2364 * if allocation failed
2365 */
2366 }
2367
2368 if (freelist == NULL &&
2369 bytes_to_alloc > MCLBYTES &&
2370 bigcl) {
2371 num_needed =
2372 bytes_to_alloc / MBIGCLBYTES;
2373
2374 if ((bytes_to_alloc -
2375 (num_needed * MBIGCLBYTES)) >=
2376 MINCLSIZE) {
2377 num_needed++;
2378 }
2379
2380 freelist =
2381 m_getpackets_internal(
2382 (unsigned int *)&num_needed,
2383 hdrs_needed, M_WAIT, 0,
2384 MBIGCLBYTES);
2385 /*
2386 * Fall back to cluster size
2387 * if allocation failed
2388 */
2389 }
2390
2391 /*
2392 * Allocate a cluster as we want to
2393 * avoid splitting the data into more
2394 * than one segment; using MINCLSIZE
2395 * would lead us to allocate two mbufs
2396 */
2397 if (soreserveheadroom != 0 &&
2398 freelist == NULL &&
2399 ((top == NULL &&
2400 bytes_to_alloc > _MHLEN) ||
2401 bytes_to_alloc > _MLEN)) {
2402 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2403 MCLBYTES;
2404 freelist =
2405 m_getpackets_internal(
2406 (unsigned int *)&num_needed,
2407 hdrs_needed, M_WAIT, 0,
2408 MCLBYTES);
2409 /*
2410 * Fall back to a single mbuf
2411 * if allocation failed
2412 */
2413 } else if (freelist == NULL &&
2414 bytes_to_alloc > MINCLSIZE) {
2415 num_needed =
2416 bytes_to_alloc / MCLBYTES;
2417
2418 if ((bytes_to_alloc -
2419 (num_needed * MCLBYTES)) >=
2420 MINCLSIZE) {
2421 num_needed++;
2422 }
2423
2424 freelist =
2425 m_getpackets_internal(
2426 (unsigned int *)&num_needed,
2427 hdrs_needed, M_WAIT, 0,
2428 MCLBYTES);
2429 /*
2430 * Fall back to a single mbuf
2431 * if allocation failed
2432 */
2433 }
2434 /*
2435 * For datagram protocols, leave
2436 * headroom for protocol headers
2437 * in the first cluster of the chain
2438 */
2439 if (freelist != NULL && atomic &&
2440 top == NULL && headroom > 0) {
2441 freelist->m_data += headroom;
2442 }
2443
2444 /*
2445 * Fall back to regular mbufs without
2446 * reserving the socket headroom
2447 */
2448 if (freelist == NULL) {
2449 if (top == NULL) {
2450 MGETHDR(freelist,
2451 M_WAIT, MT_DATA);
2452 } else {
2453 MGET(freelist,
2454 M_WAIT, MT_DATA);
2455 }
2456
2457 if (freelist == NULL) {
2458 error = ENOBUFS;
2459 socket_lock(so, 0);
2460 goto out_locked;
2461 }
2462 /*
2463 * For datagram protocols,
2464 * leave room for protocol
2465 * headers in first mbuf.
2466 */
2467 if (atomic && top == NULL &&
2468 bytes_to_copy < MHLEN) {
2469 MH_ALIGN(freelist,
2470 bytes_to_copy);
2471 }
2472 }
2473 m = freelist;
2474 freelist = m->m_next;
2475 m->m_next = NULL;
2476
2477 if ((m->m_flags & M_EXT)) {
2478 mlen = m->m_ext.ext_size -
2479 M_LEADINGSPACE(m);
2480 } else if ((m->m_flags & M_PKTHDR)) {
2481 mlen =
2482 MHLEN - M_LEADINGSPACE(m);
2483 } else {
2484 mlen = MLEN - M_LEADINGSPACE(m);
2485 }
2486 len = imin(mlen, bytes_to_copy);
2487
2488 chainlength += len;
2489
2490 space -= len;
2491
2492 error = uiomove(mtod(m, caddr_t),
2493 len, uio);
2494
2495 resid = uio_resid(uio);
2496
2497 m->m_len = len;
2498 *mp = m;
2499 top->m_pkthdr.len += len;
2500 if (error) {
2501 break;
2502 }
2503 mp = &m->m_next;
2504 if (resid <= 0) {
2505 if (flags & MSG_EOR) {
2506 top->m_flags |= M_EOR;
2507 }
2508 break;
2509 }
2510 bytes_to_copy = min(resid, space);
2511 } while (space > 0 &&
2512 (chainlength < sosendmaxchain || atomic ||
2513 resid < MINCLSIZE));
2514
2515 socket_lock(so, 0);
2516
2517 if (error) {
2518 goto out_locked;
2519 }
2520 }
2521
2522 if (dontroute) {
2523 so->so_options |= SO_DONTROUTE;
2524 }
2525
2526 /*
2527 * Compute flags here, for pru_send and NKEs
2528 *
2529 * If the user set MSG_EOF, the protocol
2530 * understands this flag, and there is nothing left
2531 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2532 */
2533 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2534 ((flags & MSG_EOF) &&
2535 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2536 (resid <= 0)) ? PRUS_EOF :
2537 /* If there is more to send set PRUS_MORETOCOME */
2538 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2539
2540 if ((flags & MSG_SKIPCFIL) == 0) {
2541 /*
2542 * Socket filter processing
2543 */
2544 error = sflt_data_out(so, addr, &top,
2545 &control, (sendflags & MSG_OOB) ?
2546 sock_data_filt_flag_oob : 0);
2547 if (error) {
2548 if (error == EJUSTRETURN) {
2549 error = 0;
2550 clen = 0;
2551 control = NULL;
2552 top = NULL;
2553 }
2554 goto out_locked;
2555 }
2556 #if CONTENT_FILTER
2557 /*
2558 * Content filter processing
2559 */
2560 error = cfil_sock_data_out(so, addr, top,
2561 control, sendflags);
2562 if (error) {
2563 if (error == EJUSTRETURN) {
2564 error = 0;
2565 clen = 0;
2566 control = NULL;
2567 top = NULL;
2568 }
2569 goto out_locked;
2570 }
2571 #endif /* CONTENT_FILTER */
2572 }
2573 if (so->so_flags & SOF_ENABLE_MSGS) {
2574 /*
2575 * Make a copy of control mbuf,
2576 * so that msg priority can be
2577 * passed to subsequent mbufs.
2578 */
2579 control_copy = m_dup(control, M_NOWAIT);
2580 }
2581 error = (*so->so_proto->pr_usrreqs->pru_send)
2582 (so, sendflags, top, addr, control, p);
2583
2584 if (dontroute) {
2585 so->so_options &= ~SO_DONTROUTE;
2586 }
2587
2588 clen = 0;
2589 control = control_copy;
2590 control_copy = NULL;
2591 top = NULL;
2592 mp = &top;
2593 if (error) {
2594 goto out_locked;
2595 }
2596 } while (resid && space > 0);
2597 } while (resid);
2598
2599 out_locked:
2600 if (sblocked) {
2601 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2602 } else {
2603 socket_unlock(so, 1);
2604 }
2605 if (top != NULL) {
2606 m_freem(top);
2607 }
2608 if (control != NULL) {
2609 m_freem(control);
2610 }
2611 if (freelist != NULL) {
2612 m_freem_list(freelist);
2613 }
2614 if (control_copy != NULL) {
2615 m_freem(control_copy);
2616 }
2617
2618 soclearfastopen(so);
2619
2620 if (en_tracing) {
2621 /* resid passed here is the bytes left in uio */
2622 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2623 VM_KERNEL_ADDRPERM(so),
2624 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2625 (int64_t)(orig_resid - resid));
2626 }
2627 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2628 so->so_snd.sb_cc, space, error);
2629
2630 return error;
2631 }
2632
2633 int
2634 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2635 {
2636 struct mbuf *m0 = NULL, *control_end = NULL;
2637
2638 socket_lock_assert_owned(so);
2639
2640 /*
2641 * top must point to the mbuf chain to be sent.
2642 * If control is not NULL, top must have a packet header
2643 */
2644 VERIFY(top != NULL &&
2645 (control == NULL || top->m_flags & M_PKTHDR));
2646
2647 /*
2648 * If control is not passed in, see if we can get it
2649 * from top.
2650 */
2651 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2652 // Locate start of control if present and start of data
2653 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2654 if (m0->m_flags & M_PKTHDR) {
2655 top = m0;
2656 break;
2657 } else if (m0->m_type == MT_CONTROL) {
2658 if (control == NULL) {
2659 // Found start of control
2660 control = m0;
2661 }
2662 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2663 // Found end of control
2664 control_end = m0;
2665 }
2666 }
2667 }
2668 if (control_end != NULL) {
2669 control_end->m_next = NULL;
2670 }
2671 }
2672
2673 int error = (*so->so_proto->pr_usrreqs->pru_send)
2674 (so, sendflags, top, addr, control, current_proc());
2675
2676 return error;
2677 }
2678
2679 /*
2680 * Supports only connected sockets (no address) without ancillary data
2681 * (control mbuf), for atomic protocols
2682 */
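/*
 * Call sketch: one uio per datagram; only MSG_DONTWAIT and MSG_NBIO are
 * accepted in "flags", and the socket must be an atomic SOCK_DGRAM
 * socket whose protocol implements pru_send_list, e.g.:
 *
 *	error = sosend_list(so, uioarray, uiocnt, MSG_DONTWAIT);
 */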
2683 int
2684 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2685 {
2686 struct mbuf *m, *freelist = NULL;
2687 user_ssize_t len, resid;
2688 int error, dontroute, mlen;
2689 int atomic = sosendallatonce(so);
2690 int sblocked = 0;
2691 struct proc *p = current_proc();
2692 u_int uiofirst = 0;
2693 u_int uiolast = 0;
2694 struct mbuf *top = NULL;
2695 uint16_t headroom = 0;
2696 boolean_t bigcl;
2697
2698 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2699 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2700
2701 if (so->so_type != SOCK_DGRAM) {
2702 error = EINVAL;
2703 goto out;
2704 }
2705 if (atomic == 0) {
2706 error = EINVAL;
2707 goto out;
2708 }
2709 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2710 error = EPROTONOSUPPORT;
2711 goto out;
2712 }
2713 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2714 error = EINVAL;
2715 goto out;
2716 }
2717 resid = uio_array_resid(uioarray, uiocnt);
2718
2719 /*
2720 * In theory resid should be unsigned.
2721 * However, space must be signed, as it might be less than 0
2722 * if we over-committed, and we must use a signed comparison
2723 * of space and resid. On the other hand, a negative resid
2724 * causes us to loop sending 0-length segments to the protocol.
2725 *
2726 * Note: We limit resid to be a positive int value as we use
2727 * imin() to set bytes_to_copy -- radr://14558484
2728 */
2729 if (resid < 0 || resid > INT_MAX) {
2730 error = EINVAL;
2731 goto out;
2732 }
2733
2734 socket_lock(so, 1);
2735 so_update_last_owner_locked(so, p);
2736 so_update_policy(so);
2737
2738 #if NECP
2739 so_update_necp_policy(so, NULL, NULL);
2740 #endif /* NECP */
2741
2742 dontroute = (flags & MSG_DONTROUTE) &&
2743 (so->so_options & SO_DONTROUTE) == 0 &&
2744 (so->so_proto->pr_flags & PR_ATOMIC);
2745 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2746
2747 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2748 &sblocked, NULL);
2749 if (error) {
2750 goto release;
2751 }
2752
2753 /*
2754 * Use big 4 KB clusters when the outgoing interface does not prefer
2755 * 2 KB clusters
2756 */
2757 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2758
2759 if (soreserveheadroom != 0) {
2760 headroom = so->so_pktheadroom;
2761 }
2762
2763 do {
2764 int i;
2765 int num_needed = 0;
2766 int chainlength;
2767 size_t maxpktlen = 0;
2768 int bytes_to_alloc;
2769
2770 if (sosendminchain > 0) {
2771 chainlength = 0;
2772 } else {
2773 chainlength = sosendmaxchain;
2774 }
2775
2776 socket_unlock(so, 0);
2777
2778 /*
2779 * Find a set of uios that fits in a reasonable number
2780 * of mbuf packets
2781 */
2782 for (i = uiofirst; i < uiocnt; i++) {
2783 struct uio *auio = uioarray[i];
2784
2785 len = uio_resid(auio);
2786
2787 /* Do nothing for empty messages */
2788 if (len == 0) {
2789 continue;
2790 }
2791
2792 num_needed += 1;
2793 uiolast += 1;
2794
2795 if (len > maxpktlen) {
2796 maxpktlen = len;
2797 }
2798
2799 chainlength += len;
2800 if (chainlength > sosendmaxchain) {
2801 break;
2802 }
2803 }
2804 /*
2805 * Nothing left to send
2806 */
2807 if (num_needed == 0) {
2808 socket_lock(so, 0);
2809 break;
2810 }
2811 /*
2812 * Allocate a buffer large enough to include headroom space for
2813 * the network and link headers
2814 *
2815 */
2816 bytes_to_alloc = maxpktlen + headroom;
2817
2818 /*
2819 * Allocate a single contiguous buffer of the smallest available
2820 * size when possible
2821 */
2822 if (bytes_to_alloc > MCLBYTES &&
2823 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2824 freelist = m_getpackets_internal(
2825 (unsigned int *)&num_needed,
2826 num_needed, M_WAIT, 1,
2827 MBIGCLBYTES);
2828 } else if (bytes_to_alloc > _MHLEN &&
2829 bytes_to_alloc <= MCLBYTES) {
2830 freelist = m_getpackets_internal(
2831 (unsigned int *)&num_needed,
2832 num_needed, M_WAIT, 1,
2833 MCLBYTES);
2834 } else {
2835 freelist = m_allocpacket_internal(
2836 (unsigned int *)&num_needed,
2837 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2838 }
2839
2840 if (freelist == NULL) {
2841 socket_lock(so, 0);
2842 error = ENOMEM;
2843 goto release;
2844 }
2845 /*
2846 * Copy each uio of the set into its own mbuf packet
2847 */
2848 for (i = uiofirst, m = freelist;
2849 i < uiolast && m != NULL;
2850 i++) {
2851 int bytes_to_copy;
2852 struct mbuf *n;
2853 struct uio *auio = uioarray[i];
2854
2855 bytes_to_copy = uio_resid(auio);
2856
2857 /* Do nothing for empty messages */
2858 if (bytes_to_copy == 0) {
2859 continue;
2860 }
2861 /*
2862 * Leave headroom for protocol headers
2863 * in the first mbuf of the chain
2864 */
2865 m->m_data += headroom;
2866
2867 for (n = m; n != NULL; n = n->m_next) {
2868 if ((m->m_flags & M_EXT)) {
2869 mlen = m->m_ext.ext_size -
2870 M_LEADINGSPACE(m);
2871 } else if ((m->m_flags & M_PKTHDR)) {
2872 mlen =
2873 MHLEN - M_LEADINGSPACE(m);
2874 } else {
2875 mlen = MLEN - M_LEADINGSPACE(m);
2876 }
2877 len = imin(mlen, bytes_to_copy);
2878
2879 /*
2880 * Note: uiomove() decrements the iovec
2881 * length
2882 */
2883 error = uiomove(mtod(n, caddr_t),
2884 len, auio);
2885 if (error != 0) {
2886 break;
2887 }
2888 n->m_len = len;
2889 m->m_pkthdr.len += len;
2890
2891 VERIFY(m->m_pkthdr.len <= maxpktlen);
2892
2893 bytes_to_copy -= len;
2894 resid -= len;
2895 }
2896 if (m->m_pkthdr.len == 0) {
2897 printf(
2898 "%s:%d so %llx pkt %llx type %u len null\n",
2899 __func__, __LINE__,
2900 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2901 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2902 m->m_type);
2903 }
2904 if (error != 0) {
2905 break;
2906 }
2907 m = m->m_nextpkt;
2908 }
2909
2910 socket_lock(so, 0);
2911
2912 if (error) {
2913 goto release;
2914 }
2915 top = freelist;
2916 freelist = NULL;
2917
2918 if (dontroute) {
2919 so->so_options |= SO_DONTROUTE;
2920 }
2921
2922 if ((flags & MSG_SKIPCFIL) == 0) {
2923 struct mbuf **prevnextp = NULL;
2924
2925 for (i = uiofirst, m = top;
2926 i < uiolast && m != NULL;
2927 i++) {
2928 struct mbuf *nextpkt = m->m_nextpkt;
2929
2930 /*
2931 * Socket filter processing
2932 */
2933 error = sflt_data_out(so, NULL, &m,
2934 NULL, 0);
2935 if (error != 0 && error != EJUSTRETURN) {
2936 goto release;
2937 }
2938
2939 #if CONTENT_FILTER
2940 if (error == 0) {
2941 /*
2942 * Content filter processing
2943 */
2944 error = cfil_sock_data_out(so, NULL, m,
2945 NULL, 0);
2946 if (error != 0 && error != EJUSTRETURN) {
2947 goto release;
2948 }
2949 }
2950 #endif /* CONTENT_FILTER */
2951 /*
2952 * Remove packet from the list when
2953 * swallowed by a filter
2954 */
2955 if (error == EJUSTRETURN) {
2956 error = 0;
2957 if (prevnextp != NULL) {
2958 *prevnextp = nextpkt;
2959 } else {
2960 top = nextpkt;
2961 }
2962 }
2963
2964 m = nextpkt;
2965 if (m != NULL) {
2966 prevnextp = &m->m_nextpkt;
2967 }
2968 }
2969 }
2970 if (top != NULL) {
2971 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2972 (so, 0, top, NULL, NULL, p);
2973 }
2974
2975 if (dontroute) {
2976 so->so_options &= ~SO_DONTROUTE;
2977 }
2978
2979 top = NULL;
2980 uiofirst = uiolast;
2981 } while (resid > 0 && error == 0);
2982 release:
2983 if (sblocked) {
2984 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2985 } else {
2986 socket_unlock(so, 1);
2987 }
2988 out:
2989 if (top != NULL) {
2990 m_freem(top);
2991 }
2992 if (freelist != NULL) {
2993 m_freem_list(freelist);
2994 }
2995
2996 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2997 so->so_snd.sb_cc, 0, error);
2998
2999 return error;
3000 }
3001
3002 /*
3003 * May return ERESTART when packet is dropped by MAC policy check
3004 */
3005 static int
3006 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
3007 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
3008 {
3009 int error = 0;
3010 struct mbuf *m = *mp;
3011 struct mbuf *nextrecord = *nextrecordp;
3012
3013 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
3014 #if CONFIG_MACF_SOCKET_SUBSET
3015 /*
3016 * Call the MAC framework for policy checking if we're in
3017 * the user process context and the socket isn't connected.
3018 */
3019 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
3020 struct mbuf *m0 = m;
3021 /*
3022 * Dequeue this record (temporarily) from the receive
3023 * list since we're about to drop the socket's lock
3024 * where a new record may arrive and be appended to
3025 * the list. Upon MAC policy failure, the record
3026 * will be freed. Otherwise, we'll add it back to
3027 * the head of the list. We cannot rely on SB_LOCK
3028 * because append operation uses the socket's lock.
3029 */
3030 do {
3031 m->m_nextpkt = NULL;
3032 sbfree(&so->so_rcv, m);
3033 m = m->m_next;
3034 } while (m != NULL);
3035 m = m0;
3036 so->so_rcv.sb_mb = nextrecord;
3037 SB_EMPTY_FIXUP(&so->so_rcv);
3038 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3039 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3040 socket_unlock(so, 0);
3041
3042 if (mac_socket_check_received(proc_ucred(p), so,
3043 mtod(m, struct sockaddr *)) != 0) {
3044 /*
3045 * MAC policy failure; free this record and
3046 * process the next record (or block until
3047 * one is available). We have adjusted sb_cc
3048 * and sb_mbcnt above so there is no need to
3049 * call sbfree() again.
3050 */
3051 m_freem(m);
3052 /*
3053 * Clear SB_LOCK but don't unlock the socket.
3054 * Process the next record or wait for one.
3055 */
3056 socket_lock(so, 0);
3057 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3058 error = ERESTART;
3059 goto done;
3060 }
3061 socket_lock(so, 0);
3062 /*
3063 * If the socket has been defunct'd, drop it.
3064 */
3065 if (so->so_flags & SOF_DEFUNCT) {
3066 m_freem(m);
3067 error = ENOTCONN;
3068 goto done;
3069 }
3070 /*
3071 * Re-adjust the socket receive list and re-enqueue
3072 * the record in front of any packets which may have
3073 * been appended while we dropped the lock.
3074 */
3075 for (m = m0; m->m_next != NULL; m = m->m_next) {
3076 sballoc(&so->so_rcv, m);
3077 }
3078 sballoc(&so->so_rcv, m);
3079 if (so->so_rcv.sb_mb == NULL) {
3080 so->so_rcv.sb_lastrecord = m0;
3081 so->so_rcv.sb_mbtail = m;
3082 }
3083 m = m0;
3084 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3085 so->so_rcv.sb_mb = m;
3086 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3087 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3088 }
3089 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3090 if (psa != NULL) {
3091 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3092 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3093 error = EWOULDBLOCK;
3094 goto done;
3095 }
3096 }
3097 if (flags & MSG_PEEK) {
3098 m = m->m_next;
3099 } else {
3100 sbfree(&so->so_rcv, m);
3101 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3102 panic("%s: about to create invalid socketbuf",
3103 __func__);
3104 /* NOTREACHED */
3105 }
3106 MFREE(m, so->so_rcv.sb_mb);
3107 m = so->so_rcv.sb_mb;
3108 if (m != NULL) {
3109 m->m_nextpkt = nextrecord;
3110 } else {
3111 so->so_rcv.sb_mb = nextrecord;
3112 SB_EMPTY_FIXUP(&so->so_rcv);
3113 }
3114 }
3115 done:
3116 *mp = m;
3117 *nextrecordp = nextrecord;
3118
3119 return error;
3120 }
3121
3122 /*
3123 * Process one or more MT_CONTROL mbufs present before any data mbufs
3124 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3125 * just copy the data; if !MSG_PEEK, we call into the protocol to
3126 * perform externalization.
3127 */
3128 static int
3129 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3130 struct mbuf **mp, struct mbuf **nextrecordp)
3131 {
3132 int error = 0;
3133 struct mbuf *cm = NULL, *cmn;
3134 struct mbuf **cme = &cm;
3135 struct sockbuf *sb_rcv = &so->so_rcv;
3136 struct mbuf **msgpcm = NULL;
3137 struct mbuf *m = *mp;
3138 struct mbuf *nextrecord = *nextrecordp;
3139 struct protosw *pr = so->so_proto;
3140
3141 /*
3142 * Externalizing the control messages would require us to
3143 * drop the socket's lock below. Once we re-acquire the
3144 * lock, the mbuf chain might change. In order to preserve
3145 * consistency, we unlink all control messages from the
3146 * first mbuf chain in one shot and link them separately
3147 * onto a different chain.
3148 */
3149 do {
3150 if (flags & MSG_PEEK) {
3151 if (controlp != NULL) {
3152 if (*controlp == NULL) {
3153 msgpcm = controlp;
3154 }
3155 *controlp = m_copy(m, 0, m->m_len);
3156
3157 /*
3158 * If we failed to allocate an mbuf,
3159 * release any previously allocated
3160 * mbufs for control data. Return
3161 * an error. Keep the mbufs in the
3162 * socket as this is using
3163 * MSG_PEEK flag.
3164 */
3165 if (*controlp == NULL) {
3166 m_freem(*msgpcm);
3167 error = ENOBUFS;
3168 goto done;
3169 }
3170 controlp = &(*controlp)->m_next;
3171 }
3172 m = m->m_next;
3173 } else {
3174 m->m_nextpkt = NULL;
3175 sbfree(sb_rcv, m);
3176 sb_rcv->sb_mb = m->m_next;
3177 m->m_next = NULL;
3178 *cme = m;
3179 cme = &(*cme)->m_next;
3180 m = sb_rcv->sb_mb;
3181 }
3182 } while (m != NULL && m->m_type == MT_CONTROL);
3183
3184 if (!(flags & MSG_PEEK)) {
3185 if (sb_rcv->sb_mb != NULL) {
3186 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3187 } else {
3188 sb_rcv->sb_mb = nextrecord;
3189 SB_EMPTY_FIXUP(sb_rcv);
3190 }
3191 if (nextrecord == NULL) {
3192 sb_rcv->sb_lastrecord = m;
3193 }
3194 }
3195
3196 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3197 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3198
3199 while (cm != NULL) {
3200 int cmsg_type;
3201
3202 cmn = cm->m_next;
3203 cm->m_next = NULL;
3204 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3205
3206 /*
3207 * Call the protocol to externalize SCM_RIGHTS message
3208 * and return the modified message to the caller upon
3209 * success. Otherwise, all other control messages are
3210 * returned unmodified to the caller. Note that we
3211 * only get into this loop if MSG_PEEK is not set.
3212 */
3213 if (pr->pr_domain->dom_externalize != NULL &&
3214 cmsg_type == SCM_RIGHTS) {
3215 /*
3216 * Release socket lock: see 3903171. This
3217 * would also allow more records to be appended
3218 * to the socket buffer. We still have SB_LOCK
3219 * set on it, so we can be sure that the head
3220 * of the mbuf chain won't change.
3221 */
3222 socket_unlock(so, 0);
3223 error = (*pr->pr_domain->dom_externalize)(cm);
3224 socket_lock(so, 0);
3225 } else {
3226 error = 0;
3227 }
3228
3229 if (controlp != NULL && error == 0) {
3230 *controlp = cm;
3231 controlp = &(*controlp)->m_next;
3232 } else {
3233 (void) m_free(cm);
3234 }
3235 cm = cmn;
3236 }
3237 /*
3238 * Update the value of nextrecord in case we received new
3239 * records when the socket was unlocked above for
3240 * externalizing SCM_RIGHTS.
3241 */
3242 if (m != NULL) {
3243 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3244 } else {
3245 nextrecord = sb_rcv->sb_mb;
3246 }
3247
3248 done:
3249 *mp = m;
3250 *nextrecordp = nextrecord;
3251
3252 return error;
3253 }
3254
3255 /*
3256 * Implement receive operations on a socket.
3257 * We depend on the way that records are added to the sockbuf
3258 * by sbappend*. In particular, each record (mbufs linked through m_next)
3259 * must begin with an address if the protocol so specifies,
3260 * followed by an optional mbuf or mbufs containing ancillary data,
3261 * and then zero or more mbufs of data.
3262 * In order to avoid blocking network interrupts for the entire time here,
3263 * we splx() while doing the actual copy to user space.
3264 * Although the sockbuf is locked, new data may still be appended,
3265 * and thus we must maintain consistency of the sockbuf during that time.
3266 *
3267 * The caller may receive the data as a single mbuf chain by supplying
3268 * an mbuf **mp0 for use in returning the chain. The uio is then used
3269 * only for the count in uio_resid.
3270 *
3271 * Returns: 0 Success
3272 * ENOBUFS
3273 * ENOTCONN
3274 * EWOULDBLOCK
3275 * uiomove:EFAULT
3276 * sblock:EWOULDBLOCK
3277 * sblock:EINTR
3278 * sbwait:EBADF
3279 * sbwait:EINTR
3280 * sodelayed_copy:EFAULT
3281 * <pru_rcvoob>:EINVAL[TCP]
3282 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3283 * <pru_rcvoob>:???
3284 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3285 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3286 * <pr_domain->dom_externalize>:???
3287 *
3288 * Notes: Additional return values from calls through <pru_rcvoob> and
3289 * <pr_domain->dom_externalize> depend on protocols other than
3290 * TCP or AF_UNIX, which are documented above.
3291 */
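/*
 * Record layout sketch (one record; mbufs linked via m_next, records
 * chained via m_nextpkt):
 *
 *	[MT_SONAME] -> [MT_CONTROL ...] -> [data mbuf] -> ... -> NULL
 *	      |
 *	  m_nextpkt --> next record
 */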
3292 int
3293 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3294 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3295 {
3296 struct mbuf *m, **mp, *ml = NULL;
3297 struct mbuf *nextrecord, *free_list;
3298 int flags, error, offset;
3299 user_ssize_t len;
3300 struct protosw *pr = so->so_proto;
3301 int moff, type = 0;
3302 user_ssize_t orig_resid = uio_resid(uio);
3303 user_ssize_t delayed_copy_len;
3304 int can_delay;
3305 int need_event;
3306 struct proc *p = current_proc();
3307 boolean_t en_tracing = FALSE;
3308
3309 /*
3310 * Sanity check on the length passed by caller as we are making 'int'
3311 * comparisons
3312 */
3313 if (orig_resid < 0 || orig_resid > INT_MAX) {
3314 return EINVAL;
3315 }
3316
3317 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3318 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3319 so->so_rcv.sb_hiwat);
3320
3321 socket_lock(so, 1);
3322 so_update_last_owner_locked(so, p);
3323 so_update_policy(so);
3324
3325 #ifdef MORE_LOCKING_DEBUG
3326 if (so->so_usecount == 1) {
3327 panic("%s: so=%x no other reference on socket\n", __func__, so);
3328 /* NOTREACHED */
3329 }
3330 #endif
3331 mp = mp0;
3332 if (psa != NULL) {
3333 *psa = NULL;
3334 }
3335 if (controlp != NULL) {
3336 *controlp = NULL;
3337 }
3338 if (flagsp != NULL) {
3339 flags = *flagsp & ~MSG_EOR;
3340 } else {
3341 flags = 0;
3342 }
3343
3344 /*
3345 * If a recv attempt is made on a previously-accepted socket
3346 * that has been marked as inactive (disconnected), reject
3347 * the request.
3348 */
3349 if (so->so_flags & SOF_DEFUNCT) {
3350 struct sockbuf *sb = &so->so_rcv;
3351
3352 error = ENOTCONN;
3353 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3354 __func__, proc_pid(p), proc_best_name(p),
3355 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3356 SOCK_DOM(so), SOCK_TYPE(so), error);
3357 /*
3358 * This socket should have been disconnected and flushed
3359 * prior to being returned from sodefunct(); there should
3360 * be no data on its receive list, so panic otherwise.
3361 */
3362 if (so->so_state & SS_DEFUNCT) {
3363 sb_empty_assert(sb, __func__);
3364 }
3365 socket_unlock(so, 1);
3366 return error;
3367 }
3368
3369 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3370 pr->pr_usrreqs->pru_preconnect) {
3371 /*
3372 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3373 * call write() right after this. *If* the app calls a read,
3374 * we do not want to block this read indefinitely. Thus,
3375 * we trigger a connect so that the session gets initiated.
3376 */
3377 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3378
3379 if (error) {
3380 socket_unlock(so, 1);
3381 return error;
3382 }
3383 }
3384
3385 if (ENTR_SHOULDTRACE &&
3386 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3387 /*
3388 * enable energy tracing for inet sockets that go over
3389 * non-loopback interfaces only.
3390 */
3391 struct inpcb *inp = sotoinpcb(so);
3392 if (inp->inp_last_outifp != NULL &&
3393 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3394 en_tracing = TRUE;
3395 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3396 VM_KERNEL_ADDRPERM(so),
3397 ((so->so_state & SS_NBIO) ?
3398 kEnTrFlagNonBlocking : 0),
3399 (int64_t)orig_resid);
3400 }
3401 }
3402
3403 /*
3404 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3405 * regardless of the flags argument. Here is the case where
3406 * out-of-band data is not inline.
3407 */
3408 if ((flags & MSG_OOB) ||
3409 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3410 (so->so_options & SO_OOBINLINE) == 0 &&
3411 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3412 m = m_get(M_WAIT, MT_DATA);
3413 if (m == NULL) {
3414 socket_unlock(so, 1);
3415 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3416 ENOBUFS, 0, 0, 0, 0);
3417 return ENOBUFS;
3418 }
3419 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3420 if (error) {
3421 goto bad;
3422 }
3423 socket_unlock(so, 0);
3424 do {
3425 error = uiomove(mtod(m, caddr_t),
3426 imin(uio_resid(uio), m->m_len), uio);
3427 m = m_free(m);
3428 } while (uio_resid(uio) && error == 0 && m != NULL);
3429 socket_lock(so, 0);
3430 bad:
3431 if (m != NULL) {
3432 m_freem(m);
3433 }
3434
3435 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3436 if (error == EWOULDBLOCK || error == EINVAL) {
3437 /*
3438 * Let's try to get normal data:
3439 * EWOULDBLOCK: out-of-band data not
3440 * received yet. EINVAL: out-of-band data
3441 * already read.
3442 */
3443 error = 0;
3444 goto nooob;
3445 } else if (error == 0 && flagsp != NULL) {
3446 *flagsp |= MSG_OOB;
3447 }
3448 }
3449 socket_unlock(so, 1);
3450 if (en_tracing) {
3451 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3452 VM_KERNEL_ADDRPERM(so), 0,
3453 (int64_t)(orig_resid - uio_resid(uio)));
3454 }
3455 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3456 0, 0, 0, 0);
3457
3458 return error;
3459 }
3460 nooob:
3461 if (mp != NULL) {
3462 *mp = NULL;
3463 }
3464
3465 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3466 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3467 }
3468
3469 free_list = NULL;
3470 delayed_copy_len = 0;
3471 restart:
3472 #ifdef MORE_LOCKING_DEBUG
3473 if (so->so_usecount <= 1) {
3474 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3475 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3476 }
3477 #endif
3478 /*
3479 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3480 * and if so just return to the caller. This could happen when
3481 * soreceive() is called by a socket upcall function during the
3482 * time the socket is freed. The socket buffer would have been
3483 * locked across the upcall, therefore we cannot put this thread
3484 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3485 * we may livelock), because the lock on the socket buffer will
3486 * only be released when the upcall routine returns to its caller.
3487 * Because the socket has been officially closed, there can be
3488 * no further read on it.
3489 *
3490 * A multipath subflow socket would have its SS_NOFDREF set by
3491 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3492 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3493 */
3494 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3495 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3496 socket_unlock(so, 1);
3497 return 0;
3498 }
3499
3500 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3501 if (error) {
3502 socket_unlock(so, 1);
3503 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3504 0, 0, 0, 0);
3505 if (en_tracing) {
3506 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3507 VM_KERNEL_ADDRPERM(so), 0,
3508 (int64_t)(orig_resid - uio_resid(uio)));
3509 }
3510 return error;
3511 }
3512
3513 m = so->so_rcv.sb_mb;
3514 /*
3515 * If we have less data than requested, block awaiting more
3516 * (subject to any timeout) if:
3517 * 1. the current count is less than the low water mark, or
3518 * 2. MSG_WAITALL is set, and it is possible to do the entire
3519 * receive operation at once if we block (resid <= hiwat).
3520 * 3. MSG_DONTWAIT is not set
3521 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3522 * we have to do the receive in sections, and thus risk returning
3523 * a short count if a timeout or signal occurs after we start.
3524 */
3525 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3526 so->so_rcv.sb_cc < uio_resid(uio)) &&
3527 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3528 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3529 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3530 /*
3531 * Panic if we notice inconsistencies in the socket's
3532 * receive list; both sb_mb and sb_cc should correctly
3533 * reflect the contents of the list, otherwise we may
3534 * end up with false positives during select() or poll()
3535 * which could put the application in a bad state.
3536 */
3537 SB_MB_CHECK(&so->so_rcv);
3538
3539 if (so->so_error) {
3540 if (m != NULL) {
3541 goto dontblock;
3542 }
3543 error = so->so_error;
3544 if ((flags & MSG_PEEK) == 0) {
3545 so->so_error = 0;
3546 }
3547 goto release;
3548 }
3549 if (so->so_state & SS_CANTRCVMORE) {
3550 #if CONTENT_FILTER
3551 /*
3552 * Deal with half-closed connections
3553 */
3554 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3555 cfil_sock_data_pending(&so->so_rcv) != 0) {
3556 CFIL_LOG(LOG_INFO,
3557 "so %llx ignore SS_CANTRCVMORE",
3558 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3559 } else
3560 #endif /* CONTENT_FILTER */
3561 if (m != NULL) {
3562 goto dontblock;
3563 } else {
3564 goto release;
3565 }
3566 }
3567 for (; m != NULL; m = m->m_next) {
3568 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3569 m = so->so_rcv.sb_mb;
3570 goto dontblock;
3571 }
3572 }
3573 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3574 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3575 error = ENOTCONN;
3576 goto release;
3577 }
3578 if (uio_resid(uio) == 0) {
3579 goto release;
3580 }
3581
3582 if ((so->so_state & SS_NBIO) ||
3583 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3584 error = EWOULDBLOCK;
3585 goto release;
3586 }
3587 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3588 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3589 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3590 #if EVEN_MORE_LOCKING_DEBUG
3591 if (socket_debug) {
3592 printf("Waiting for socket data\n");
3593 }
3594 #endif
3595
3596 error = sbwait(&so->so_rcv);
3597 #if EVEN_MORE_LOCKING_DEBUG
3598 if (socket_debug) {
3599 printf("SORECEIVE - sbwait returned %d\n", error);
3600 }
3601 #endif
3602 if (so->so_usecount < 1) {
3603 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3604 __func__, so, so->so_usecount);
3605 /* NOTREACHED */
3606 }
3607 if (error) {
3608 socket_unlock(so, 1);
3609 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3610 0, 0, 0, 0);
3611 if (en_tracing) {
3612 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3613 VM_KERNEL_ADDRPERM(so), 0,
3614 (int64_t)(orig_resid - uio_resid(uio)));
3615 }
3616 return error;
3617 }
3618 goto restart;
3619 }
3620 dontblock:
3621 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3622 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3623 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3624 nextrecord = m->m_nextpkt;
3625
3626 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3627 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3628 mp0 == NULL);
3629 if (error == ERESTART) {
3630 goto restart;
3631 } else if (error != 0) {
3632 goto release;
3633 }
3634 orig_resid = 0;
3635 }
3636
3637 /*
3638 * Process one or more MT_CONTROL mbufs present before any data mbufs
3639 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3640 * just copy the data; if !MSG_PEEK, we call into the protocol to
3641 * perform externalization.
3642 */
3643 if (m != NULL && m->m_type == MT_CONTROL) {
3644 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3645 if (error != 0) {
3646 goto release;
3647 }
3648 orig_resid = 0;
3649 }
3650
3651 /*
3652 * If the socket is a TCP socket with message delivery
3653 * enabled, then create a control msg to deliver the
3654 * relative TCP sequence number for this data. Waiting
3655 * until this point will protect against failures to
3656 * allocate an mbuf for control msgs.
3657 */
3658 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3659 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3660 struct mbuf *seq_cm;
3661
3662 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3663 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3664 if (seq_cm == NULL) {
3665 /* unable to allocate a control mbuf */
3666 error = ENOBUFS;
3667 goto release;
3668 }
3669 *controlp = seq_cm;
3670 controlp = &seq_cm->m_next;
3671 }
3672
3673 if (m != NULL) {
3674 if (!(flags & MSG_PEEK)) {
3675 /*
3676 * We get here because m points to an mbuf following
3677 * any MT_SONAME or MT_CONTROL mbufs which have been
3678 * processed above. In any case, m should be pointing
3679 * to the head of the mbuf chain, and the nextrecord
3680 * should be either NULL or equal to m->m_nextpkt.
3681 * See comments above about SB_LOCK.
3682 */
3683 if (m != so->so_rcv.sb_mb ||
3684 m->m_nextpkt != nextrecord) {
3685 panic("%s: post-control !sync so=%p m=%p "
3686 "nextrecord=%p\n", __func__, so, m,
3687 nextrecord);
3688 /* NOTREACHED */
3689 }
3690 if (nextrecord == NULL) {
3691 so->so_rcv.sb_lastrecord = m;
3692 }
3693 }
3694 type = m->m_type;
3695 if (type == MT_OOBDATA) {
3696 flags |= MSG_OOB;
3697 }
3698 } else {
3699 if (!(flags & MSG_PEEK)) {
3700 SB_EMPTY_FIXUP(&so->so_rcv);
3701 }
3702 }
3703 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3704 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3705
3706 moff = 0;
3707 offset = 0;
3708
3709 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3710 can_delay = 1;
3711 } else {
3712 can_delay = 0;
3713 }
3714
3715 need_event = 0;
3716
3717 while (m != NULL &&
3718 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3719 if (m->m_type == MT_OOBDATA) {
3720 if (type != MT_OOBDATA) {
3721 break;
3722 }
3723 } else if (type == MT_OOBDATA) {
3724 break;
3725 }
3726 /*
3727 * Make sure to always set MSG_OOB event when getting
3728 * out-of-band data inline.
3729 */
3730 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3731 (so->so_options & SO_OOBINLINE) != 0 &&
3732 (so->so_state & SS_RCVATMARK) != 0) {
3733 flags |= MSG_OOB;
3734 }
3735 so->so_state &= ~SS_RCVATMARK;
3736 len = uio_resid(uio) - delayed_copy_len;
3737 if (so->so_oobmark && len > so->so_oobmark - offset) {
3738 len = so->so_oobmark - offset;
3739 }
3740 if (len > m->m_len - moff) {
3741 len = m->m_len - moff;
3742 }
3743 /*
3744 * If mp is set, just pass back the mbufs.
3745 * Otherwise copy them out via the uio, then free.
3746 * Sockbuf must be consistent here (points to current mbuf,
3747 * it points to next record) when we drop priority;
3748 * we must note any additions to the sockbuf when we
3749 * block interrupts again.
3750 */
3751 if (mp == NULL) {
3752 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3753 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3754 if (can_delay && len == m->m_len) {
3755 /*
3756 * only delay the copy if we're consuming the
3757 * mbuf and we're NOT in MSG_PEEK mode
3758 * and we have enough data to make it worthwhile
3759 * to drop and retake the lock... can_delay
3760 * reflects the state of the 2 latter
3761 * constraints; moff should always be zero
3762 * in these cases
3763 */
3764 delayed_copy_len += len;
3765 } else {
3766 if (delayed_copy_len) {
3767 error = sodelayed_copy(so, uio,
3768 &free_list, &delayed_copy_len);
3769
3770 if (error) {
3771 goto release;
3772 }
3773 /*
3774 * can only get here if MSG_PEEK is not
3775 * set; therefore, m should point at the
3776 * head of the rcv queue; if it doesn't,
3777 * it means something drastically
3778 * changed while we were out from behind
3779 * the lock in sodelayed_copy. perhaps
3780 * a RST on the stream. in any event,
3781 * the stream has been interrupted. it's
3782 * probably best just to return whatever
3783 * data we've moved and let the caller
3784 * sort it out...
3785 */
3786 if (m != so->so_rcv.sb_mb) {
3787 break;
3788 }
3789 }
3790 socket_unlock(so, 0);
3791 error = uiomove(mtod(m, caddr_t) + moff,
3792 (int)len, uio);
3793 socket_lock(so, 0);
3794
3795 if (error) {
3796 goto release;
3797 }
3798 }
3799 } else {
3800 uio_setresid(uio, (uio_resid(uio) - len));
3801 }
3802 if (len == m->m_len - moff) {
3803 if (m->m_flags & M_EOR) {
3804 flags |= MSG_EOR;
3805 }
3806 if (flags & MSG_PEEK) {
3807 m = m->m_next;
3808 moff = 0;
3809 } else {
3810 nextrecord = m->m_nextpkt;
3811 sbfree(&so->so_rcv, m);
3812 m->m_nextpkt = NULL;
3813
3814 /*
3815 * If this packet is an unordered packet
3816 * (indicated by M_UNORDERED_DATA flag), remove
3817 * the additional bytes added to the
3818 * receive socket buffer size.
3819 */
3820 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3821 m->m_len &&
3822 (m->m_flags & M_UNORDERED_DATA) &&
3823 sbreserve(&so->so_rcv,
3824 so->so_rcv.sb_hiwat - m->m_len)) {
3825 if (so->so_msg_state->msg_uno_bytes >
3826 m->m_len) {
3827 so->so_msg_state->
3828 msg_uno_bytes -= m->m_len;
3829 } else {
3830 so->so_msg_state->
3831 msg_uno_bytes = 0;
3832 }
3833 m->m_flags &= ~M_UNORDERED_DATA;
3834 }
3835
3836 if (mp != NULL) {
3837 *mp = m;
3838 mp = &m->m_next;
3839 so->so_rcv.sb_mb = m = m->m_next;
3840 *mp = NULL;
3841 } else {
3842 if (free_list == NULL) {
3843 free_list = m;
3844 } else {
3845 ml->m_next = m;
3846 }
3847 ml = m;
3848 so->so_rcv.sb_mb = m = m->m_next;
3849 ml->m_next = NULL;
3850 }
3851 if (m != NULL) {
3852 m->m_nextpkt = nextrecord;
3853 if (nextrecord == NULL) {
3854 so->so_rcv.sb_lastrecord = m;
3855 }
3856 } else {
3857 so->so_rcv.sb_mb = nextrecord;
3858 SB_EMPTY_FIXUP(&so->so_rcv);
3859 }
3860 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3861 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3862 }
3863 } else {
3864 if (flags & MSG_PEEK) {
3865 moff += len;
3866 } else {
3867 if (mp != NULL) {
3868 int copy_flag;
3869
3870 if (flags & MSG_DONTWAIT) {
3871 copy_flag = M_DONTWAIT;
3872 } else {
3873 copy_flag = M_WAIT;
3874 }
3875 *mp = m_copym(m, 0, len, copy_flag);
3876 /*
3877 * Failed to allocate an mbuf?
3878 * Adjust uio_resid back, it was
3879 * adjusted down by len bytes which
3880 * we didn't copy over.
3881 */
3882 if (*mp == NULL) {
3883 uio_setresid(uio,
3884 (uio_resid(uio) + len));
3885 break;
3886 }
3887 }
3888 m->m_data += len;
3889 m->m_len -= len;
3890 so->so_rcv.sb_cc -= len;
3891 }
3892 }
3893 if (so->so_oobmark) {
3894 if ((flags & MSG_PEEK) == 0) {
3895 so->so_oobmark -= len;
3896 if (so->so_oobmark == 0) {
3897 so->so_state |= SS_RCVATMARK;
3898 /*
3899 * delay posting the actual event until
3900 * after any delayed copy processing
3901 * has finished
3902 */
3903 need_event = 1;
3904 break;
3905 }
3906 } else {
3907 offset += len;
3908 if (offset == so->so_oobmark) {
3909 break;
3910 }
3911 }
3912 }
3913 if (flags & MSG_EOR) {
3914 break;
3915 }
3916 /*
3917 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3918 * (for non-atomic socket), we must not quit until
3919 * "uio->uio_resid == 0" or an error termination.
3920 * If a signal/timeout occurs, return with a short
3921 * count but without error. Keep sockbuf locked
3922 * against other readers.
3923 */
3924 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3925 (uio_resid(uio) - delayed_copy_len) > 0 &&
3926 !sosendallatonce(so) && !nextrecord) {
3927 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3928 #if CONTENT_FILTER
3929 && cfil_sock_data_pending(&so->so_rcv) == 0
3930 #endif /* CONTENT_FILTER */
3931 )) {
3932 goto release;
3933 }
3934
3935 /*
3936 * Depending on the protocol (e.g. TCP), the following
3937 * might cause the socket lock to be dropped and later
3938 * be reacquired, and more data could have arrived and
3939 * have been appended to the receive socket buffer by
3940 * the time it returns. Therefore, we sleep in
3941 * sbwait() below only if the socket buffer is
3942 * empty, in order to avoid a false sleep.
3943 */
3944 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3945 (((struct inpcb *)so->so_pcb)->inp_state !=
3946 INPCB_STATE_DEAD)) {
3947 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3948 }
3949
3950 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3951 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3952
3953 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3954 error = 0;
3955 goto release;
3956 }
3957 /*
3958 * We have to wait until after we get back from the sbwait
3959 * to do the copy, because the copy drops the lock when
3960 * enough data has been delayed. Dropping the lock opens
3961 * up a window that allows the netisr thread to process
3962 * the incoming packets and to change the state of this
3963 * socket. We issue the sbwait because the socket is
3964 * empty and we expect the netisr thread to wake us up
3965 * when more packets arrive; if we allowed that processing
3966 * to happen first and then called sbwait, we could stall
3967 * forever with packets sitting in the socket if no
3968 * further packets arrive from the remote side.
3969 *
3970 * We want to copy before we've collected all the data
3971 * needed to satisfy this request, so that the copy can
3972 * overlap the incoming packet processing on an MP
3973 * system.
3974 */
3975 if (delayed_copy_len > sorecvmincopy &&
3976 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3977 error = sodelayed_copy(so, uio,
3978 &free_list, &delayed_copy_len);
3979
3980 if (error) {
3981 goto release;
3982 }
3983 }
3984 m = so->so_rcv.sb_mb;
3985 if (m != NULL) {
3986 nextrecord = m->m_nextpkt;
3987 }
3988 SB_MB_CHECK(&so->so_rcv);
3989 }
3990 }
3991 #ifdef MORE_LOCKING_DEBUG
3992 if (so->so_usecount <= 1) {
3993 panic("%s: after big while so=%p ref=%d on socket\n",
3994 __func__, so, so->so_usecount);
3995 /* NOTREACHED */
3996 }
3997 #endif
3998
3999 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
4000 if (so->so_options & SO_DONTTRUNC) {
4001 flags |= MSG_RCVMORE;
4002 } else {
4003 flags |= MSG_TRUNC;
4004 if ((flags & MSG_PEEK) == 0) {
4005 (void) sbdroprecord(&so->so_rcv);
4006 }
4007 }
4008 }
4009
4010 /*
4011 * pru_rcvd below (for TCP) may cause more data to be received
4012 * if the socket lock is dropped prior to sending the ACK; some
4013 * legacy OpenTransport applications don't handle this well
4014 * (if it receives less data than requested while MSG_HAVEMORE
4015 * is set), and so we set the flag now based on what we know
4016 * prior to calling pru_rcvd.
4017 */
4018 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4019 flags |= MSG_HAVEMORE;
4020 }
4021
4022 if ((flags & MSG_PEEK) == 0) {
4023 if (m == NULL) {
4024 so->so_rcv.sb_mb = nextrecord;
4025 /*
4026 * First part is an inline SB_EMPTY_FIXUP(). Second
4027 * part makes sure sb_lastrecord is up-to-date if
4028 * there is still data in the socket buffer.
4029 */
4030 if (so->so_rcv.sb_mb == NULL) {
4031 so->so_rcv.sb_mbtail = NULL;
4032 so->so_rcv.sb_lastrecord = NULL;
4033 } else if (nextrecord->m_nextpkt == NULL) {
4034 so->so_rcv.sb_lastrecord = nextrecord;
4035 }
4036 SB_MB_CHECK(&so->so_rcv);
4037 }
4038 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4039 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4040 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4041 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4042 }
4043 }
4044
4045 if (delayed_copy_len) {
4046 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4047 if (error) {
4048 goto release;
4049 }
4050 }
4051 if (free_list != NULL) {
4052 m_freem_list(free_list);
4053 free_list = NULL;
4054 }
4055 if (need_event) {
4056 postevent(so, 0, EV_OOB);
4057 }
4058
4059 if (orig_resid == uio_resid(uio) && orig_resid &&
4060 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4061 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4062 goto restart;
4063 }
4064
4065 if (flagsp != NULL) {
4066 *flagsp |= flags;
4067 }
4068 release:
4069 #ifdef MORE_LOCKING_DEBUG
4070 if (so->so_usecount <= 1) {
4071 panic("%s: release so=%p ref=%d on socket\n", __func__,
4072 so, so->so_usecount);
4073 /* NOTREACHED */
4074 }
4075 #endif
4076 if (delayed_copy_len) {
4077 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4078 }
4079
4080 if (free_list != NULL) {
4081 m_freem_list(free_list);
4082 }
4083
4084 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4085
4086 if (en_tracing) {
4087 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4088 VM_KERNEL_ADDRPERM(so),
4089 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4090 (int64_t)(orig_resid - uio_resid(uio)));
4091 }
4092 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4093 so->so_rcv.sb_cc, 0, error);
4094
4095 return error;
4096 }
4097
4098 /*
4099 * Returns: 0 Success
4100 * uiomove:EFAULT
4101 */
4102 static int
4103 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4104 user_ssize_t *resid)
4105 {
4106 int error = 0;
4107 struct mbuf *m;
4108
4109 m = *free_list;
4110
4111 socket_unlock(so, 0);
4112
4113 while (m != NULL && error == 0) {
4114 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4115 m = m->m_next;
4116 }
4117 m_freem_list(*free_list);
4118
4119 *free_list = NULL;
4120 *resid = 0;
4121
4122 socket_lock(so, 0);
4123
4124 return error;
4125 }
4126
4127 static int
4128 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4129 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4130 {
4131 #pragma unused(so)
4132 int error = 0;
4133 struct mbuf *ml, *m;
4134 int i = 0;
4135 struct uio *auio;
4136
4137 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4138 ml = ml->m_nextpkt, i++) {
4139 auio = msgarray[i].uio;
4140 for (m = ml; m != NULL; m = m->m_next) {
4141 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4142 if (error != 0) {
4143 goto out;
4144 }
4145 }
4146 }
4147 out:
4148 m_freem_list(*free_list);
4149
4150 *free_list = NULL;
4151 *resid = 0;
4152
4153 return error;
4154 }
4155
4156 int
4157 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4158 int *flagsp)
4159 {
4160 struct mbuf *m;
4161 struct mbuf *nextrecord;
4162 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4163 int error;
4164 user_ssize_t len, pktlen, delayed_copy_len = 0;
4165 struct protosw *pr = so->so_proto;
4166 user_ssize_t resid;
4167 struct proc *p = current_proc();
4168 struct uio *auio = NULL;
4169 int npkts = 0;
4170 int sblocked = 0;
4171 struct sockaddr **psa = NULL;
4172 struct mbuf **controlp = NULL;
4173 int can_delay;
4174 int flags;
4175 struct mbuf *free_others = NULL;
4176
4177 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4178 so, uiocnt,
4179 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4180
4181 /*
4182 * Sanity checks:
4183 * - Only supports non-blocking ("don't wait") flags
4184 * - Only supports datagram sockets (could be extended to raw)
4185 * - Must be atomic
4186 * - Protocol must support packet chains
4187 * - The uio array must not be NULL (should we panic instead?)
4188 */
4189 if (flagsp != NULL) {
4190 flags = *flagsp;
4191 } else {
4192 flags = 0;
4193 }
4194 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4195 MSG_NBIO)) {
4196 printf("%s invalid flags 0x%x\n", __func__, flags);
4197 error = EINVAL;
4198 goto out;
4199 }
4200 if (so->so_type != SOCK_DGRAM) {
4201 error = EINVAL;
4202 goto out;
4203 }
4204 if (sosendallatonce(so) == 0) {
4205 error = EINVAL;
4206 goto out;
4207 }
4208 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4209 error = EPROTONOSUPPORT;
4210 goto out;
4211 }
4212 if (msgarray == NULL) {
4213 printf("%s msgarray is NULL\n", __func__);
4214 error = EINVAL;
4215 goto out;
4216 }
4217 if (uiocnt == 0) {
4218 printf("%s uiocnt is 0\n", __func__);
4219 error = EINVAL;
4220 goto out;
4221 }
4222 /*
4223 * Sanity check on the length passed by caller as we are making 'int'
4224 * comparisons
4225 */
4226 resid = recv_msg_array_resid(msgarray, uiocnt);
4227 if (resid < 0 || resid > INT_MAX) {
4228 error = EINVAL;
4229 goto out;
4230 }
4231
4232 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4233 can_delay = 1;
4234 } else {
4235 can_delay = 0;
4236 }
4237
4238 socket_lock(so, 1);
4239 so_update_last_owner_locked(so, p);
4240 so_update_policy(so);
4241
4242 #if NECP
4243 so_update_necp_policy(so, NULL, NULL);
4244 #endif /* NECP */
4245
4246 /*
4247 * If a recv attempt is made on a previously-accepted socket
4248 * that has been marked as inactive (disconnected), reject
4249 * the request.
4250 */
4251 if (so->so_flags & SOF_DEFUNCT) {
4252 struct sockbuf *sb = &so->so_rcv;
4253
4254 error = ENOTCONN;
4255 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4256 __func__, proc_pid(p), proc_best_name(p),
4257 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4258 SOCK_DOM(so), SOCK_TYPE(so), error);
4259 /*
4260 * This socket should have been disconnected and flushed
4261 * prior to being returned from sodefunct(); there should
4262 * be no data on its receive list, so panic otherwise.
4263 */
4264 if (so->so_state & SS_DEFUNCT) {
4265 sb_empty_assert(sb, __func__);
4266 }
4267 goto release;
4268 }
4269
4270 next:
4271 /*
4272 * The uio may be empty
4273 */
4274 if (npkts >= uiocnt) {
4275 error = 0;
4276 goto release;
4277 }
4278 restart:
4279 /*
4280 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4281 * and if so just return to the caller. This could happen when
4282 * soreceive() is called by a socket upcall function during the
4283 * time the socket is freed. The socket buffer would have been
4284 * locked across the upcall, therefore we cannot put this thread
4285 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4286 * we may livelock), because the lock on the socket buffer will
4287 * only be released when the upcall routine returns to its caller.
4288 * Because the socket has been officially closed, there can be
4289 * no further read on it.
4290 */
4291 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4292 (SS_NOFDREF | SS_CANTRCVMORE)) {
4293 error = 0;
4294 goto release;
4295 }
4296
4297 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4298 if (error) {
4299 goto release;
4300 }
4301 sblocked = 1;
4302
4303 m = so->so_rcv.sb_mb;
4304 /*
4305 * Block awaiting more datagram if needed
4306 */
4307 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4308 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4309 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4310 /*
4311 * Panic if we notice inconsistencies in the socket's
4312 * receive list; both sb_mb and sb_cc should correctly
4313 * reflect the contents of the list, otherwise we may
4314 * end up with false positives during select() or poll()
4315 * which could put the application in a bad state.
4316 */
4317 SB_MB_CHECK(&so->so_rcv);
4318
4319 if (so->so_error) {
4320 error = so->so_error;
4321 if ((flags & MSG_PEEK) == 0) {
4322 so->so_error = 0;
4323 }
4324 goto release;
4325 }
4326 if (so->so_state & SS_CANTRCVMORE) {
4327 goto release;
4328 }
4329 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4330 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4331 error = ENOTCONN;
4332 goto release;
4333 }
4334 if ((so->so_state & SS_NBIO) ||
4335 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4336 error = EWOULDBLOCK;
4337 goto release;
4338 }
4339 /*
4340 * Do not block if we got some data
4341 */
4342 if (free_list != NULL) {
4343 error = 0;
4344 goto release;
4345 }
4346
4347 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4348 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4349
4350 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4351 sblocked = 0;
4352
4353 error = sbwait(&so->so_rcv);
4354 if (error) {
4355 goto release;
4356 }
4357 goto restart;
4358 }
4359
4360 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4361 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4362 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4363
4364 /*
4365 * Consume the current uio index as we have a datagram
4366 */
4367 auio = msgarray[npkts].uio;
4368 resid = uio_resid(auio);
4369 msgarray[npkts].which |= SOCK_MSG_DATA;
4370 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4371 &msgarray[npkts].psa : NULL;
4372 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4373 &msgarray[npkts].controlp : NULL;
4374 npkts += 1;
4375 nextrecord = m->m_nextpkt;
4376
4377 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4378 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4379 if (error == ERESTART) {
4380 goto restart;
4381 } else if (error != 0) {
4382 goto release;
4383 }
4384 }
4385
4386 if (m != NULL && m->m_type == MT_CONTROL) {
4387 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4388 if (error != 0) {
4389 goto release;
4390 }
4391 }
4392
4393 if (m->m_pkthdr.len == 0) {
4394 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4395 __func__, __LINE__,
4396 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4397 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4398 m->m_type);
4399 }
4400
4401 /*
4402 * Loop to copy the mbufs of the current record
4403 * Support zero length packets
4404 */
4405 ml = NULL;
4406 pktlen = 0;
4407 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4408 if (m->m_len == 0) {
4409 panic("%p m_len zero", m);
4410 }
4411 if (m->m_type == 0) {
4412 panic("%p m_type zero", m);
4413 }
4414 /*
4415 * Clip to the residual length
4416 */
4417 if (len > m->m_len) {
4418 len = m->m_len;
4419 }
4420 pktlen += len;
4421 /*
4422 * Copy the mbufs via the uio or delay the copy
4423 * The sockbuf must be consistent here (sb_mb points to the current
4424 * mbuf, m_nextpkt points to the next record) when we drop the socket
4425 * lock; we must note any additions to the sockbuf when we
4426 * reacquire it.
4427 */
4428 if (len > 0 && can_delay == 0) {
4429 socket_unlock(so, 0);
4430 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4431 socket_lock(so, 0);
4432 if (error) {
4433 goto release;
4434 }
4435 } else {
4436 delayed_copy_len += len;
4437 }
4438
4439 if (len == m->m_len) {
4440 /*
4441 * m was entirely copied
4442 */
4443 sbfree(&so->so_rcv, m);
4444 nextrecord = m->m_nextpkt;
4445 m->m_nextpkt = NULL;
4446
4447 /*
4448 * Set the first packet to the head of the free list
4449 */
4450 if (free_list == NULL) {
4451 free_list = m;
4452 }
4453 /*
4454 * Link current packet to tail of free list
4455 */
4456 if (ml == NULL) {
4457 if (free_tail != NULL) {
4458 free_tail->m_nextpkt = m;
4459 }
4460 free_tail = m;
4461 }
4462 /*
4463 * Link current mbuf to last mbuf of current packet
4464 */
4465 if (ml != NULL) {
4466 ml->m_next = m;
4467 }
4468 ml = m;
4469
4470 /*
4471 * Move next buf to head of socket buffer
4472 */
4473 so->so_rcv.sb_mb = m = ml->m_next;
4474 ml->m_next = NULL;
4475
4476 if (m != NULL) {
4477 m->m_nextpkt = nextrecord;
4478 if (nextrecord == NULL) {
4479 so->so_rcv.sb_lastrecord = m;
4480 }
4481 } else {
4482 so->so_rcv.sb_mb = nextrecord;
4483 SB_EMPTY_FIXUP(&so->so_rcv);
4484 }
4485 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4486 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4487 } else {
4488 /*
4489 * Stop the loop on partial copy
4490 */
4491 break;
4492 }
4493 }
4494 #ifdef MORE_LOCKING_DEBUG
4495 if (so->so_usecount <= 1) {
4496 panic("%s: after big while so=%llx ref=%d on socket\n",
4497 __func__,
4498 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4499 /* NOTREACHED */
4500 }
4501 #endif
4502 /*
4503 * Tell the caller we made a partial copy
4504 */
4505 if (m != NULL) {
4506 if (so->so_options & SO_DONTTRUNC) {
4507 /*
4508 * Copyout first the freelist then the partial mbuf
4509 */
4510 socket_unlock(so, 0);
4511 if (delayed_copy_len) {
4512 error = sodelayed_copy_list(so, msgarray,
4513 uiocnt, &free_list, &delayed_copy_len);
4514 }
4515
4516 if (error == 0) {
4517 error = uiomove(mtod(m, caddr_t), (int)len,
4518 auio);
4519 }
4520 socket_lock(so, 0);
4521 if (error) {
4522 goto release;
4523 }
4524
4525 m->m_data += len;
4526 m->m_len -= len;
4527 so->so_rcv.sb_cc -= len;
4528 flags |= MSG_RCVMORE;
4529 } else {
4530 (void) sbdroprecord(&so->so_rcv);
4531 nextrecord = so->so_rcv.sb_mb;
4532 m = NULL;
4533 flags |= MSG_TRUNC;
4534 }
4535 }
4536
4537 if (m == NULL) {
4538 so->so_rcv.sb_mb = nextrecord;
4539 /*
4540 * First part is an inline SB_EMPTY_FIXUP(). Second
4541 * part makes sure sb_lastrecord is up-to-date if
4542 * there is still data in the socket buffer.
4543 */
4544 if (so->so_rcv.sb_mb == NULL) {
4545 so->so_rcv.sb_mbtail = NULL;
4546 so->so_rcv.sb_lastrecord = NULL;
4547 } else if (nextrecord->m_nextpkt == NULL) {
4548 so->so_rcv.sb_lastrecord = nextrecord;
4549 }
4550 SB_MB_CHECK(&so->so_rcv);
4551 }
4552 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4553 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4554
4555 /*
4556 * We can continue to the next packet as long as:
4557 * - We haven't exhausted the uio array
4558 * - There was no error
4559 * - A packet was not truncated
4560 * - We can still receive more data
4561 */
4562 if (npkts < uiocnt && error == 0 &&
4563 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4564 (so->so_state & SS_CANTRCVMORE) == 0) {
4565 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4566 sblocked = 0;
4567
4568 goto next;
4569 }
4570 if (flagsp != NULL) {
4571 *flagsp |= flags;
4572 }
4573
4574 release:
4575 /*
4576 * pru_rcvd may cause more data to be received if the socket lock
4577 * is dropped so we set MSG_HAVEMORE now based on what we know.
4578 * That way the caller won't be surprised if it receives less data
4579 * than requested.
4580 */
4581 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4582 flags |= MSG_HAVEMORE;
4583 }
4584
4585 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4586 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4587 }
4588
4589 if (sblocked) {
4590 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4591 } else {
4592 socket_unlock(so, 1);
4593 }
4594
4595 if (delayed_copy_len) {
4596 error = sodelayed_copy_list(so, msgarray, uiocnt,
4597 &free_list, &delayed_copy_len);
4598 }
4599 out:
4600 /*
4601 * Amortize the cost of freeing the mbufs
4602 */
4603 if (free_list != NULL) {
4604 m_freem_list(free_list);
4605 }
4606 if (free_others != NULL) {
4607 m_freem_list(free_others);
4608 }
4609
4610 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4611 0, 0, 0, 0);
4612 return error;
4613 }
4614
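/*
 * Example (user space, not part of this file): soreceive_list() backs the
 * batched datagram receive path, which is reached through an Apple-private
 * entry point; the public single-datagram path keeps the same per-packet
 * semantics, so this minimal sketch uses plain recvmsg(2) in a loop instead.
 * MSG_DONTWAIT corresponds to the EWOULDBLOCK check above, and MSG_TRUNC in
 * msg_flags reports a clipped datagram, as in the branch taken when
 * SO_DONTTRUNC is not set. The helper name is illustrative and "fd" is
 * assumed to be an already-bound UDP socket.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <errno.h>
#include <stdio.h>

static void
drain_datagrams_example(int fd)
{
	char buf[2048];

	for (;;) {
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
		ssize_t n = recvmsg(fd, &msg, MSG_DONTWAIT);

		if (n < 0) {
			if (errno == EWOULDBLOCK) {
				break;          /* queue drained */
			}
			perror("recvmsg");
			break;
		}
		if (msg.msg_flags & MSG_TRUNC) {
			printf("datagram truncated to %zd bytes\n", n);
		}
	}
}
#endif /* example sketch */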
4615 static int
4616 so_statistics_event_to_nstat_event(int64_t *input_options,
4617 uint64_t *nstat_event)
4618 {
4619 int error = 0;
4620 switch (*input_options) {
4621 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4622 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4623 break;
4624 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4625 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4626 break;
4627 #if (DEBUG || DEVELOPMENT)
4628 case SO_STATISTICS_EVENT_RESERVED_1:
4629 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4630 break;
4631 case SO_STATISTICS_EVENT_RESERVED_2:
4632 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4633 break;
4634 #endif /* (DEBUG || DEVELOPMENT) */
4635 default:
4636 error = EINVAL;
4637 break;
4638 }
4639 return error;
4640 }
4641
4642 /*
4643 * Returns: 0 Success
4644 * EINVAL
4645 * ENOTCONN
4646 * <pru_shutdown>:EINVAL
4647 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4648 * <pru_shutdown>:ENOBUFS[TCP]
4649 * <pru_shutdown>:EMSGSIZE[TCP]
4650 * <pru_shutdown>:EHOSTUNREACH[TCP]
4651 * <pru_shutdown>:ENETUNREACH[TCP]
4652 * <pru_shutdown>:ENETDOWN[TCP]
4653 * <pru_shutdown>:ENOMEM[TCP]
4654 * <pru_shutdown>:EACCES[TCP]
4655 * <pru_shutdown>:EMSGSIZE[TCP]
4656 * <pru_shutdown>:ENOBUFS[TCP]
4657 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4658 * <pru_shutdown>:??? [other protocol families]
4659 */
4660 int
4661 soshutdown(struct socket *so, int how)
4662 {
4663 int error;
4664
4665 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4666
4667 switch (how) {
4668 case SHUT_RD:
4669 case SHUT_WR:
4670 case SHUT_RDWR:
4671 socket_lock(so, 1);
4672 if ((so->so_state &
4673 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4674 error = ENOTCONN;
4675 } else {
4676 error = soshutdownlock(so, how);
4677 }
4678 socket_unlock(so, 1);
4679 break;
4680 default:
4681 error = EINVAL;
4682 break;
4683 }
4684
4685 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4686
4687 return error;
4688 }
4689
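/*
 * Example (user space, not part of this file): a minimal sketch of how
 * soshutdown() is reached through shutdown(2). A half-close with SHUT_WR
 * lets the peer read EOF while this side can keep receiving; ENOTCONN is
 * returned if the socket was never connected, matching the state check
 * above. The helper name, address, and port are placeholders.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
half_close_example(void)
{
	struct sockaddr_in sin;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		return -1;
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);                        /* placeholder port */
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);  /* placeholder addr */

	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) != 0) {
		close(fd);
		return -1;
	}
	/* Stop sending; the peer sees EOF, reads can still drain data. */
	if (shutdown(fd, SHUT_WR) != 0) {
		perror("shutdown");     /* ENOTCONN if not connected */
	}
	close(fd);
	return 0;
}
#endif /* example sketch */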
4690 int
4691 soshutdownlock_final(struct socket *so, int how)
4692 {
4693 struct protosw *pr = so->so_proto;
4694 int error = 0;
4695
4696 sflt_notify(so, sock_evt_shutdown, &how);
4697
4698 if (how != SHUT_WR) {
4699 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4700 /* read already shut down */
4701 error = ENOTCONN;
4702 goto done;
4703 }
4704 sorflush(so);
4705 postevent(so, 0, EV_RCLOSED);
4706 }
4707 if (how != SHUT_RD) {
4708 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4709 /* write already shut down */
4710 error = ENOTCONN;
4711 goto done;
4712 }
4713 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4714 postevent(so, 0, EV_WCLOSED);
4715 }
4716 done:
4717 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4718 return error;
4719 }
4720
4721 int
4722 soshutdownlock(struct socket *so, int how)
4723 {
4724 int error = 0;
4725
4726 #if CONTENT_FILTER
4727 /*
4728 * A content filter may delay the actual shutdown until it
4729 * has processed the pending data
4730 */
4731 if (so->so_flags & SOF_CONTENT_FILTER) {
4732 error = cfil_sock_shutdown(so, &how);
4733 if (error == EJUSTRETURN) {
4734 error = 0;
4735 goto done;
4736 } else if (error != 0) {
4737 goto done;
4738 }
4739 }
4740 #endif /* CONTENT_FILTER */
4741
4742 error = soshutdownlock_final(so, how);
4743
4744 done:
4745 return error;
4746 }
4747
4748 void
4749 sowflush(struct socket *so)
4750 {
4751 struct sockbuf *sb = &so->so_snd;
4752
4753 /*
4754 * Obtain lock on the socket buffer (SB_LOCK). This is required
4755 * to prevent the socket buffer from being unexpectedly altered
4756 * while it is used by another thread in socket send/receive.
4757 *
4758 * sblock() must not fail here, hence the assertion.
4759 */
4760 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4761 VERIFY(sb->sb_flags & SB_LOCK);
4762
4763 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4764 sb->sb_flags |= SB_DROP;
4765 sb->sb_upcall = NULL;
4766 sb->sb_upcallarg = NULL;
4767
4768 sbunlock(sb, TRUE); /* keep socket locked */
4769
4770 selthreadclear(&sb->sb_sel);
4771 sbrelease(sb);
4772 }
4773
4774 void
4775 sorflush(struct socket *so)
4776 {
4777 struct sockbuf *sb = &so->so_rcv;
4778 struct protosw *pr = so->so_proto;
4779 struct sockbuf asb;
4780 #ifdef notyet
4781 lck_mtx_t *mutex_held;
4782 /*
4783 * XXX: This code is currently commented out, because we may get here
4784 * as part of sofreelastref(), and at that time, pr_getlock() may no
4785 * longer be able to return us the lock; this will be fixed in future.
4786 */
4787 if (so->so_proto->pr_getlock != NULL) {
4788 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4789 } else {
4790 mutex_held = so->so_proto->pr_domain->dom_mtx;
4791 }
4792
4793 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4794 #endif /* notyet */
4795
4796 sflt_notify(so, sock_evt_flush_read, NULL);
4797
4798 socantrcvmore(so);
4799
4800 /*
4801 * Obtain lock on the socket buffer (SB_LOCK). This is required
4802 * to prevent the socket buffer from being unexpectedly altered
4803 * while it is used by another thread in socket send/receive.
4804 *
4805 * sblock() must not fail here, hence the assertion.
4806 */
4807 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4808 VERIFY(sb->sb_flags & SB_LOCK);
4809
4810 /*
4811 * Copy only the relevant fields from "sb" to "asb" which we
4812 * need for sbrelease() to function. In particular, skip
4813 * sb_sel as it contains the wait queue linkage, which would
4814 * wreak havoc if we were to issue selthreadclear() on "asb".
4815 * Make sure to not carry over SB_LOCK in "asb", as we need
4816 * to acquire it later as part of sbrelease().
4817 */
4818 bzero(&asb, sizeof(asb));
4819 asb.sb_cc = sb->sb_cc;
4820 asb.sb_hiwat = sb->sb_hiwat;
4821 asb.sb_mbcnt = sb->sb_mbcnt;
4822 asb.sb_mbmax = sb->sb_mbmax;
4823 asb.sb_ctl = sb->sb_ctl;
4824 asb.sb_lowat = sb->sb_lowat;
4825 asb.sb_mb = sb->sb_mb;
4826 asb.sb_mbtail = sb->sb_mbtail;
4827 asb.sb_lastrecord = sb->sb_lastrecord;
4828 asb.sb_so = sb->sb_so;
4829 asb.sb_flags = sb->sb_flags;
4830 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4831 asb.sb_flags |= SB_DROP;
4832
4833 /*
4834 * Ideally we'd bzero() these and preserve the ones we need;
4835 * but to do that we'd need to shuffle things around in the
4836 * sockbuf, and we can't do it now because there are KEXTS
4837 * that are directly referring to the socket structure.
4838 *
4839 * Setting SB_DROP acts as a barrier to prevent further appends.
4840 * Clearing SB_SEL is done for selthreadclear() below.
4841 */
4842 sb->sb_cc = 0;
4843 sb->sb_hiwat = 0;
4844 sb->sb_mbcnt = 0;
4845 sb->sb_mbmax = 0;
4846 sb->sb_ctl = 0;
4847 sb->sb_lowat = 0;
4848 sb->sb_mb = NULL;
4849 sb->sb_mbtail = NULL;
4850 sb->sb_lastrecord = NULL;
4851 sb->sb_timeo.tv_sec = 0;
4852 sb->sb_timeo.tv_usec = 0;
4853 sb->sb_upcall = NULL;
4854 sb->sb_upcallarg = NULL;
4855 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4856 sb->sb_flags |= SB_DROP;
4857
4858 sbunlock(sb, TRUE); /* keep socket locked */
4859
4860 /*
4861 * Note that selthreadclear() is called on the original "sb" and
4862 * not the local "asb" because of the way wait queue linkage is
4863 * implemented. Given that selwakeup() may be triggered, SB_SEL
4864 * should no longer be set (cleared above.)
4865 */
4866 selthreadclear(&sb->sb_sel);
4867
4868 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4869 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4870 }
4871
4872 sbrelease(&asb);
4873 }
4874
4875 /*
4876 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4877 * an additional variant to handle the case where the option value needs
4878 * to be some kind of integer, but not a specific size.
4879 * In addition to their use here, these functions are also called by the
4880 * protocol-level pr_ctloutput() routines.
4881 *
4882 * Returns: 0 Success
4883 * EINVAL
4884 * copyin:EFAULT
4885 */
4886 int
4887 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4888 {
4889 size_t valsize;
4890
4891 /*
4892 * If the user gives us more than we wanted, we ignore it,
4893 * but if we don't get the minimum length the caller
4894 * wants, we return EINVAL. On success, sopt->sopt_valsize
4895 * is set to however much we actually retrieved.
4896 */
4897 if ((valsize = sopt->sopt_valsize) < minlen) {
4898 return EINVAL;
4899 }
4900 if (valsize > len) {
4901 sopt->sopt_valsize = valsize = len;
4902 }
4903
4904 if (sopt->sopt_p != kernproc) {
4905 return copyin(sopt->sopt_val, buf, valsize);
4906 }
4907
4908 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4909 return 0;
4910 }
4911
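/*
 * Example (user space, not part of this file): a minimal sketch of the
 * sooptcopyin() length rules as seen through setsockopt(2). An option value
 * shorter than the kernel's minimum length is rejected with EINVAL; a longer
 * value is accepted and silently clipped to the size the kernel asked for.
 * The helper name is illustrative and "fd" is assumed to be an
 * already-created socket.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>

static void
optlen_example(int fd)
{
	char tiny = 1;
	int big[2] = { 1, 0 };

	/* Too short for an int-sized option: sooptcopyin() returns EINVAL. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &tiny, sizeof(tiny)) != 0) {
		printf("short optlen rejected: %d (expect EINVAL)\n", errno);
	}
	/* Longer than needed: only sizeof(int) bytes are consumed. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, big, sizeof(big)) == 0) {
		printf("oversized optlen accepted and clipped\n");
	}
}
#endif /* example sketch */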
4912 /*
4913 * sooptcopyin_timeval
4914 * Copy in a timeval value into tv_p, taking into account whether the
4915 * calling process is 64-bit or 32-bit. The sanity checking was moved
4916 * here so that we can verify the 64-bit tv_sec value before we lose
4917 * the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
4918 */
4919 static int
4920 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4921 {
4922 int error;
4923
4924 if (proc_is64bit(sopt->sopt_p)) {
4925 struct user64_timeval tv64;
4926
4927 if (sopt->sopt_valsize < sizeof(tv64)) {
4928 return EINVAL;
4929 }
4930
4931 sopt->sopt_valsize = sizeof(tv64);
4932 if (sopt->sopt_p != kernproc) {
4933 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4934 if (error != 0) {
4935 return error;
4936 }
4937 } else {
4938 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4939 sizeof(tv64));
4940 }
4941 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4942 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4943 return EDOM;
4944 }
4945
4946 tv_p->tv_sec = tv64.tv_sec;
4947 tv_p->tv_usec = tv64.tv_usec;
4948 } else {
4949 struct user32_timeval tv32;
4950
4951 if (sopt->sopt_valsize < sizeof(tv32)) {
4952 return EINVAL;
4953 }
4954
4955 sopt->sopt_valsize = sizeof(tv32);
4956 if (sopt->sopt_p != kernproc) {
4957 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4958 if (error != 0) {
4959 return error;
4960 }
4961 } else {
4962 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4963 sizeof(tv32));
4964 }
4965 #ifndef __LP64__
4966 /*
4967 * K64todo "comparison is always false due to
4968 * limited range of data type"
4969 */
4970 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4971 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4972 return EDOM;
4973 }
4974 #endif
4975 tv_p->tv_sec = tv32.tv_sec;
4976 tv_p->tv_usec = tv32.tv_usec;
4977 }
4978 return 0;
4979 }
4980
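/*
 * Example (user space, not part of this file): a minimal sketch of setting a
 * receive timeout, which is copied in through sooptcopyin_timeval() above.
 * A tv_usec outside [0, 1000000) fails with EDOM; a valid value makes a
 * blocking recv(2) return with EWOULDBLOCK once the timeout expires. The
 * helper name is illustrative and "fd" is assumed to be an already-created
 * socket.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>
#include <errno.h>
#include <stdio.h>

static void
rcvtimeo_example(int fd)
{
	struct timeval bad = { .tv_sec = 1, .tv_usec = 2000000 };
	struct timeval good = { .tv_sec = 2, .tv_usec = 500000 };

	/* Rejected by the range check above. */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &bad, sizeof(bad)) != 0) {
		printf("invalid timeval rejected: %d (expect EDOM)\n", errno);
	}
	/* Accepted: subsequent blocking receives time out after 2.5 seconds. */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &good, sizeof(good)) != 0) {
		perror("setsockopt SO_RCVTIMEO");
	}
}
#endif /* example sketch */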
4981 int
4982 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4983 boolean_t ignore_delegate)
4984 {
4985 kauth_cred_t cred = NULL;
4986 proc_t ep = PROC_NULL;
4987 uid_t uid;
4988 int error = 0;
4989
4990 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4991 ep = proc_find(so->e_pid);
4992 if (ep) {
4993 cred = kauth_cred_proc_ref(ep);
4994 }
4995 }
4996
4997 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4998
4999 /* uid is 0 for root */
5000 if (uid != 0 || !allow_root) {
5001 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
5002 }
5003 if (cred) {
5004 kauth_cred_unref(&cred);
5005 }
5006 if (ep != PROC_NULL) {
5007 proc_rele(ep);
5008 }
5009
5010 return error;
5011 }
5012
5013 /*
5014 * Returns: 0 Success
5015 * EINVAL
5016 * ENOPROTOOPT
5017 * ENOBUFS
5018 * EDOM
5019 * sooptcopyin:EINVAL
5020 * sooptcopyin:EFAULT
5021 * sooptcopyin_timeval:EINVAL
5022 * sooptcopyin_timeval:EFAULT
5023 * sooptcopyin_timeval:EDOM
5024 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5025 * <pr_ctloutput>:???
5026 * sflt_attach_private:??? [whatever a filter author chooses]
5027 * <sf_setoption>:??? [whatever a filter author chooses]
5028 *
5029 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
5030 * <sf_setoption> returns depend on what the filter author causes
5031 * their filter to return.
5032 */
5033 int
5034 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5035 {
5036 int error, optval;
5037 int64_t long_optval;
5038 struct linger l;
5039 struct timeval tv;
5040 #if CONFIG_MACF_SOCKET
5041 struct mac extmac;
5042 #endif /* MAC_SOCKET */
5043
5044 if (sopt->sopt_dir != SOPT_SET) {
5045 sopt->sopt_dir = SOPT_SET;
5046 }
5047
5048 if (dolock) {
5049 socket_lock(so, 1);
5050 }
5051
5052 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5053 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5054 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5055 /* the socket has been shutdown, no more sockopt's */
5056 error = EINVAL;
5057 goto out;
5058 }
5059
5060 error = sflt_setsockopt(so, sopt);
5061 if (error != 0) {
5062 if (error == EJUSTRETURN) {
5063 error = 0;
5064 }
5065 goto out;
5066 }
5067
5068 if (sopt->sopt_level != SOL_SOCKET) {
5069 if (so->so_proto != NULL &&
5070 so->so_proto->pr_ctloutput != NULL) {
5071 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5072 goto out;
5073 }
5074 error = ENOPROTOOPT;
5075 } else {
5076 /*
5077 * Allow socket-level (SOL_SOCKET) options to be filtered by
5078 * the protocol layer, if needed. A zero value returned from
5079 * the handler means use default socket-level processing as
5080 * done by the rest of this routine. Otherwise, any other
5081 * return value indicates that the option is unsupported.
5082 */
5083 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5084 pru_socheckopt(so, sopt)) != 0) {
5085 goto out;
5086 }
5087
5088 error = 0;
5089 switch (sopt->sopt_name) {
5090 case SO_LINGER:
5091 case SO_LINGER_SEC:
5092 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5093 if (error != 0) {
5094 goto out;
5095 }
5096
5097 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5098 l.l_linger : l.l_linger * hz;
5099 if (l.l_onoff != 0) {
5100 so->so_options |= SO_LINGER;
5101 } else {
5102 so->so_options &= ~SO_LINGER;
5103 }
5104 break;
5105
5106 case SO_DEBUG:
5107 case SO_KEEPALIVE:
5108 case SO_DONTROUTE:
5109 case SO_USELOOPBACK:
5110 case SO_BROADCAST:
5111 case SO_REUSEADDR:
5112 case SO_REUSEPORT:
5113 case SO_OOBINLINE:
5114 case SO_TIMESTAMP:
5115 case SO_TIMESTAMP_MONOTONIC:
5116 case SO_TIMESTAMP_CONTINUOUS:
5117 case SO_DONTTRUNC:
5118 case SO_WANTMORE:
5119 case SO_WANTOOBFLAG:
5120 case SO_NOWAKEFROMSLEEP:
5121 case SO_NOAPNFALLBK:
5122 error = sooptcopyin(sopt, &optval, sizeof(optval),
5123 sizeof(optval));
5124 if (error != 0) {
5125 goto out;
5126 }
5127 if (optval) {
5128 so->so_options |= sopt->sopt_name;
5129 } else {
5130 so->so_options &= ~sopt->sopt_name;
5131 }
5132 break;
5133
5134 case SO_SNDBUF:
5135 case SO_RCVBUF:
5136 case SO_SNDLOWAT:
5137 case SO_RCVLOWAT:
5138 error = sooptcopyin(sopt, &optval, sizeof(optval),
5139 sizeof(optval));
5140 if (error != 0) {
5141 goto out;
5142 }
5143
5144 /*
5145 * Values < 1 make no sense for any of these
5146 * options, so disallow them.
5147 */
5148 if (optval < 1) {
5149 error = EINVAL;
5150 goto out;
5151 }
5152
5153 switch (sopt->sopt_name) {
5154 case SO_SNDBUF:
5155 case SO_RCVBUF: {
5156 struct sockbuf *sb =
5157 (sopt->sopt_name == SO_SNDBUF) ?
5158 &so->so_snd : &so->so_rcv;
5159 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5160 error = ENOBUFS;
5161 goto out;
5162 }
5163 sb->sb_flags |= SB_USRSIZE;
5164 sb->sb_flags &= ~SB_AUTOSIZE;
5165 sb->sb_idealsize = (u_int32_t)optval;
5166 break;
5167 }
5168 /*
5169 * Make sure the low-water is never greater than
5170 * the high-water.
5171 */
5172 case SO_SNDLOWAT: {
5173 int space = sbspace(&so->so_snd);
5174 u_int32_t hiwat = so->so_snd.sb_hiwat;
5175
5176 if (so->so_snd.sb_flags & SB_UNIX) {
5177 struct unpcb *unp =
5178 (struct unpcb *)(so->so_pcb);
5179 if (unp != NULL &&
5180 unp->unp_conn != NULL) {
5181 hiwat += unp->unp_conn->unp_cc;
5182 }
5183 }
5184
5185 so->so_snd.sb_lowat =
5186 (optval > hiwat) ?
5187 hiwat : optval;
5188
5189 if (space >= so->so_snd.sb_lowat) {
5190 sowwakeup(so);
5191 }
5192 break;
5193 }
5194 case SO_RCVLOWAT: {
5195 int64_t data_len;
5196 so->so_rcv.sb_lowat =
5197 (optval > so->so_rcv.sb_hiwat) ?
5198 so->so_rcv.sb_hiwat : optval;
5199 data_len = so->so_rcv.sb_cc
5200 - so->so_rcv.sb_ctl;
5201 if (data_len >= so->so_rcv.sb_lowat) {
5202 sorwakeup(so);
5203 }
5204 break;
5205 }
5206 }
5207 break;
5208
5209 case SO_SNDTIMEO:
5210 case SO_RCVTIMEO:
5211 error = sooptcopyin_timeval(sopt, &tv);
5212 if (error != 0) {
5213 goto out;
5214 }
5215
5216 switch (sopt->sopt_name) {
5217 case SO_SNDTIMEO:
5218 so->so_snd.sb_timeo = tv;
5219 break;
5220 case SO_RCVTIMEO:
5221 so->so_rcv.sb_timeo = tv;
5222 break;
5223 }
5224 break;
5225
5226 case SO_NKE: {
5227 struct so_nke nke;
5228
5229 error = sooptcopyin(sopt, &nke, sizeof(nke),
5230 sizeof(nke));
5231 if (error != 0) {
5232 goto out;
5233 }
5234
5235 error = sflt_attach_internal(so, nke.nke_handle);
5236 break;
5237 }
5238
5239 case SO_NOSIGPIPE:
5240 error = sooptcopyin(sopt, &optval, sizeof(optval),
5241 sizeof(optval));
5242 if (error != 0) {
5243 goto out;
5244 }
5245 if (optval != 0) {
5246 so->so_flags |= SOF_NOSIGPIPE;
5247 } else {
5248 so->so_flags &= ~SOF_NOSIGPIPE;
5249 }
5250 break;
5251
5252 case SO_NOADDRERR:
5253 error = sooptcopyin(sopt, &optval, sizeof(optval),
5254 sizeof(optval));
5255 if (error != 0) {
5256 goto out;
5257 }
5258 if (optval != 0) {
5259 so->so_flags |= SOF_NOADDRAVAIL;
5260 } else {
5261 so->so_flags &= ~SOF_NOADDRAVAIL;
5262 }
5263 break;
5264
5265 case SO_REUSESHAREUID:
5266 error = sooptcopyin(sopt, &optval, sizeof(optval),
5267 sizeof(optval));
5268 if (error != 0) {
5269 goto out;
5270 }
5271 if (optval != 0) {
5272 so->so_flags |= SOF_REUSESHAREUID;
5273 } else {
5274 so->so_flags &= ~SOF_REUSESHAREUID;
5275 }
5276 break;
5277
5278 case SO_NOTIFYCONFLICT:
5279 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5280 error = EPERM;
5281 goto out;
5282 }
5283 error = sooptcopyin(sopt, &optval, sizeof(optval),
5284 sizeof(optval));
5285 if (error != 0) {
5286 goto out;
5287 }
5288 if (optval != 0) {
5289 so->so_flags |= SOF_NOTIFYCONFLICT;
5290 } else {
5291 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5292 }
5293 break;
5294
5295 case SO_RESTRICTIONS:
5296 error = sooptcopyin(sopt, &optval, sizeof(optval),
5297 sizeof(optval));
5298 if (error != 0) {
5299 goto out;
5300 }
5301
5302 error = so_set_restrictions(so, optval);
5303 break;
5304
5305 case SO_AWDL_UNRESTRICTED:
5306 if (SOCK_DOM(so) != PF_INET &&
5307 SOCK_DOM(so) != PF_INET6) {
5308 error = EOPNOTSUPP;
5309 goto out;
5310 }
5311 error = sooptcopyin(sopt, &optval, sizeof(optval),
5312 sizeof(optval));
5313 if (error != 0) {
5314 goto out;
5315 }
5316 if (optval != 0) {
5317 error = soopt_cred_check(so,
5318 PRIV_NET_RESTRICTED_AWDL, false, false);
5319 if (error == 0) {
5320 inp_set_awdl_unrestricted(
5321 sotoinpcb(so));
5322 }
5323 } else {
5324 inp_clear_awdl_unrestricted(sotoinpcb(so));
5325 }
5326 break;
5327 case SO_INTCOPROC_ALLOW:
5328 if (SOCK_DOM(so) != PF_INET6) {
5329 error = EOPNOTSUPP;
5330 goto out;
5331 }
5332 error = sooptcopyin(sopt, &optval, sizeof(optval),
5333 sizeof(optval));
5334 if (error != 0) {
5335 goto out;
5336 }
5337 if (optval != 0 &&
5338 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5339 error = soopt_cred_check(so,
5340 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5341 if (error == 0) {
5342 inp_set_intcoproc_allowed(
5343 sotoinpcb(so));
5344 }
5345 } else if (optval == 0) {
5346 inp_clear_intcoproc_allowed(sotoinpcb(so));
5347 }
5348 break;
5349
5350 case SO_LABEL:
5351 #if CONFIG_MACF_SOCKET
5352 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5353 sizeof(extmac))) != 0) {
5354 goto out;
5355 }
5356
5357 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5358 so, &extmac);
5359 #else
5360 error = EOPNOTSUPP;
5361 #endif /* MAC_SOCKET */
5362 break;
5363
5364 case SO_UPCALLCLOSEWAIT:
5365 error = sooptcopyin(sopt, &optval, sizeof(optval),
5366 sizeof(optval));
5367 if (error != 0) {
5368 goto out;
5369 }
5370 if (optval != 0) {
5371 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5372 } else {
5373 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5374 }
5375 break;
5376
5377 case SO_RANDOMPORT:
5378 error = sooptcopyin(sopt, &optval, sizeof(optval),
5379 sizeof(optval));
5380 if (error != 0) {
5381 goto out;
5382 }
5383 if (optval != 0) {
5384 so->so_flags |= SOF_BINDRANDOMPORT;
5385 } else {
5386 so->so_flags &= ~SOF_BINDRANDOMPORT;
5387 }
5388 break;
5389
5390 case SO_NP_EXTENSIONS: {
5391 struct so_np_extensions sonpx;
5392
5393 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5394 sizeof(sonpx));
5395 if (error != 0) {
5396 goto out;
5397 }
5398 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5399 error = EINVAL;
5400 goto out;
5401 }
5402 /*
5403 * Only one bit defined for now
5404 */
5405 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5406 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5407 so->so_flags |= SOF_NPX_SETOPTSHUT;
5408 } else {
5409 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5410 }
5411 }
5412 break;
5413 }
5414
5415 case SO_TRAFFIC_CLASS: {
5416 error = sooptcopyin(sopt, &optval, sizeof(optval),
5417 sizeof(optval));
5418 if (error != 0) {
5419 goto out;
5420 }
5421 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5422 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5423 error = so_set_net_service_type(so, netsvc);
5424 goto out;
5425 }
5426 error = so_set_traffic_class(so, optval);
5427 if (error != 0) {
5428 goto out;
5429 }
5430 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5431 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5432 break;
5433 }
5434
5435 case SO_RECV_TRAFFIC_CLASS: {
5436 error = sooptcopyin(sopt, &optval, sizeof(optval),
5437 sizeof(optval));
5438 if (error != 0) {
5439 goto out;
5440 }
5441 if (optval == 0) {
5442 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5443 } else {
5444 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5445 }
5446 break;
5447 }
5448
5449 #if (DEVELOPMENT || DEBUG)
5450 case SO_TRAFFIC_CLASS_DBG: {
5451 struct so_tcdbg so_tcdbg;
5452
5453 error = sooptcopyin(sopt, &so_tcdbg,
5454 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5455 if (error != 0) {
5456 goto out;
5457 }
5458 error = so_set_tcdbg(so, &so_tcdbg);
5459 if (error != 0) {
5460 goto out;
5461 }
5462 break;
5463 }
5464 #endif /* (DEVELOPMENT || DEBUG) */
5465
5466 case SO_PRIVILEGED_TRAFFIC_CLASS:
5467 error = priv_check_cred(kauth_cred_get(),
5468 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5469 if (error != 0) {
5470 goto out;
5471 }
5472 error = sooptcopyin(sopt, &optval, sizeof(optval),
5473 sizeof(optval));
5474 if (error != 0) {
5475 goto out;
5476 }
5477 if (optval == 0) {
5478 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5479 } else {
5480 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5481 }
5482 break;
5483
5484 #if (DEVELOPMENT || DEBUG)
5485 case SO_DEFUNCTIT:
5486 error = sosetdefunct(current_proc(), so, 0, FALSE);
5487 if (error == 0) {
5488 error = sodefunct(current_proc(), so, 0);
5489 }
5490
5491 break;
5492 #endif /* (DEVELOPMENT || DEBUG) */
5493
5494 case SO_DEFUNCTOK:
5495 error = sooptcopyin(sopt, &optval, sizeof(optval),
5496 sizeof(optval));
5497 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5498 if (error == 0) {
5499 error = EBADF;
5500 }
5501 goto out;
5502 }
5503 /*
5504 * Any process can set SO_DEFUNCTOK (clear
5505 * SOF_NODEFUNCT), but only root can clear
5506 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5507 */
5508 if (optval == 0 &&
5509 kauth_cred_issuser(kauth_cred_get()) == 0) {
5510 error = EPERM;
5511 goto out;
5512 }
5513 if (optval) {
5514 so->so_flags &= ~SOF_NODEFUNCT;
5515 } else {
5516 so->so_flags |= SOF_NODEFUNCT;
5517 }
5518
5519 if (SOCK_DOM(so) == PF_INET ||
5520 SOCK_DOM(so) == PF_INET6) {
5521 char s[MAX_IPv6_STR_LEN];
5522 char d[MAX_IPv6_STR_LEN];
5523 struct inpcb *inp = sotoinpcb(so);
5524
5525 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5526 "[%s %s:%d -> %s:%d] is now marked "
5527 "as %seligible for "
5528 "defunct\n", __func__, proc_selfpid(),
5529 proc_best_name(current_proc()),
5530 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5531 (SOCK_TYPE(so) == SOCK_STREAM) ?
5532 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5533 ((SOCK_DOM(so) == PF_INET) ?
5534 (void *)&inp->inp_laddr.s_addr :
5535 (void *)&inp->in6p_laddr), s, sizeof(s)),
5536 ntohs(inp->in6p_lport),
5537 inet_ntop(SOCK_DOM(so),
5538 (SOCK_DOM(so) == PF_INET) ?
5539 (void *)&inp->inp_faddr.s_addr :
5540 (void *)&inp->in6p_faddr, d, sizeof(d)),
5541 ntohs(inp->in6p_fport),
5542 (so->so_flags & SOF_NODEFUNCT) ?
5543 "not " : "");
5544 } else {
5545 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5546 "is now marked as %seligible for "
5547 "defunct\n",
5548 __func__, proc_selfpid(),
5549 proc_best_name(current_proc()),
5550 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5551 SOCK_DOM(so), SOCK_TYPE(so),
5552 (so->so_flags & SOF_NODEFUNCT) ?
5553 "not " : "");
5554 }
5555 break;
5556
5557 case SO_ISDEFUNCT:
5558 /* This option is not settable */
5559 error = EINVAL;
5560 break;
5561
5562 case SO_OPPORTUNISTIC:
5563 error = sooptcopyin(sopt, &optval, sizeof(optval),
5564 sizeof(optval));
5565 if (error == 0) {
5566 error = so_set_opportunistic(so, optval);
5567 }
5568 break;
5569
5570 case SO_FLUSH:
5571 /* This option is handled by lower layer(s) */
5572 error = 0;
5573 break;
5574
5575 case SO_RECV_ANYIF:
5576 error = sooptcopyin(sopt, &optval, sizeof(optval),
5577 sizeof(optval));
5578 if (error == 0) {
5579 error = so_set_recv_anyif(so, optval);
5580 }
5581 break;
5582
5583 case SO_TRAFFIC_MGT_BACKGROUND: {
5584 /* This option is handled by lower layer(s) */
5585 error = 0;
5586 break;
5587 }
5588
5589 #if FLOW_DIVERT
5590 case SO_FLOW_DIVERT_TOKEN:
5591 error = flow_divert_token_set(so, sopt);
5592 break;
5593 #endif /* FLOW_DIVERT */
5594
5595
5596 case SO_DELEGATED:
5597 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5598 sizeof(optval))) != 0) {
5599 break;
5600 }
5601
5602 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5603 break;
5604
5605 case SO_DELEGATED_UUID: {
5606 uuid_t euuid;
5607
5608 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5609 sizeof(euuid))) != 0) {
5610 break;
5611 }
5612
5613 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5614 break;
5615 }
5616
5617 #if NECP
5618 case SO_NECP_ATTRIBUTES:
5619 error = necp_set_socket_attributes(so, sopt);
5620 break;
5621
5622 case SO_NECP_CLIENTUUID: {
5623 if (SOCK_DOM(so) == PF_MULTIPATH) {
5624 /* Handled by MPTCP itself */
5625 break;
5626 }
5627
5628 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5629 error = EINVAL;
5630 goto out;
5631 }
5632
5633 struct inpcb *inp = sotoinpcb(so);
5634 if (!uuid_is_null(inp->necp_client_uuid)) {
5635 // Clear out the old client UUID if present
5636 necp_inpcb_remove_cb(inp);
5637 }
5638
5639 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5640 sizeof(uuid_t), sizeof(uuid_t));
5641 if (error != 0) {
5642 goto out;
5643 }
5644
5645 if (uuid_is_null(inp->necp_client_uuid)) {
5646 error = EINVAL;
5647 goto out;
5648 }
5649
5650 pid_t current_pid = proc_pid(current_proc());
5651 error = necp_client_register_socket_flow(current_pid,
5652 inp->necp_client_uuid, inp);
5653 if (error != 0) {
5654 uuid_clear(inp->necp_client_uuid);
5655 goto out;
5656 }
5657
5658 if (inp->inp_lport != 0) {
5659 // There is a bound local port, so this is not
5660 // a fresh socket. Assign to the client.
5661 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5662 }
5663
5664 break;
5665 }
5666 case SO_NECP_LISTENUUID: {
5667 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5668 error = EINVAL;
5669 goto out;
5670 }
5671
5672 struct inpcb *inp = sotoinpcb(so);
5673 if (!uuid_is_null(inp->necp_client_uuid)) {
5674 error = EINVAL;
5675 goto out;
5676 }
5677
5678 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5679 sizeof(uuid_t), sizeof(uuid_t));
5680 if (error != 0) {
5681 goto out;
5682 }
5683
5684 if (uuid_is_null(inp->necp_client_uuid)) {
5685 error = EINVAL;
5686 goto out;
5687 }
5688
5689 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5690 inp->necp_client_uuid, inp);
5691 if (error != 0) {
5692 uuid_clear(inp->necp_client_uuid);
5693 goto out;
5694 }
5695
5696 // Mark that the port registration is held by NECP
5697 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5698
5699 break;
5700 }
5701 #endif /* NECP */
5702
5703 case SO_EXTENDED_BK_IDLE:
5704 error = sooptcopyin(sopt, &optval, sizeof(optval),
5705 sizeof(optval));
5706 if (error == 0) {
5707 error = so_set_extended_bk_idle(so, optval);
5708 }
5709 break;
5710
5711 case SO_MARK_CELLFALLBACK:
5712 error = sooptcopyin(sopt, &optval, sizeof(optval),
5713 sizeof(optval));
5714 if (error != 0) {
5715 goto out;
5716 }
5717 if (optval < 0) {
5718 error = EINVAL;
5719 goto out;
5720 }
5721 if (optval == 0) {
5722 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5723 } else {
5724 so->so_flags1 |= SOF1_CELLFALLBACK;
5725 }
5726 break;
5727
5728 case SO_STATISTICS_EVENT:
5729 error = sooptcopyin(sopt, &long_optval,
5730 sizeof(long_optval), sizeof(long_optval));
5731 if (error != 0) {
5732 goto out;
5733 }
5734 u_int64_t nstat_event = 0;
5735 error = so_statistics_event_to_nstat_event(
5736 &long_optval, &nstat_event);
5737 if (error != 0) {
5738 goto out;
5739 }
5740 nstat_pcb_event(sotoinpcb(so), nstat_event);
5741 break;
5742
5743 case SO_NET_SERVICE_TYPE: {
5744 error = sooptcopyin(sopt, &optval, sizeof(optval),
5745 sizeof(optval));
5746 if (error != 0) {
5747 goto out;
5748 }
5749 error = so_set_net_service_type(so, optval);
5750 break;
5751 }
5752
5753 case SO_QOSMARKING_POLICY_OVERRIDE:
5754 error = priv_check_cred(kauth_cred_get(),
5755 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5756 if (error != 0) {
5757 goto out;
5758 }
5759 error = sooptcopyin(sopt, &optval, sizeof(optval),
5760 sizeof(optval));
5761 if (error != 0) {
5762 goto out;
5763 }
5764 if (optval == 0) {
5765 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5766 } else {
5767 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5768 }
5769 break;
5770
5771 case SO_MPKL_SEND_INFO: {
5772 struct so_mpkl_send_info so_mpkl_send_info;
5773
5774 error = sooptcopyin(sopt, &so_mpkl_send_info,
5775 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5776 if (error != 0) {
5777 goto out;
5778 }
5779 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5780 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5781
5782 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5783 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5784 } else {
5785 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5786 }
5787 break;
5788 }
5789 default:
5790 error = ENOPROTOOPT;
5791 break;
5792 }
5793 if (error == 0 && so->so_proto != NULL &&
5794 so->so_proto->pr_ctloutput != NULL) {
5795 (void) so->so_proto->pr_ctloutput(so, sopt);
5796 }
5797 }
5798 out:
5799 if (dolock) {
5800 socket_unlock(so, 1);
5801 }
5802 return error;
5803 }
5804
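/*
 * Example (user space, not part of this file): a minimal sketch of a few
 * SOL_SOCKET options handled by sosetoptlock() above. SO_LINGER_SEC is the
 * Darwin variant whose l_linger is scaled by hz in the code above, so the
 * caller expresses it in seconds; SO_NOSIGPIPE sets SOF_NOSIGPIPE so writes
 * on a broken connection return EPIPE instead of raising SIGPIPE; SO_RCVBUF
 * fails with ENOBUFS if sbreserve() cannot grow the buffer. The helper name
 * is illustrative and "fd" is assumed to be an already-created socket.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void
sockopt_example(int fd)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };  /* 5 seconds */
	int on = 1;
	int rcvbuf = 128 * 1024;

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l)) != 0) {
		perror("setsockopt SO_LINGER_SEC");
	}
	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)) != 0) {
		perror("setsockopt SO_NOSIGPIPE");
	}
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) != 0) {
		perror("setsockopt SO_RCVBUF");
	}
}
#endif /* example sketch */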
5805 /* Helper routines for getsockopt */
5806 int
5807 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5808 {
5809 int error;
5810 size_t valsize;
5811
5812 error = 0;
5813
5814 /*
5815 * Documented get behavior is that we always return a value,
5816 * possibly truncated to fit in the user's buffer.
5817 * Traditional behavior is that we always tell the user
5818 * precisely how much we copied, rather than something useful
5819 * like the total amount we had available for her.
5820 * Note that this interface is not idempotent; the entire answer must
5821 * be generated ahead of time.
5822 */
5823 valsize = min(len, sopt->sopt_valsize);
5824 sopt->sopt_valsize = valsize;
5825 if (sopt->sopt_val != USER_ADDR_NULL) {
5826 if (sopt->sopt_p != kernproc) {
5827 error = copyout(buf, sopt->sopt_val, valsize);
5828 } else {
5829 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5830 }
5831 }
5832 return error;
5833 }
5834
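/*
 * Example (user space, not part of this file): a minimal sketch of the
 * truncation behavior documented above. When the caller's buffer is smaller
 * than the option value, sooptcopyout() copies what fits and reports the
 * copied size through the updated optlen instead of failing. The helper name
 * is illustrative and "fd" is assumed to be an already-created socket.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void
getsockopt_trunc_example(int fd)
{
	int type = 0;
	short small = 0;
	socklen_t len;

	len = sizeof(type);
	if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &type, &len) == 0) {
		printf("SO_TYPE=%d, optlen=%u\n", type, (unsigned)len);
	}
	/* Undersized buffer: no error, the value is silently truncated. */
	len = sizeof(small);
	if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &small, &len) == 0) {
		printf("truncated copy, optlen=%u\n", (unsigned)len);
	}
}
#endif /* example sketch */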
5835 static int
5836 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5837 {
5838 int error;
5839 size_t len;
5840 struct user64_timeval tv64 = {};
5841 struct user32_timeval tv32 = {};
5842 const void * val;
5843 size_t valsize;
5844
5845 error = 0;
5846 if (proc_is64bit(sopt->sopt_p)) {
5847 len = sizeof(tv64);
5848 tv64.tv_sec = tv_p->tv_sec;
5849 tv64.tv_usec = tv_p->tv_usec;
5850 val = &tv64;
5851 } else {
5852 len = sizeof(tv32);
5853 tv32.tv_sec = tv_p->tv_sec;
5854 tv32.tv_usec = tv_p->tv_usec;
5855 val = &tv32;
5856 }
5857 valsize = min(len, sopt->sopt_valsize);
5858 sopt->sopt_valsize = valsize;
5859 if (sopt->sopt_val != USER_ADDR_NULL) {
5860 if (sopt->sopt_p != kernproc) {
5861 error = copyout(val, sopt->sopt_val, valsize);
5862 } else {
5863 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5864 }
5865 }
5866 return error;
5867 }
5868
5869 /*
5870 * Return: 0 Success
5871 * ENOPROTOOPT
5872 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5873 * <pr_ctloutput>:???
5874 * <sf_getoption>:???
5875 */
5876 int
5877 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5878 {
5879 int error, optval;
5880 struct linger l;
5881 struct timeval tv;
5882 #if CONFIG_MACF_SOCKET
5883 struct mac extmac;
5884 #endif /* MAC_SOCKET */
5885
5886 if (sopt->sopt_dir != SOPT_GET) {
5887 sopt->sopt_dir = SOPT_GET;
5888 }
5889
5890 if (dolock) {
5891 socket_lock(so, 1);
5892 }
5893
5894 error = sflt_getsockopt(so, sopt);
5895 if (error != 0) {
5896 if (error == EJUSTRETURN) {
5897 error = 0;
5898 }
5899 goto out;
5900 }
5901
5902 if (sopt->sopt_level != SOL_SOCKET) {
5903 if (so->so_proto != NULL &&
5904 so->so_proto->pr_ctloutput != NULL) {
5905 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5906 goto out;
5907 }
5908 error = ENOPROTOOPT;
5909 } else {
5910 /*
5911 * Allow socket-level (SOL_SOCKET) options to be filtered by
5912 * the protocol layer, if needed. A zero value returned from
5913 * the handler means use default socket-level processing as
5914 * done by the rest of this routine. Otherwise, any other
5915 * return value indicates that the option is unsupported.
5916 */
5917 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5918 pru_socheckopt(so, sopt)) != 0) {
5919 goto out;
5920 }
5921
5922 error = 0;
5923 switch (sopt->sopt_name) {
5924 case SO_LINGER:
5925 case SO_LINGER_SEC:
5926 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5927 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5928 so->so_linger : so->so_linger / hz;
5929 error = sooptcopyout(sopt, &l, sizeof(l));
5930 break;
5931
5932 case SO_USELOOPBACK:
5933 case SO_DONTROUTE:
5934 case SO_DEBUG:
5935 case SO_KEEPALIVE:
5936 case SO_REUSEADDR:
5937 case SO_REUSEPORT:
5938 case SO_BROADCAST:
5939 case SO_OOBINLINE:
5940 case SO_TIMESTAMP:
5941 case SO_TIMESTAMP_MONOTONIC:
5942 case SO_TIMESTAMP_CONTINUOUS:
5943 case SO_DONTTRUNC:
5944 case SO_WANTMORE:
5945 case SO_WANTOOBFLAG:
5946 case SO_NOWAKEFROMSLEEP:
5947 case SO_NOAPNFALLBK:
5948 optval = so->so_options & sopt->sopt_name;
5949 integer:
5950 error = sooptcopyout(sopt, &optval, sizeof(optval));
5951 break;
5952
5953 case SO_TYPE:
5954 optval = so->so_type;
5955 goto integer;
5956
5957 case SO_NREAD:
5958 if (so->so_proto->pr_flags & PR_ATOMIC) {
5959 int pkt_total;
5960 struct mbuf *m1;
5961
5962 pkt_total = 0;
5963 m1 = so->so_rcv.sb_mb;
5964 while (m1 != NULL) {
5965 if (m1->m_type == MT_DATA ||
5966 m1->m_type == MT_HEADER ||
5967 m1->m_type == MT_OOBDATA) {
5968 pkt_total += m1->m_len;
5969 }
5970 m1 = m1->m_next;
5971 }
5972 optval = pkt_total;
5973 } else {
5974 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5975 }
5976 goto integer;
5977
5978 case SO_NUMRCVPKT:
5979 if (so->so_proto->pr_flags & PR_ATOMIC) {
5980 int cnt = 0;
5981 struct mbuf *m1;
5982
5983 m1 = so->so_rcv.sb_mb;
5984 while (m1 != NULL) {
5985 cnt += 1;
5986 m1 = m1->m_nextpkt;
5987 }
5988 optval = cnt;
5989 goto integer;
5990 } else {
5991 error = ENOPROTOOPT;
5992 break;
5993 }
5994
5995 case SO_NWRITE:
5996 optval = so->so_snd.sb_cc;
5997 goto integer;
5998
5999 case SO_ERROR:
6000 optval = so->so_error;
6001 so->so_error = 0;
6002 goto integer;
6003
6004 case SO_SNDBUF: {
6005 u_int32_t hiwat = so->so_snd.sb_hiwat;
6006
6007 if (so->so_snd.sb_flags & SB_UNIX) {
6008 struct unpcb *unp =
6009 (struct unpcb *)(so->so_pcb);
6010 if (unp != NULL && unp->unp_conn != NULL) {
6011 hiwat += unp->unp_conn->unp_cc;
6012 }
6013 }
6014
6015 optval = hiwat;
6016 goto integer;
6017 }
6018 case SO_RCVBUF:
6019 optval = so->so_rcv.sb_hiwat;
6020 goto integer;
6021
6022 case SO_SNDLOWAT:
6023 optval = so->so_snd.sb_lowat;
6024 goto integer;
6025
6026 case SO_RCVLOWAT:
6027 optval = so->so_rcv.sb_lowat;
6028 goto integer;
6029
6030 case SO_SNDTIMEO:
6031 case SO_RCVTIMEO:
6032 tv = (sopt->sopt_name == SO_SNDTIMEO ?
6033 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
6034
6035 error = sooptcopyout_timeval(sopt, &tv);
6036 break;
6037
6038 case SO_NOSIGPIPE:
6039 optval = (so->so_flags & SOF_NOSIGPIPE);
6040 goto integer;
6041
6042 case SO_NOADDRERR:
6043 optval = (so->so_flags & SOF_NOADDRAVAIL);
6044 goto integer;
6045
6046 case SO_REUSESHAREUID:
6047 optval = (so->so_flags & SOF_REUSESHAREUID);
6048 goto integer;
6049
6050
6051 case SO_NOTIFYCONFLICT:
6052 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6053 goto integer;
6054
6055 case SO_RESTRICTIONS:
6056 optval = so_get_restrictions(so);
6057 goto integer;
6058
6059 case SO_AWDL_UNRESTRICTED:
6060 if (SOCK_DOM(so) == PF_INET ||
6061 SOCK_DOM(so) == PF_INET6) {
6062 optval = inp_get_awdl_unrestricted(
6063 sotoinpcb(so));
6064 goto integer;
6065 } else {
6066 error = EOPNOTSUPP;
6067 }
6068 break;
6069
6070 case SO_INTCOPROC_ALLOW:
6071 if (SOCK_DOM(so) == PF_INET6) {
6072 optval = inp_get_intcoproc_allowed(
6073 sotoinpcb(so));
6074 goto integer;
6075 } else {
6076 error = EOPNOTSUPP;
6077 }
6078 break;
6079
6080 case SO_LABEL:
6081 #if CONFIG_MACF_SOCKET
6082 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6083 sizeof(extmac))) != 0 ||
6084 (error = mac_socket_label_get(proc_ucred(
6085 sopt->sopt_p), so, &extmac)) != 0) {
6086 break;
6087 }
6088
6089 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6090 #else
6091 error = EOPNOTSUPP;
6092 #endif /* MAC_SOCKET */
6093 break;
6094
6095 case SO_PEERLABEL:
6096 #if CONFIG_MACF_SOCKET
6097 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6098 sizeof(extmac))) != 0 ||
6099 (error = mac_socketpeer_label_get(proc_ucred(
6100 sopt->sopt_p), so, &extmac)) != 0) {
6101 break;
6102 }
6103
6104 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6105 #else
6106 error = EOPNOTSUPP;
6107 #endif /* MAC_SOCKET */
6108 break;
6109
6110 #ifdef __APPLE_API_PRIVATE
6111 case SO_UPCALLCLOSEWAIT:
6112 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6113 goto integer;
6114 #endif
6115 case SO_RANDOMPORT:
6116 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6117 goto integer;
6118
6119 case SO_NP_EXTENSIONS: {
6120 struct so_np_extensions sonpx = {};
6121
6122 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6123 SONPX_SETOPTSHUT : 0;
6124 sonpx.npx_mask = SONPX_MASK_VALID;
6125
6126 error = sooptcopyout(sopt, &sonpx,
6127 sizeof(struct so_np_extensions));
6128 break;
6129 }
6130
6131 case SO_TRAFFIC_CLASS:
6132 optval = so->so_traffic_class;
6133 goto integer;
6134
6135 case SO_RECV_TRAFFIC_CLASS:
6136 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6137 goto integer;
6138
6139 case SO_TRAFFIC_CLASS_STATS:
6140 error = sooptcopyout(sopt, &so->so_tc_stats,
6141 sizeof(so->so_tc_stats));
6142 break;
6143
6144 #if (DEVELOPMENT || DEBUG)
6145 case SO_TRAFFIC_CLASS_DBG:
6146 error = sogetopt_tcdbg(so, sopt);
6147 break;
6148 #endif /* (DEVELOPMENT || DEBUG) */
6149
6150 case SO_PRIVILEGED_TRAFFIC_CLASS:
6151 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6152 goto integer;
6153
6154 case SO_DEFUNCTOK:
6155 optval = !(so->so_flags & SOF_NODEFUNCT);
6156 goto integer;
6157
6158 case SO_ISDEFUNCT:
6159 optval = (so->so_flags & SOF_DEFUNCT);
6160 goto integer;
6161
6162 case SO_OPPORTUNISTIC:
6163 optval = so_get_opportunistic(so);
6164 goto integer;
6165
6166 case SO_FLUSH:
6167 /* This option is not gettable */
6168 error = EINVAL;
6169 break;
6170
6171 case SO_RECV_ANYIF:
6172 optval = so_get_recv_anyif(so);
6173 goto integer;
6174
6175 case SO_TRAFFIC_MGT_BACKGROUND:
6176 /* This option is handled by lower layer(s) */
6177 if (so->so_proto != NULL &&
6178 so->so_proto->pr_ctloutput != NULL) {
6179 (void) so->so_proto->pr_ctloutput(so, sopt);
6180 }
6181 break;
6182
6183 #if FLOW_DIVERT
6184 case SO_FLOW_DIVERT_TOKEN:
6185 error = flow_divert_token_get(so, sopt);
6186 break;
6187 #endif /* FLOW_DIVERT */
6188
6189 #if NECP
6190 case SO_NECP_ATTRIBUTES:
6191 error = necp_get_socket_attributes(so, sopt);
6192 break;
6193
6194 case SO_NECP_CLIENTUUID: {
6195 uuid_t *ncu;
6196
6197 if (SOCK_DOM(so) == PF_MULTIPATH) {
6198 ncu = &mpsotomppcb(so)->necp_client_uuid;
6199 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6200 ncu = &sotoinpcb(so)->necp_client_uuid;
6201 } else {
6202 error = EINVAL;
6203 goto out;
6204 }
6205
6206 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6207 break;
6208 }
6209
6210 case SO_NECP_LISTENUUID: {
6211 uuid_t *nlu;
6212
6213 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6214 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6215 nlu = &sotoinpcb(so)->necp_client_uuid;
6216 } else {
6217 error = ENOENT;
6218 goto out;
6219 }
6220 } else {
6221 error = EINVAL;
6222 goto out;
6223 }
6224
6225 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6226 break;
6227 }
6228 #endif /* NECP */
6229
6230 #if CONTENT_FILTER
6231 case SO_CFIL_SOCK_ID: {
6232 cfil_sock_id_t sock_id;
6233
6234 sock_id = cfil_sock_id_from_socket(so);
6235
6236 error = sooptcopyout(sopt, &sock_id,
6237 sizeof(cfil_sock_id_t));
6238 break;
6239 }
6240 #endif /* CONTENT_FILTER */
6241
6242 case SO_EXTENDED_BK_IDLE:
6243 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6244 goto integer;
6245 case SO_MARK_CELLFALLBACK:
6246 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6247 ? 1 : 0;
6248 goto integer;
6249 case SO_NET_SERVICE_TYPE: {
6250 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6251 optval = so->so_netsvctype;
6252 } else {
6253 optval = NET_SERVICE_TYPE_BE;
6254 }
6255 goto integer;
6256 }
6257 case SO_NETSVC_MARKING_LEVEL:
6258 optval = so_get_netsvc_marking_level(so);
6259 goto integer;
6260
6261 case SO_MPKL_SEND_INFO: {
6262 struct so_mpkl_send_info so_mpkl_send_info;
6263
6264 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6265 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6266 error = sooptcopyout(sopt, &so_mpkl_send_info,
6267 sizeof(struct so_mpkl_send_info));
6268 break;
6269 }
6270 default:
6271 error = ENOPROTOOPT;
6272 break;
6273 }
6274 }
6275 out:
6276 if (dolock) {
6277 socket_unlock(so, 1);
6278 }
6279 return error;
6280 }
6281
6282 /*
6283 * The size limit on our soopt_getm() is different from that on FreeBSD.
6284 * We limit the size of options to MCLBYTES. This will have to change
6285 * if we need to define options that need more space than MCLBYTES.
6286 */
6287 int
6288 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6289 {
6290 struct mbuf *m, *m_prev;
6291 int sopt_size = sopt->sopt_valsize;
6292 int how;
6293
6294 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6295 return EMSGSIZE;
6296 }
6297
6298 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6299 MGET(m, how, MT_DATA);
6300 if (m == NULL) {
6301 return ENOBUFS;
6302 }
6303 if (sopt_size > MLEN) {
6304 MCLGET(m, how);
6305 if ((m->m_flags & M_EXT) == 0) {
6306 m_free(m);
6307 return ENOBUFS;
6308 }
6309 m->m_len = min(MCLBYTES, sopt_size);
6310 } else {
6311 m->m_len = min(MLEN, sopt_size);
6312 }
6313 sopt_size -= m->m_len;
6314 *mp = m;
6315 m_prev = m;
6316
6317 while (sopt_size > 0) {
6318 MGET(m, how, MT_DATA);
6319 if (m == NULL) {
6320 m_freem(*mp);
6321 return ENOBUFS;
6322 }
6323 if (sopt_size > MLEN) {
6324 MCLGET(m, how);
6325 if ((m->m_flags & M_EXT) == 0) {
6326 m_freem(*mp);
6327 m_freem(m);
6328 return ENOBUFS;
6329 }
6330 m->m_len = min(MCLBYTES, sopt_size);
6331 } else {
6332 m->m_len = min(MLEN, sopt_size);
6333 }
6334 sopt_size -= m->m_len;
6335 m_prev->m_next = m;
6336 m_prev = m;
6337 }
6338 return 0;
6339 }
6340
6341 /* copyin sopt data into mbuf chain */
6342 int
6343 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6344 {
6345 struct mbuf *m0 = m;
6346
6347 if (sopt->sopt_val == USER_ADDR_NULL) {
6348 return 0;
6349 }
6350 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6351 if (sopt->sopt_p != kernproc) {
6352 int error;
6353
6354 error = copyin(sopt->sopt_val, mtod(m, char *),
6355 m->m_len);
6356 if (error != 0) {
6357 m_freem(m0);
6358 return error;
6359 }
6360 } else {
6361 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6362 mtod(m, char *), m->m_len);
6363 }
6364 sopt->sopt_valsize -= m->m_len;
6365 sopt->sopt_val += m->m_len;
6366 m = m->m_next;
6367 }
6368 /* the chain should have been allocated large enough at ip6_sooptmcopyin() */
6369 if (m != NULL) {
6370 panic("soopt_mcopyin");
6371 /* NOTREACHED */
6372 }
6373 return 0;
6374 }
6375
6376 /* copyout mbuf chain data into soopt */
6377 int
6378 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6379 {
6380 struct mbuf *m0 = m;
6381 size_t valsize = 0;
6382
6383 if (sopt->sopt_val == USER_ADDR_NULL) {
6384 return 0;
6385 }
6386 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6387 if (sopt->sopt_p != kernproc) {
6388 int error;
6389
6390 error = copyout(mtod(m, char *), sopt->sopt_val,
6391 m->m_len);
6392 if (error != 0) {
6393 m_freem(m0);
6394 return error;
6395 }
6396 } else {
6397 bcopy(mtod(m, char *),
6398 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6399 }
6400 sopt->sopt_valsize -= m->m_len;
6401 sopt->sopt_val += m->m_len;
6402 valsize += m->m_len;
6403 m = m->m_next;
6404 }
6405 if (m != NULL) {
6406 /* a sufficiently large soopt buffer should be given from user-land */
6407 m_freem(m0);
6408 return EINVAL;
6409 }
6410 sopt->sopt_valsize = valsize;
6411 return 0;
6412 }
6413
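/*
 * Notify the socket owner that out-of-band data has arrived: deliver
 * SIGURG to the owning process group or process, wake up any select()
 * waiters on the receive buffer, and post NOTE_OOB to attached knotes.
 */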
6414 void
6415 sohasoutofband(struct socket *so)
6416 {
6417 if (so->so_pgid < 0) {
6418 gsignal(-so->so_pgid, SIGURG);
6419 } else if (so->so_pgid > 0) {
6420 proc_signal(so->so_pgid, SIGURG);
6421 }
6422 selwakeup(&so->so_rcv.sb_sel);
6423 if (so->so_rcv.sb_flags & SB_KNOTE) {
6424 KNOTE(&so->so_rcv.sb_sel.si_note,
6425 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6426 }
6427 }
6428
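/*
 * poll(2)/select(2) support: report read, write and exceptional (OOB)
 * readiness for the socket, and record the selinfo for the requested
 * events when none of them are currently pending.
 */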
6429 int
6430 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6431 {
6432 #pragma unused(cred)
6433 struct proc *p = current_proc();
6434 int revents = 0;
6435
6436 socket_lock(so, 1);
6437 so_update_last_owner_locked(so, PROC_NULL);
6438 so_update_policy(so);
6439
6440 if (events & (POLLIN | POLLRDNORM)) {
6441 if (soreadable(so)) {
6442 revents |= events & (POLLIN | POLLRDNORM);
6443 }
6444 }
6445
6446 if (events & (POLLOUT | POLLWRNORM)) {
6447 if (sowriteable(so)) {
6448 revents |= events & (POLLOUT | POLLWRNORM);
6449 }
6450 }
6451
6452 if (events & (POLLPRI | POLLRDBAND)) {
6453 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6454 revents |= events & (POLLPRI | POLLRDBAND);
6455 }
6456 }
6457
6458 if (revents == 0) {
6459 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6460 /*
6461 * Darwin sets the flag first,
6462 * BSD calls selrecord first
6463 */
6464 so->so_rcv.sb_flags |= SB_SEL;
6465 selrecord(p, &so->so_rcv.sb_sel, wql);
6466 }
6467
6468 if (events & (POLLOUT | POLLWRNORM)) {
6469 /*
6470 * Darwin sets the flag first,
6471 * BSD calls selrecord first
6472 */
6473 so->so_snd.sb_flags |= SB_SEL;
6474 selrecord(p, &so->so_snd.sb_sel, wql);
6475 }
6476 }
6477
6478 socket_unlock(so, 1);
6479 return revents;
6480 }
6481
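/*
 * kqueue attach entry point for sockets: map the requested filter
 * (EVFILT_READ/WRITE/SOCK/EXCEPT) to its socket sub-filter and call
 * that sub-filter's attach routine with the socket lock held.
 */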
6482 int
6483 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6484 {
6485 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6486 int result;
6487
6488 socket_lock(so, 1);
6489 so_update_last_owner_locked(so, PROC_NULL);
6490 so_update_policy(so);
6491
6492 #if CONFIG_MACF_SOCKET
6493 proc_t p = knote_get_kq(kn)->kq_p;
6494 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6495 socket_unlock(so, 1);
6496 knote_set_error(kn, EPERM);
6497 return 0;
6498 }
6499 #endif /* CONFIG_MACF_SOCKET */
6500
6501 switch (kn->kn_filter) {
6502 case EVFILT_READ:
6503 kn->kn_filtid = EVFILTID_SOREAD;
6504 break;
6505 case EVFILT_WRITE:
6506 kn->kn_filtid = EVFILTID_SOWRITE;
6507 break;
6508 case EVFILT_SOCK:
6509 kn->kn_filtid = EVFILTID_SCK;
6510 break;
6511 case EVFILT_EXCEPT:
6512 kn->kn_filtid = EVFILTID_SOEXCEPT;
6513 break;
6514 default:
6515 socket_unlock(so, 1);
6516 knote_set_error(kn, EINVAL);
6517 return 0;
6518 }
6519
6520 /*
6521 * call the appropriate sub-filter attach
6522 * with the socket still locked
6523 */
6524 result = knote_fops(kn)->f_attach(kn, kev);
6525
6526 socket_unlock(so, 1);
6527
6528 return result;
6529 }
6530
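/*
 * Common read-filter evaluation.  For listening sockets the event fires
 * when completed connections are queued; otherwise it reports readable
 * data (excluding control bytes), OOB data, EOF and error conditions
 * against the effective low water mark.
 */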
6531 static int
6532 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6533 {
6534 int retval = 0;
6535 int64_t data = 0;
6536
6537 if (so->so_options & SO_ACCEPTCONN) {
6538 /*
6539 * Radar 6615193 handle the listen case dynamically
6540 * for the kqueue read filter. This allows listen() to be called
6541 * after registering for the kqueue EVFILT_READ event.
6542 */
6543
6544 retval = !TAILQ_EMPTY(&so->so_comp);
6545 data = so->so_qlen;
6546 goto out;
6547 }
6548
6549 /* socket isn't a listener */
6550 /*
6551 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6552 * the bytes of protocol data. We therefore exclude any
6553 * control bytes.
6554 */
6555 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6556
6557 if (kn->kn_sfflags & NOTE_OOB) {
6558 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6559 kn->kn_fflags |= NOTE_OOB;
6560 data -= so->so_oobmark;
6561 retval = 1;
6562 goto out;
6563 }
6564 }
6565
6566 if ((so->so_state & SS_CANTRCVMORE)
6567 #if CONTENT_FILTER
6568 && cfil_sock_data_pending(&so->so_rcv) == 0
6569 #endif /* CONTENT_FILTER */
6570 ) {
6571 kn->kn_flags |= EV_EOF;
6572 kn->kn_fflags = so->so_error;
6573 retval = 1;
6574 goto out;
6575 }
6576
6577 if (so->so_error) { /* temporary udp error */
6578 retval = 1;
6579 goto out;
6580 }
6581
6582 int64_t lowwat = so->so_rcv.sb_lowat;
6583 /*
6584 * Ensure that when NOTE_LOWAT is used, the derived
6585 * low water mark is bounded by socket's rcv buf's
6586 * high and low water mark values.
6587 */
6588 if (kn->kn_sfflags & NOTE_LOWAT) {
6589 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6590 lowwat = so->so_rcv.sb_hiwat;
6591 } else if (kn->kn_sdata > lowwat) {
6592 lowwat = kn->kn_sdata;
6593 }
6594 }
6595
6596 /*
6597 * While the `data` field is the amount of data to read,
6598 * 0-sized packets need to wake up the kqueue, see 58140856,
6599 * so we need to take control bytes into account too.
6600 */
6601 retval = (so->so_rcv.sb_cc >= lowwat);
6602
6603 out:
6604 if (retval && kev) {
6605 knote_fill_kevent(kn, kev, data);
6606 }
6607 return retval;
6608 }
6609
6610 static int
6611 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6612 {
6613 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6614
6615 /* socket locked */
6616
6617 /*
6618 * If the caller explicitly asked for OOB results (e.g. poll())
6619 * from EVFILT_READ, then save that off in the kn_hook32 field
6620 * and reserve the kn_flags EV_OOBAND bit for output only.
6621 */
6622 if (kn->kn_filter == EVFILT_READ &&
6623 kn->kn_flags & EV_OOBAND) {
6624 kn->kn_flags &= ~EV_OOBAND;
6625 kn->kn_hook32 = EV_OOBAND;
6626 } else {
6627 kn->kn_hook32 = 0;
6628 }
6629 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6630 so->so_rcv.sb_flags |= SB_KNOTE;
6631 }
6632
6633 /* indicate whether the event has already fired */
6634 return filt_soread_common(kn, NULL, so);
6635 }
6636
6637 static void
6638 filt_sordetach(struct knote *kn)
6639 {
6640 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6641
6642 socket_lock(so, 1);
6643 if (so->so_rcv.sb_flags & SB_KNOTE) {
6644 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6645 so->so_rcv.sb_flags &= ~SB_KNOTE;
6646 }
6647 }
6648 socket_unlock(so, 1);
6649 }
6650
6651 /*ARGSUSED*/
6652 static int
6653 filt_soread(struct knote *kn, long hint)
6654 {
6655 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6656 int retval;
6657
6658 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6659 socket_lock(so, 1);
6660 }
6661
6662 retval = filt_soread_common(kn, NULL, so);
6663
6664 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6665 socket_unlock(so, 1);
6666 }
6667
6668 return retval;
6669 }
6670
6671 static int
6672 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6673 {
6674 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6675 int retval;
6676
6677 socket_lock(so, 1);
6678
6679 /* save off the new input fflags and data */
6680 kn->kn_sfflags = kev->fflags;
6681 kn->kn_sdata = kev->data;
6682
6683 /* determine if changes result in fired events */
6684 retval = filt_soread_common(kn, NULL, so);
6685
6686 socket_unlock(so, 1);
6687
6688 return retval;
6689 }
6690
6691 static int
6692 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6693 {
6694 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6695 int retval;
6696
6697 socket_lock(so, 1);
6698 retval = filt_soread_common(kn, kev, so);
6699 socket_unlock(so, 1);
6700
6701 return retval;
6702 }
6703
6704 int
6705 so_wait_for_if_feedback(struct socket *so)
6706 {
6707 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6708 (so->so_state & SS_ISCONNECTED)) {
6709 struct inpcb *inp = sotoinpcb(so);
6710 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6711 return 1;
6712 }
6713 }
6714 return 0;
6715 }
6716
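/*
 * Common write-filter evaluation: fire on EOF (SS_CANTSENDMORE), pending
 * socket errors, or when pre-connect data may be sent; otherwise compare
 * the available send buffer space against the effective low water mark,
 * deferring to the TCP/MPTCP not-sent low water checks when
 * SOF_NOTSENT_LOWAT is set.
 */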
6717 static int
6718 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6719 {
6720 int ret = 0;
6721 int64_t data = sbspace(&so->so_snd);
6722
6723 if (so->so_state & SS_CANTSENDMORE) {
6724 kn->kn_flags |= EV_EOF;
6725 kn->kn_fflags = so->so_error;
6726 ret = 1;
6727 goto out;
6728 }
6729
6730 if (so->so_error) { /* temporary udp error */
6731 ret = 1;
6732 goto out;
6733 }
6734
6735 if (!socanwrite(so)) {
6736 ret = 0;
6737 goto out;
6738 }
6739
6740 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6741 ret = 1;
6742 goto out;
6743 }
6744
6745 int64_t lowwat = so->so_snd.sb_lowat;
6746
6747 if (kn->kn_sfflags & NOTE_LOWAT) {
6748 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6749 lowwat = so->so_snd.sb_hiwat;
6750 } else if (kn->kn_sdata > lowwat) {
6751 lowwat = kn->kn_sdata;
6752 }
6753 }
6754
6755 if (data >= lowwat) {
6756 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6757 #if (DEBUG || DEVELOPMENT)
6758 && so_notsent_lowat_check == 1
6759 #endif /* DEBUG || DEVELOPMENT */
6760 ) {
6761 if ((SOCK_DOM(so) == PF_INET ||
6762 SOCK_DOM(so) == PF_INET6) &&
6763 so->so_type == SOCK_STREAM) {
6764 ret = tcp_notsent_lowat_check(so);
6765 }
6766 #if MPTCP
6767 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6768 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6769 ret = mptcp_notsent_lowat_check(so);
6770 }
6771 #endif /* MPTCP */
6772 else {
6773 ret = 1;
6774 goto out;
6775 }
6776 } else {
6777 ret = 1;
6778 }
6779 }
6780 if (so_wait_for_if_feedback(so)) {
6781 ret = 0;
6782 }
6783
6784 out:
6785 if (ret && kev) {
6786 knote_fill_kevent(kn, kev, data);
6787 }
6788 return ret;
6789 }
6790
6791 static int
6792 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6793 {
6794 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6795
6796 /* socket locked */
6797 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6798 so->so_snd.sb_flags |= SB_KNOTE;
6799 }
6800
6801 /* determine if the event has already fired */
6802 return filt_sowrite_common(kn, NULL, so);
6803 }
6804
6805 static void
6806 filt_sowdetach(struct knote *kn)
6807 {
6808 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6809 socket_lock(so, 1);
6810
6811 if (so->so_snd.sb_flags & SB_KNOTE) {
6812 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6813 so->so_snd.sb_flags &= ~SB_KNOTE;
6814 }
6815 }
6816 socket_unlock(so, 1);
6817 }
6818
6819 /*ARGSUSED*/
6820 static int
6821 filt_sowrite(struct knote *kn, long hint)
6822 {
6823 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6824 int ret;
6825
6826 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6827 socket_lock(so, 1);
6828 }
6829
6830 ret = filt_sowrite_common(kn, NULL, so);
6831
6832 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6833 socket_unlock(so, 1);
6834 }
6835
6836 return ret;
6837 }
6838
6839 static int
6840 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6841 {
6842 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6843 int ret;
6844
6845 socket_lock(so, 1);
6846
6847 /* save off the new input fflags and data */
6848 kn->kn_sfflags = kev->fflags;
6849 kn->kn_sdata = kev->data;
6850
6851 /* determine if these changes result in a triggered event */
6852 ret = filt_sowrite_common(kn, NULL, so);
6853
6854 socket_unlock(so, 1);
6855
6856 return ret;
6857 }
6858
6859 static int
6860 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6861 {
6862 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6863 int ret;
6864
6865 socket_lock(so, 1);
6866 ret = filt_sowrite_common(kn, kev, so);
6867 socket_unlock(so, 1);
6868
6869 return ret;
6870 }
6871
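/*
 * Common EVFILT_SOCK evaluation: translate the event hints and current
 * socket state into NOTE_* result flags.  Level-triggered events that
 * were already delivered are tracked in kn_hook32 so that they do not
 * wake up the application more than once while they remain active.
 */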
6872 static int
6873 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6874 struct socket *so, long ev_hint)
6875 {
6876 int ret = 0;
6877 int64_t data = 0;
6878 uint32_t level_trigger = 0;
6879
6880 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6881 kn->kn_fflags |= NOTE_CONNRESET;
6882 }
6883 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6884 kn->kn_fflags |= NOTE_TIMEOUT;
6885 }
6886 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6887 kn->kn_fflags |= NOTE_NOSRCADDR;
6888 }
6889 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6890 kn->kn_fflags |= NOTE_IFDENIED;
6891 }
6892 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6893 kn->kn_fflags |= NOTE_KEEPALIVE;
6894 }
6895 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6896 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6897 }
6898 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6899 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6900 }
6901 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6902 (so->so_state & SS_ISCONNECTED)) {
6903 kn->kn_fflags |= NOTE_CONNECTED;
6904 level_trigger |= NOTE_CONNECTED;
6905 }
6906 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6907 (so->so_state & SS_ISDISCONNECTED)) {
6908 kn->kn_fflags |= NOTE_DISCONNECTED;
6909 level_trigger |= NOTE_DISCONNECTED;
6910 }
6911 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6912 if (so->so_proto != NULL &&
6913 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6914 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6915 }
6916 }
6917
6918 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6919 tcp_notify_ack_active(so)) {
6920 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6921 }
6922
6923 if ((so->so_state & SS_CANTRCVMORE)
6924 #if CONTENT_FILTER
6925 && cfil_sock_data_pending(&so->so_rcv) == 0
6926 #endif /* CONTENT_FILTER */
6927 ) {
6928 kn->kn_fflags |= NOTE_READCLOSED;
6929 level_trigger |= NOTE_READCLOSED;
6930 }
6931
6932 if (so->so_state & SS_CANTSENDMORE) {
6933 kn->kn_fflags |= NOTE_WRITECLOSED;
6934 level_trigger |= NOTE_WRITECLOSED;
6935 }
6936
6937 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6938 (so->so_flags & SOF_SUSPENDED)) {
6939 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6940
6941 /* If resume event was delivered before, reset it */
6942 kn->kn_hook32 &= ~NOTE_RESUME;
6943
6944 kn->kn_fflags |= NOTE_SUSPEND;
6945 level_trigger |= NOTE_SUSPEND;
6946 }
6947
6948 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6949 (so->so_flags & SOF_SUSPENDED) == 0) {
6950 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6951
6952 /* If suspend event was delivered before, reset it */
6953 kn->kn_hook32 &= ~NOTE_SUSPEND;
6954
6955 kn->kn_fflags |= NOTE_RESUME;
6956 level_trigger |= NOTE_RESUME;
6957 }
6958
6959 if (so->so_error != 0) {
6960 ret = 1;
6961 data = so->so_error;
6962 kn->kn_flags |= EV_EOF;
6963 } else {
6964 u_int32_t data32;
6965 get_sockev_state(so, &data32);
6966 data = data32;
6967 }
6968
6969 /* Reset any events that are not requested on this knote */
6970 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6971 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6972
6973 /* Find the level triggered events that are already delivered */
6974 level_trigger &= kn->kn_hook32;
6975 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6976
6977 /* Do not deliver level triggered events more than once */
6978 if ((kn->kn_fflags & ~level_trigger) != 0) {
6979 ret = 1;
6980 }
6981
6982 if (ret && kev) {
6983 /*
6984 * Store the state of the events being delivered. This
6985 * state can be used to deliver level triggered events
6986 * at least once and still avoid waking up the application
6987 * multiple times as long as the event is active.
6988 */
6989 if (kn->kn_fflags != 0) {
6990 kn->kn_hook32 |= (kn->kn_fflags &
6991 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6992 }
6993
6994 /*
6995 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6996 * only one of them and remember which one was
6997 * delivered last.
6998 */
6999 if (kn->kn_fflags & NOTE_SUSPEND) {
7000 kn->kn_hook32 &= ~NOTE_RESUME;
7001 }
7002 if (kn->kn_fflags & NOTE_RESUME) {
7003 kn->kn_hook32 &= ~NOTE_SUSPEND;
7004 }
7005
7006 knote_fill_kevent(kn, kev, data);
7007 }
7008 return ret;
7009 }
7010
7011 static int
7012 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7013 {
7014 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7015
7016 /* socket locked */
7017 kn->kn_hook32 = 0;
7018 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7019 so->so_flags |= SOF_KNOTE;
7020 }
7021
7022 /* determine if event already fired */
7023 return filt_sockev_common(kn, NULL, so, 0);
7024 }
7025
7026 static void
7027 filt_sockdetach(struct knote *kn)
7028 {
7029 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7030 socket_lock(so, 1);
7031
7032 if ((so->so_flags & SOF_KNOTE) != 0) {
7033 if (KNOTE_DETACH(&so->so_klist, kn)) {
7034 so->so_flags &= ~SOF_KNOTE;
7035 }
7036 }
7037 socket_unlock(so, 1);
7038 }
7039
7040 static int
7041 filt_sockev(struct knote *kn, long hint)
7042 {
7043 int ret = 0, locked = 0;
7044 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7045 long ev_hint = (hint & SO_FILT_HINT_EV);
7046
7047 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7048 socket_lock(so, 1);
7049 locked = 1;
7050 }
7051
7052 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7053
7054 if (locked) {
7055 socket_unlock(so, 1);
7056 }
7057
7058 return ret;
7059 }
7060
7061
7062
7063 /*
7064 * filt_socktouch - update event state
7065 */
7066 static int
7067 filt_socktouch(
7068 struct knote *kn,
7069 struct kevent_qos_s *kev)
7070 {
7071 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7072 uint32_t changed_flags;
7073 int ret;
7074
7075 socket_lock(so, 1);
7076
7077 /* note which fflags changed relative to the already-delivered state */
7078 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7079
7080 /* save off the new input fflags and data */
7081 kn->kn_sfflags = kev->fflags;
7082 kn->kn_sdata = kev->data;
7083
7084 /* restrict the current results to the (smaller?) set of new interest */
7085 /*
7086 * For compatibility with previous implementations, we leave kn_fflags
7087 * as they were before.
7088 */
7089 //kn->kn_fflags &= kev->fflags;
7090
7091 /*
7092 * Since we keep track of events that are already
7093 * delivered, if any of those events are not requested
7094 * anymore, the state related to them can be reset.
7095 */
7096 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7097
7098 /* determine if we have events to deliver */
7099 ret = filt_sockev_common(kn, NULL, so, 0);
7100
7101 socket_unlock(so, 1);
7102
7103 return ret;
7104 }
7105
7106 /*
7107 * filt_sockprocess - query event fired state and return data
7108 */
7109 static int
7110 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7111 {
7112 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7113 int ret = 0;
7114
7115 socket_lock(so, 1);
7116
7117 ret = filt_sockev_common(kn, kev, so, 0);
7118
7119 socket_unlock(so, 1);
7120
7121 return ret;
7122 }
7123
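/*
 * Derive the SOCKEV_CONNECTED/SOCKEV_DISCONNECTED bits from the socket
 * state, unless the caller-supplied state already carries a value from
 * a previous event.
 */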
7124 void
7125 get_sockev_state(struct socket *so, u_int32_t *statep)
7126 {
7127 u_int32_t state = *(statep);
7128
7129 /*
7130 * If the state variable already carries a value from a previous
7131 * event, leave it untouched.
7132 */
7133 if (state != 0) {
7134 return;
7135 }
7136
7137 if (so->so_state & SS_ISCONNECTED) {
7138 state |= SOCKEV_CONNECTED;
7139 } else {
7140 state &= ~(SOCKEV_CONNECTED);
7141 }
7142 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7143 *(statep) = state;
7144 }
7145
7146 #define SO_LOCK_HISTORY_STR_LEN \
7147 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7148
7149 __private_extern__ const char *
7150 solockhistory_nr(struct socket *so)
7151 {
7152 size_t n = 0;
7153 int i;
7154 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7155
7156 bzero(lock_history_str, sizeof(lock_history_str));
7157 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7158 n += scnprintf(lock_history_str + n,
7159 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7160 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7161 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7162 }
7163 return lock_history_str;
7164 }
7165
7166 lck_mtx_t *
7167 socket_getlock(struct socket *so, int flags)
7168 {
7169 if (so->so_proto->pr_getlock != NULL) {
7170 return (*so->so_proto->pr_getlock)(so, flags);
7171 } else {
7172 return so->so_proto->pr_domain->dom_mtx;
7173 }
7174 }
7175
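/*
 * Lock the socket, using the protocol's pr_lock callback when one is
 * provided and the domain mutex otherwise.  A non-zero refcount also
 * takes a use-count reference, and the caller's return address is
 * recorded for lock-history debugging.
 */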
7176 void
7177 socket_lock(struct socket *so, int refcount)
7178 {
7179 void *lr_saved;
7180
7181 lr_saved = __builtin_return_address(0);
7182
7183 if (so->so_proto->pr_lock) {
7184 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7185 } else {
7186 #ifdef MORE_LOCKING_DEBUG
7187 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7188 LCK_MTX_ASSERT_NOTOWNED);
7189 #endif
7190 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7191 if (refcount) {
7192 so->so_usecount++;
7193 }
7194 so->lock_lr[so->next_lock_lr] = lr_saved;
7195 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7196 }
7197 }
7198
7199 void
7200 socket_lock_assert_owned(struct socket *so)
7201 {
7202 lck_mtx_t *mutex_held;
7203
7204 if (so->so_proto->pr_getlock != NULL) {
7205 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7206 } else {
7207 mutex_held = so->so_proto->pr_domain->dom_mtx;
7208 }
7209
7210 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7211 }
7212
7213 int
7214 socket_try_lock(struct socket *so)
7215 {
7216 lck_mtx_t *mtx;
7217
7218 if (so->so_proto->pr_getlock != NULL) {
7219 mtx = (*so->so_proto->pr_getlock)(so, 0);
7220 } else {
7221 mtx = so->so_proto->pr_domain->dom_mtx;
7222 }
7223
7224 return lck_mtx_try_lock(mtx);
7225 }
7226
7227 void
7228 socket_unlock(struct socket *so, int refcount)
7229 {
7230 void *lr_saved;
7231 lck_mtx_t *mutex_held;
7232
7233 lr_saved = __builtin_return_address(0);
7234
7235 if (so == NULL || so->so_proto == NULL) {
7236 panic("%s: null so_proto so=%p\n", __func__, so);
7237 /* NOTREACHED */
7238 }
7239
7240 if (so->so_proto->pr_unlock) {
7241 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7242 } else {
7243 mutex_held = so->so_proto->pr_domain->dom_mtx;
7244 #ifdef MORE_LOCKING_DEBUG
7245 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7246 #endif
7247 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7248 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7249
7250 if (refcount) {
7251 if (so->so_usecount <= 0) {
7252 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7253 "lrh=%s", __func__, so->so_usecount, so,
7254 SOCK_DOM(so), so->so_type,
7255 SOCK_PROTO(so), solockhistory_nr(so));
7256 /* NOTREACHED */
7257 }
7258
7259 so->so_usecount--;
7260 if (so->so_usecount == 0) {
7261 sofreelastref(so, 1);
7262 }
7263 }
7264 lck_mtx_unlock(mutex_held);
7265 }
7266 }
7267
7268 /* Called with socket locked, will unlock socket */
7269 void
7270 sofree(struct socket *so)
7271 {
7272 lck_mtx_t *mutex_held;
7273
7274 if (so->so_proto->pr_getlock != NULL) {
7275 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7276 } else {
7277 mutex_held = so->so_proto->pr_domain->dom_mtx;
7278 }
7279 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7280
7281 sofreelastref(so, 0);
7282 }
7283
7284 void
7285 soreference(struct socket *so)
7286 {
7287 socket_lock(so, 1); /* lock and take one reference on the socket */
7288 socket_unlock(so, 0); /* unlock only */
7289 }
7290
7291 void
7292 sodereference(struct socket *so)
7293 {
7294 socket_lock(so, 0);
7295 socket_unlock(so, 1);
7296 }
7297
7298 /*
7299 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7300 * possibility of using jumbo clusters. Caller must ensure to hold
7301 * the socket lock.
7302 */
7303 void
7304 somultipages(struct socket *so, boolean_t set)
7305 {
7306 if (set) {
7307 so->so_flags |= SOF_MULTIPAGES;
7308 } else {
7309 so->so_flags &= ~SOF_MULTIPAGES;
7310 }
7311 }
7312
7313 void
7314 soif2kcl(struct socket *so, boolean_t set)
7315 {
7316 if (set) {
7317 so->so_flags1 |= SOF1_IF_2KCL;
7318 } else {
7319 so->so_flags1 &= ~SOF1_IF_2KCL;
7320 }
7321 }
7322
7323 int
7324 so_isdstlocal(struct socket *so)
7325 {
7326 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7327
7328 if (SOCK_DOM(so) == PF_INET) {
7329 return inaddr_local(inp->inp_faddr);
7330 } else if (SOCK_DOM(so) == PF_INET6) {
7331 return in6addr_local(&inp->in6p_faddr);
7332 }
7333
7334 return 0;
7335 }
7336
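/*
 * Mark a socket as defunct unless it is exempt (SOF_NODEFUNCT without
 * force, or eligible for extended background idle).  Marking sets
 * SB_DROP on both socket buffers so no more data is appended, and any
 * data already queued is flushed.
 */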
7337 int
7338 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7339 {
7340 struct sockbuf *rcv, *snd;
7341 int err = 0, defunct;
7342
7343 rcv = &so->so_rcv;
7344 snd = &so->so_snd;
7345
7346 defunct = (so->so_flags & SOF_DEFUNCT);
7347 if (defunct) {
7348 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7349 panic("%s: SB_DROP not set", __func__);
7350 /* NOTREACHED */
7351 }
7352 goto done;
7353 }
7354
7355 if (so->so_flags & SOF_NODEFUNCT) {
7356 if (noforce) {
7357 err = EOPNOTSUPP;
7358 if (p != PROC_NULL) {
7359 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7360 "name %s level %d) so 0x%llx [%d,%d] "
7361 "is not eligible for defunct "
7362 "(%d)\n", __func__, proc_selfpid(),
7363 proc_best_name(current_proc()), proc_pid(p),
7364 proc_best_name(p), level,
7365 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7366 SOCK_DOM(so), SOCK_TYPE(so), err);
7367 }
7368 return err;
7369 }
7370 so->so_flags &= ~SOF_NODEFUNCT;
7371 if (p != PROC_NULL) {
7372 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7373 "name %s level %d) so 0x%llx [%d,%d] "
7374 "defunct by force "
7375 "(%d)\n", __func__, proc_selfpid(),
7376 proc_best_name(current_proc()), proc_pid(p),
7377 proc_best_name(p), level,
7378 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7379 SOCK_DOM(so), SOCK_TYPE(so), err);
7380 }
7381 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7382 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7383 struct ifnet *ifp = inp->inp_last_outifp;
7384
7385 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7386 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7387 } else if (so->so_flags & SOF_DELEGATED) {
7388 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7389 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7390 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7391 } else if (noforce && p != PROC_NULL) {
7392 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7393
7394 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7395 so->so_extended_bk_start = net_uptime();
7396 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7397
7398 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7399
7400 err = EOPNOTSUPP;
7401 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7402 "name %s level %d) so 0x%llx [%d,%d] "
7403 "extend bk idle "
7404 "(%d)\n", __func__, proc_selfpid(),
7405 proc_best_name(current_proc()), proc_pid(p),
7406 proc_best_name(p), level,
7407 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7408 SOCK_DOM(so), SOCK_TYPE(so), err);
7409 return err;
7410 } else {
7411 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7412 }
7413 }
7414
7415 so->so_flags |= SOF_DEFUNCT;
7416
7417 /* Prevent further data from being appended to the socket buffers */
7418 snd->sb_flags |= SB_DROP;
7419 rcv->sb_flags |= SB_DROP;
7420
7421 /* Flush any existing data in the socket buffers */
7422 if (rcv->sb_cc != 0) {
7423 rcv->sb_flags &= ~SB_SEL;
7424 selthreadclear(&rcv->sb_sel);
7425 sbrelease(rcv);
7426 }
7427 if (snd->sb_cc != 0) {
7428 snd->sb_flags &= ~SB_SEL;
7429 selthreadclear(&snd->sb_sel);
7430 sbrelease(snd);
7431 }
7432
7433 done:
7434 if (p != PROC_NULL) {
7435 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7436 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7437 proc_selfpid(), proc_best_name(current_proc()),
7438 proc_pid(p), proc_best_name(p), level,
7439 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7440 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7441 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7442 " extbkidle" : "");
7443 }
7444 return err;
7445 }
7446
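/*
 * Complete the defunct process for a socket previously marked with
 * SOF_DEFUNCT: wake up blocked threads, shut down both directions,
 * disconnect, flush whatever remains in the socket buffers and set
 * SS_DEFUNCT.
 */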
7447 int
7448 sodefunct(struct proc *p, struct socket *so, int level)
7449 {
7450 struct sockbuf *rcv, *snd;
7451
7452 if (!(so->so_flags & SOF_DEFUNCT)) {
7453 panic("%s improperly called", __func__);
7454 /* NOTREACHED */
7455 }
7456 if (so->so_state & SS_DEFUNCT) {
7457 goto done;
7458 }
7459
7460 rcv = &so->so_rcv;
7461 snd = &so->so_snd;
7462
7463 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7464 char s[MAX_IPv6_STR_LEN];
7465 char d[MAX_IPv6_STR_LEN];
7466 struct inpcb *inp = sotoinpcb(so);
7467
7468 if (p != PROC_NULL) {
7469 SODEFUNCTLOG(
7470 "%s[%d, %s]: (target pid %d name %s level %d) "
7471 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7472 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7473 " snd_fl 0x%x]\n", __func__,
7474 proc_selfpid(), proc_best_name(current_proc()),
7475 proc_pid(p), proc_best_name(p), level,
7476 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7477 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7478 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7479 (void *)&inp->inp_laddr.s_addr :
7480 (void *)&inp->in6p_laddr),
7481 s, sizeof(s)), ntohs(inp->in6p_lport),
7482 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7483 (void *)&inp->inp_faddr.s_addr :
7484 (void *)&inp->in6p_faddr,
7485 d, sizeof(d)), ntohs(inp->in6p_fport),
7486 (uint32_t)rcv->sb_sel.si_flags,
7487 (uint32_t)snd->sb_sel.si_flags,
7488 rcv->sb_flags, snd->sb_flags);
7489 }
7490 } else if (p != PROC_NULL) {
7491 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7492 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7493 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7494 proc_selfpid(), proc_best_name(current_proc()),
7495 proc_pid(p), proc_best_name(p), level,
7496 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7497 SOCK_DOM(so), SOCK_TYPE(so),
7498 (uint32_t)rcv->sb_sel.si_flags,
7499 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7500 snd->sb_flags);
7501 }
7502
7503 /*
7504 * Unwedge threads blocked on sbwait() and sb_lock().
7505 */
7506 sbwakeup(rcv);
7507 sbwakeup(snd);
7508
7509 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7510 if (rcv->sb_flags & SB_LOCK) {
7511 sbunlock(rcv, TRUE); /* keep socket locked */
7512 }
7513 if (snd->sb_flags & SB_LOCK) {
7514 sbunlock(snd, TRUE); /* keep socket locked */
7515 }
7516 /*
7517 * Flush the buffers and disconnect. We explicitly call shutdown
7518 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7519 * states are set for the socket. This would also flush out data
7520 * hanging off the receive list of this socket.
7521 */
7522 (void) soshutdownlock_final(so, SHUT_RD);
7523 (void) soshutdownlock_final(so, SHUT_WR);
7524 (void) sodisconnectlocked(so);
7525
7526 /*
7527 * Explicitly handle connectionless-protocol disconnection
7528 * and release any remaining data in the socket buffers.
7529 */
7530 if (!(so->so_state & SS_ISDISCONNECTED)) {
7531 (void) soisdisconnected(so);
7532 }
7533
7534 if (so->so_error == 0) {
7535 so->so_error = EBADF;
7536 }
7537
7538 if (rcv->sb_cc != 0) {
7539 rcv->sb_flags &= ~SB_SEL;
7540 selthreadclear(&rcv->sb_sel);
7541 sbrelease(rcv);
7542 }
7543 if (snd->sb_cc != 0) {
7544 snd->sb_flags &= ~SB_SEL;
7545 selthreadclear(&snd->sb_sel);
7546 sbrelease(snd);
7547 }
7548 so->so_state |= SS_DEFUNCT;
7549 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7550
7551 done:
7552 return 0;
7553 }
7554
7555 int
7556 soresume(struct proc *p, struct socket *so, int locked)
7557 {
7558 if (locked == 0) {
7559 socket_lock(so, 1);
7560 }
7561
7562 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7563 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7564 "[%d,%d] resumed from bk idle\n",
7565 __func__, proc_selfpid(), proc_best_name(current_proc()),
7566 proc_pid(p), proc_best_name(p),
7567 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7568 SOCK_DOM(so), SOCK_TYPE(so));
7569
7570 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7571 so->so_extended_bk_start = 0;
7572 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7573
7574 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7575 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7576 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7577 }
7578 if (locked == 0) {
7579 socket_unlock(so, 1);
7580 }
7581
7582 return 0;
7583 }
7584
7585 /*
7586 * Does not attempt to account for sockets that are delegated from
7587 * the current process
7588 */
7589 int
7590 so_set_extended_bk_idle(struct socket *so, int optval)
7591 {
7592 int error = 0;
7593
7594 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7595 SOCK_PROTO(so) != IPPROTO_TCP) {
7596 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7597 error = EOPNOTSUPP;
7598 } else if (optval == 0) {
7599 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7600
7601 soresume(current_proc(), so, 1);
7602 } else {
7603 struct proc *p = current_proc();
7604 int i;
7605 struct filedesc *fdp;
7606 int count = 0;
7607
7608 /*
7609 * Unlock the socket to avoid a lock ordering issue with
7610 * the proc fd table lock
7611 */
7612 socket_unlock(so, 0);
7613
7614 proc_fdlock(p);
7615
7616 fdp = p->p_fd;
7617 for (i = 0; i < fdp->fd_nfiles; i++) {
7618 struct fileproc *fp = fdp->fd_ofiles[i];
7619 struct socket *so2;
7620
7621 if (fp == NULL ||
7622 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7623 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7624 continue;
7625 }
7626
7627 so2 = (struct socket *)fp->f_fglob->fg_data;
7628 if (so != so2 &&
7629 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7630 count++;
7631 }
7632 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7633 break;
7634 }
7635 }
7636 proc_fdunlock(p);
7637
7638 socket_lock(so, 0);
7639
7640 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7641 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7642 error = EBUSY;
7643 } else if (so->so_flags & SOF_DELEGATED) {
7644 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7645 error = EBUSY;
7646 } else {
7647 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7648 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7649 }
7650 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7651 "%s marked for extended bk idle\n",
7652 __func__, proc_selfpid(), proc_best_name(current_proc()),
7653 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7654 SOCK_DOM(so), SOCK_TYPE(so),
7655 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7656 "is" : "not");
7657 }
7658
7659 return error;
7660 }
7661
7662 static void
7663 so_stop_extended_bk_idle(struct socket *so)
7664 {
7665 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7666 so->so_extended_bk_start = 0;
7667
7668 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7669 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7670 /*
7671 * Force defunct
7672 */
7673 sosetdefunct(current_proc(), so,
7674 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7675 if (so->so_flags & SOF_DEFUNCT) {
7676 sodefunct(current_proc(), so,
7677 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7678 }
7679 }
7680
7681 void
7682 so_drain_extended_bk_idle(struct socket *so)
7683 {
7684 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7685 /*
7686 * Only penalize sockets that have outstanding data
7687 */
7688 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7689 so_stop_extended_bk_idle(so);
7690
7691 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7692 }
7693 }
7694 }
7695
7696 /*
7697 * Return value tells whether the socket is still in extended background idle
7698 */
7699 int
7700 so_check_extended_bk_idle_time(struct socket *so)
7701 {
7702 int ret = 1;
7703
7704 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7705 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7706 __func__, proc_selfpid(), proc_best_name(current_proc()),
7707 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7708 SOCK_DOM(so), SOCK_TYPE(so));
7709 if (net_uptime() - so->so_extended_bk_start >
7710 soextbkidlestat.so_xbkidle_time) {
7711 so_stop_extended_bk_idle(so);
7712
7713 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7714
7715 ret = 0;
7716 } else {
7717 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7718
7719 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7720 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7721 }
7722 }
7723
7724 return ret;
7725 }
7726
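/*
 * When the process has extended-background-idle sockets in progress,
 * walk its file table and resume every socket, then clear the
 * P_LXBKIDLEINPROG flag.
 */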
7727 void
7728 resume_proc_sockets(proc_t p)
7729 {
7730 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7731 struct filedesc *fdp;
7732 int i;
7733
7734 proc_fdlock(p);
7735 fdp = p->p_fd;
7736 for (i = 0; i < fdp->fd_nfiles; i++) {
7737 struct fileproc *fp;
7738 struct socket *so;
7739
7740 fp = fdp->fd_ofiles[i];
7741 if (fp == NULL ||
7742 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7743 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7744 continue;
7745 }
7746
7747 so = (struct socket *)fp->f_fglob->fg_data;
7748 (void) soresume(p, so, 0);
7749 }
7750 proc_fdunlock(p);
7751
7752 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7753 }
7754 }
7755
7756 __private_extern__ int
7757 so_set_recv_anyif(struct socket *so, int optval)
7758 {
7759 int ret = 0;
7760
7761 #if INET6
7762 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7763 #else
7764 if (SOCK_DOM(so) == PF_INET) {
7765 #endif /* !INET6 */
7766 if (optval) {
7767 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7768 } else {
7769 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7770 }
7771 }
7772
7773
7774 return ret;
7775 }
7776
7777 __private_extern__ int
7778 so_get_recv_anyif(struct socket *so)
7779 {
7780 int ret = 0;
7781
7782 #if INET6
7783 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7784 #else
7785 if (SOCK_DOM(so) == PF_INET) {
7786 #endif /* !INET6 */
7787 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7788 }
7789
7790 return ret;
7791 }
7792
7793 int
7794 so_set_restrictions(struct socket *so, uint32_t vals)
7795 {
7796 int nocell_old, nocell_new;
7797 int noexpensive_old, noexpensive_new;
7798 int noconstrained_old, noconstrained_new;
7799
7800 /*
7801 * Deny-type restrictions are trapdoors; once set they cannot be
7802 * unset for the lifetime of the socket. This allows them to be
7803 * issued by a framework on behalf of the application without
7804 * having to worry that they can be undone.
7805 *
7806 * Note here that socket-level restrictions override any protocol-
7807 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7808 * restriction issued on the socket has a higher precedence
7809 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7810 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7811 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7812 */
7813 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7814 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7815 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7816 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7817 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7818 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7819 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7820 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7821 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7822
7823 /* we can only set, not clear restrictions */
7824 if ((nocell_new - nocell_old) == 0 &&
7825 (noexpensive_new - noexpensive_old) == 0 &&
7826 (noconstrained_new - noconstrained_old) == 0) {
7827 return 0;
7828 }
7829 #if INET6
7830 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7831 #else
7832 if (SOCK_DOM(so) == PF_INET) {
7833 #endif /* !INET6 */
7834 if (nocell_new - nocell_old != 0) {
7835 /*
7836 * if deny cellular is now set, do what's needed
7837 * for INPCB
7838 */
7839 inp_set_nocellular(sotoinpcb(so));
7840 }
7841 if (noexpensive_new - noexpensive_old != 0) {
7842 inp_set_noexpensive(sotoinpcb(so));
7843 }
7844 if (noconstrained_new - noconstrained_old != 0) {
7845 inp_set_noconstrained(sotoinpcb(so));
7846 }
7847 }
7848
7849 if (SOCK_DOM(so) == PF_MULTIPATH) {
7850 mptcp_set_restrictions(so);
7851 }
7852
7853 return 0;
7854 }
7855
7856 uint32_t
7857 so_get_restrictions(struct socket *so)
7858 {
7859 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7860 SO_RESTRICT_DENY_OUT |
7861 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7862 }
7863
7864 int
7865 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7866 {
7867 struct proc *ep = PROC_NULL;
7868 int error = 0;
7869
7870 /* pid 0 is reserved for kernel */
7871 if (epid == 0) {
7872 error = EINVAL;
7873 goto done;
7874 }
7875
7876 /*
7877 * If this is an in-kernel socket, prevent its delegate
7878 * association from changing unless the socket option is
7879 * coming from within the kernel itself.
7880 */
7881 if (so->last_pid == 0 && p != kernproc) {
7882 error = EACCES;
7883 goto done;
7884 }
7885
7886 /*
7887 * If this is issued by a process that's recorded as the
7888 * real owner of the socket, or if the pid is the same as
7889 * the process's own pid, then proceed. Otherwise ensure
7890 * that the issuing process has the necessary privileges.
7891 */
7892 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7893 if ((error = priv_check_cred(kauth_cred_get(),
7894 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7895 error = EACCES;
7896 goto done;
7897 }
7898 }
7899
7900 /* Find the process that corresponds to the effective pid */
7901 if ((ep = proc_find(epid)) == PROC_NULL) {
7902 error = ESRCH;
7903 goto done;
7904 }
7905
7906 /*
7907 * If a process tries to delegate the socket to itself, then
7908 * there's really nothing to do; treat it as a way for the
7909 * delegate association to be cleared. Note that we check
7910 * the passed-in proc rather than calling proc_selfpid(),
7911 * as we need to check the process issuing the socket option
7912 * which could be kernproc. Given that we don't allow 0 for
7913 * effective pid, it means that a delegated in-kernel socket
7914 * stays delegated during its lifetime (which is probably OK.)
7915 */
7916 if (epid == proc_pid(p)) {
7917 so->so_flags &= ~SOF_DELEGATED;
7918 so->e_upid = 0;
7919 so->e_pid = 0;
7920 uuid_clear(so->e_uuid);
7921 } else {
7922 so->so_flags |= SOF_DELEGATED;
7923 so->e_upid = proc_uniqueid(ep);
7924 so->e_pid = proc_pid(ep);
7925 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7926
7927 #if defined(XNU_TARGET_OS_OSX)
7928 if (ep->p_responsible_pid != so->e_pid) {
7929 proc_t rp = proc_find(ep->p_responsible_pid);
7930 if (rp != PROC_NULL) {
7931 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7932 so->so_rpid = ep->p_responsible_pid;
7933 proc_rele(rp);
7934 } else {
7935 uuid_clear(so->so_ruuid);
7936 so->so_rpid = -1;
7937 }
7938 }
7939 #endif
7940 }
7941 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7942 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7943 }
7944 done:
7945 if (error == 0 && net_io_policy_log) {
7946 uuid_string_t buf;
7947
7948 uuid_unparse(so->e_uuid, buf);
7949 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7950 "euuid %s%s\n", __func__, proc_name_address(p),
7951 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7952 SOCK_DOM(so), SOCK_TYPE(so),
7953 so->e_pid, proc_name_address(ep), buf,
7954 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7955 } else if (error != 0 && net_io_policy_log) {
7956 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7957 "ERROR (%d)\n", __func__, proc_name_address(p),
7958 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7959 SOCK_DOM(so), SOCK_TYPE(so),
7960 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7961 proc_name_address(ep), error);
7962 }
7963
7964 /* Update this socket's policy upon success */
7965 if (error == 0) {
7966 so->so_policy_gencnt *= -1;
7967 so_update_policy(so);
7968 #if NECP
7969 so_update_necp_policy(so, NULL, NULL);
7970 #endif /* NECP */
7971 }
7972
7973 if (ep != PROC_NULL) {
7974 proc_rele(ep);
7975 }
7976
7977 return error;
7978 }
7979
7980 int
7981 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7982 {
7983 uuid_string_t buf;
7984 uuid_t uuid;
7985 int error = 0;
7986
7987 /* UUID must not be all-zeroes (reserved for kernel) */
7988 if (uuid_is_null(euuid)) {
7989 error = EINVAL;
7990 goto done;
7991 }
7992
7993 /*
7994 * If this is an in-kernel socket, prevent its delegate
7995 * association from changing unless the socket option is
7996 * coming from within the kernel itself.
7997 */
7998 if (so->last_pid == 0 && p != kernproc) {
7999 error = EACCES;
8000 goto done;
8001 }
8002
8003 /* Get the UUID of the issuing process */
8004 proc_getexecutableuuid(p, uuid, sizeof(uuid));
8005
8006 /*
8007 * If this is issued by a process that's recorded as the
8008 * real owner of the socket, or if the uuid is the same as
8009 * the process's own uuid, then proceed. Otherwise ensure
8010 * that the issuing process has the necessary privileges.
8011 */
8012 if (check_cred &&
8013 (uuid_compare(euuid, so->last_uuid) != 0 ||
8014 uuid_compare(euuid, uuid) != 0)) {
8015 if ((error = priv_check_cred(kauth_cred_get(),
8016 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8017 error = EACCES;
8018 goto done;
8019 }
8020 }
8021
8022 /*
8023 * If a process tries to delegate the socket to itself, then
8024 * there's really nothing to do; treat it as a way for the
8025 * delegate association to be cleared. Note that we check
8026 * the uuid of the passed-in proc rather than that of the
8027 * current process, as we need to check the process issuing
8028 * the socket option which could be kernproc itself. Given
8029 * that we don't allow 0 for effective uuid, it means that
8030 * a delegated in-kernel socket stays delegated during its
8031 * lifetime (which is okay.)
8032 */
8033 if (uuid_compare(euuid, uuid) == 0) {
8034 so->so_flags &= ~SOF_DELEGATED;
8035 so->e_upid = 0;
8036 so->e_pid = 0;
8037 uuid_clear(so->e_uuid);
8038 } else {
8039 so->so_flags |= SOF_DELEGATED;
8040 /*
8041 * Unlike so_set_effective_pid(), we only have the UUID
8042 * here and the process ID is not known. Inherit the
8043 * real {pid,upid} of the socket.
8044 */
8045 so->e_upid = so->last_upid;
8046 so->e_pid = so->last_pid;
8047 uuid_copy(so->e_uuid, euuid);
8048 }
8049 /*
8050 * The following will clear the effective process name as it's the same
8051 * as the real process
8052 */
8053 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8054 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8055 }
8056 done:
8057 if (error == 0 && net_io_policy_log) {
8058 uuid_unparse(so->e_uuid, buf);
8059 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8060 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8061 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8062 SOCK_TYPE(so), so->e_pid, buf,
8063 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8064 } else if (error != 0 && net_io_policy_log) {
8065 uuid_unparse(euuid, buf);
8066 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8067 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8068 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8069 SOCK_TYPE(so), buf, error);
8070 }
8071
8072 /* Update this socket's policy upon success */
8073 if (error == 0) {
8074 so->so_policy_gencnt *= -1;
8075 so_update_policy(so);
8076 #if NECP
8077 so_update_necp_policy(so, NULL, NULL);
8078 #endif /* NECP */
8079 }
8080
8081 return error;
8082 }
8083
8084 void
8085 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8086 uint32_t ev_datalen)
8087 {
8088 struct kev_msg ev_msg;
8089
8090 /*
8091 * A netpolicy event always starts with a netpolicy_event_data
8092 * structure, but the caller can provide for a longer event
8093 * structure to post, depending on the event code.
8094 */
8095 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8096
8097 bzero(&ev_msg, sizeof(ev_msg));
8098 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8099 ev_msg.kev_class = KEV_NETWORK_CLASS;
8100 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8101 ev_msg.event_code = ev_code;
8102
8103 ev_msg.dv[0].data_ptr = ev_data;
8104 ev_msg.dv[0].data_length = ev_datalen;
8105
8106 kev_post_msg(&ev_msg);
8107 }
8108
8109 void
8110 socket_post_kev_msg(uint32_t ev_code,
8111 struct kev_socket_event_data *ev_data,
8112 uint32_t ev_datalen)
8113 {
8114 struct kev_msg ev_msg;
8115
8116 bzero(&ev_msg, sizeof(ev_msg));
8117 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8118 ev_msg.kev_class = KEV_NETWORK_CLASS;
8119 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8120 ev_msg.event_code = ev_code;
8121
8122 ev_msg.dv[0].data_ptr = ev_data;
8123 ev_msg.dv[0].data_length = ev_datalen;
8124
8125 kev_post_msg(&ev_msg);
8126 }
8127
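/*
 * Post a KEV_SOCKET_CLOSED kernel event carrying the local and peer
 * addresses of the socket being closed, when both can be obtained.
 */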
8128 void
8129 socket_post_kev_msg_closed(struct socket *so)
8130 {
8131 struct kev_socket_closed ev;
8132 struct sockaddr *socksa = NULL, *peersa = NULL;
8133 int err;
8134 bzero(&ev, sizeof(ev));
8135 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8136 if (err == 0) {
8137 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8138 &peersa);
8139 if (err == 0) {
8140 memcpy(&ev.ev_data.kev_sockname, socksa,
8141 min(socksa->sa_len,
8142 sizeof(ev.ev_data.kev_sockname)));
8143 memcpy(&ev.ev_data.kev_peername, peersa,
8144 min(peersa->sa_len,
8145 sizeof(ev.ev_data.kev_peername)));
8146 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8147 &ev.ev_data, sizeof(ev));
8148 }
8149 }
8150 if (socksa != NULL) {
8151 FREE(socksa, M_SONAME);
8152 }
8153 if (peersa != NULL) {
8154 FREE(peersa, M_SONAME);
8155 }
8156 }