bsd/kern/uipc_socket.c (apple/xnu, xnu-4570.71.2)
1 /*
2 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet/ip6.h>
109 #include <netinet6/ip6_var.h>
110 #include <netinet/flow_divert.h>
111 #include <kern/zalloc.h>
112 #include <kern/locks.h>
113 #include <machine/limits.h>
114 #include <libkern/OSAtomic.h>
115 #include <pexpert/pexpert.h>
116 #include <kern/assert.h>
117 #include <kern/task.h>
118 #include <kern/policy_internal.h>
119
120 #include <sys/kpi_mbuf.h>
121 #include <sys/mcache.h>
122 #include <sys/unpcb.h>
123 #include <libkern/section_keywords.h>
124
125 #if CONFIG_MACF
126 #include <security/mac_framework.h>
127 #endif /* CONFIG_MACF */
128
129 #if MULTIPATH
130 #include <netinet/mp_pcb.h>
131 #include <netinet/mptcp_var.h>
132 #endif /* MULTIPATH */
133
134 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
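/*
 * ROUNDUP() rounds 'a' up to the next multiple of 'b'; 'b' must be a power
 * of two for the mask trick to work, e.g. ROUNDUP(5, 4) == 8 and
 * ROUNDUP(8, 4) == 8.
 */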
135
136 #if DEBUG || DEVELOPMENT
137 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
138 #else
139 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
140 #endif
141
142 /* TODO: this should be in a header file somewhere */
143 extern char *proc_name_address(void *p);
144 extern char *proc_best_name(proc_t);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
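/* 128 mbuf clusters; with the usual 2 KB MCLBYTES this works out to 256 KB. */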
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
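/*
 * Example (illustrative sketch, not part of the build): both of the
 * *_ignore_capab debug knobs above are plain sysctl integers, so a userland
 * test tool could flip them with sysctlbyname(3); the names follow directly
 * from the SYSCTL_INT() declarations, e.g.:
 *
 *	int on = 1;
 *	(void) sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
 *	    NULL, NULL, &on, sizeof (on));
 */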
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process.
346 * Set to zero to disable further setting of the option.
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof (socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
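/*
 * Carve a dedicated zone for cached PF_INET/SOCK_STREAM sockets: each
 * element holds the socket plus its saved inpcb/tcpcb storage, the zone is
 * capped at 120000 elements' worth of memory, and it grows in 8 KB
 * allocation chunks.
 */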
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof (struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477
478 lck_mtx_unlock(so_cache_mtx);
479
480 if (waitok)
481 *so = (struct socket *)zalloc(so_cache_zone);
482 else
483 *so = (struct socket *)zalloc_noblock(so_cache_zone);
484
485 if (*so == NULL)
486 return;
487
488 bzero((caddr_t)*so, sizeof (struct socket));
489
490 /*
491 * Define offsets for extra structures into our
492 * single block of memory. Align extra structures
493 * on longword boundaries.
494 */
495
496 offset = (uintptr_t)*so;
497 offset += sizeof (struct socket);
498
499 offset = ALIGN(offset);
500
501 (*so)->so_saved_pcb = (caddr_t)offset;
502 offset += get_inpcb_str_size();
503
504 offset = ALIGN(offset);
505
506 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
507 (caddr_t)offset;
508 }
509
510 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
511 }
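/*
 * Layout of a cached socket element (illustrative sketch; the exact sizes
 * come from get_inpcb_str_size()/get_tcp_str_size() and the ALIGN() steps
 * above):
 *
 *	+-------------------+  <- zalloc'd block, struct socket
 *	| struct socket     |
 *	+-------------------+  <- ALIGN()ed, so_saved_pcb
 *	| inpcb storage     |
 *	+-------------------+  <- ALIGN()ed, inp_saved_ppcb
 *	| tcpcb storage     |
 *	+-------------------+
 *
 * so_cache_zone_element_size in socketinit() adds 4 bytes of slack for each
 * of the two ALIGN() roundings.
 */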
512
513 static void
514 cached_sock_free(struct socket *so)
515 {
516
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count)
526 so_cache_hw = cached_sock_count;
527
528 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
529
530 so->cache_timestamp = so_cache_time;
531 lck_mtx_unlock(so_cache_mtx);
532 }
533 }
534
535 void
536 so_update_last_owner_locked(struct socket *so, proc_t self)
537 {
538 if (so->last_pid != 0) {
539 /*
540 * last_pid and last_upid should remain zero for sockets
541 * created using sock_socket. The check above achieves that
542 */
543 if (self == PROC_NULL)
544 self = current_proc();
545
546 if (so->last_upid != proc_uniqueid(self) ||
547 so->last_pid != proc_pid(self)) {
548 so->last_upid = proc_uniqueid(self);
549 so->last_pid = proc_pid(self);
550 proc_getexecutableuuid(self, so->last_uuid,
551 sizeof (so->last_uuid));
552 }
553 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
554 }
555 }
556
557 void
558 so_update_policy(struct socket *so)
559 {
560 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
561 (void) inp_update_policy(sotoinpcb(so));
562 }
563
564 #if NECP
565 static void
566 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
567 struct sockaddr *override_remote_addr)
568 {
569 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
570 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
571 override_remote_addr, 0);
572 }
573 #endif /* NECP */
574
575 boolean_t
576 so_cache_timer(void)
577 {
578 struct socket *p;
579 int n_freed = 0;
580 boolean_t rc = FALSE;
581
582 lck_mtx_lock(so_cache_mtx);
583 so_cache_timeouts++;
584 so_cache_time = net_uptime();
585
586 while (!STAILQ_EMPTY(&so_cache_head)) {
587 VERIFY(cached_sock_count > 0);
588 p = STAILQ_FIRST(&so_cache_head);
589 if ((so_cache_time - p->cache_timestamp) <
590 SO_CACHE_TIME_LIMIT)
591 break;
592
593 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
594 --cached_sock_count;
595
596 zfree(so_cache_zone, p);
597
598 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
599 so_cache_max_freed++;
600 break;
601 }
602 }
603
604 /* Schedule again if there is more to cleanup */
605 if (!STAILQ_EMPTY(&so_cache_head))
606 rc = TRUE;
607
608 lck_mtx_unlock(so_cache_mtx);
609 return (rc);
610 }
611
612 /*
613 * Get a socket structure from our zone, and initialize it.
614 * We don't implement `waitok' yet (see comments in uipc_domain.c).
615 * Note that it would probably be better to allocate socket
616 * and PCB at the same time, but I'm not convinced that all
617 * the protocols can be easily modified to do this.
618 */
619 struct socket *
620 soalloc(int waitok, int dom, int type)
621 {
622 struct socket *so;
623
624 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
625 cached_sock_alloc(&so, waitok);
626 } else {
627 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
628 M_WAITOK);
629 if (so != NULL)
630 bzero(so, sizeof (*so));
631 }
632 if (so != NULL) {
633 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
634 so->so_zone = socket_zone;
635
636 /*
637 * Increment the socket allocation statistics
638 */
639 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
640
641 #if CONFIG_MACF_SOCKET
642 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
643 if (mac_socket_label_init(so, !waitok) != 0) {
644 sodealloc(so);
645 return (NULL);
646 }
647 #endif /* MAC_SOCKET */
648 }
649
650 return (so);
651 }
652
653 int
654 socreate_internal(int dom, struct socket **aso, int type, int proto,
655 struct proc *p, uint32_t flags, struct proc *ep)
656 {
657 struct protosw *prp;
658 struct socket *so;
659 int error = 0;
660
661 #if TCPDEBUG
662 extern int tcpconsdebug;
663 #endif
664
665 VERIFY(aso != NULL);
666 *aso = NULL;
667
668 if (proto != 0)
669 prp = pffindproto(dom, proto, type);
670 else
671 prp = pffindtype(dom, type);
672
673 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
674 if (pffinddomain(dom) == NULL)
675 return (EAFNOSUPPORT);
676 if (proto != 0) {
677 if (pffindprotonotype(dom, proto) != NULL)
678 return (EPROTOTYPE);
679 }
680 return (EPROTONOSUPPORT);
681 }
682 if (prp->pr_type != type)
683 return (EPROTOTYPE);
684 so = soalloc(1, dom, type);
685 if (so == NULL)
686 return (ENOBUFS);
687
688 switch (dom) {
689 case PF_LOCAL:
690 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
691 break;
692 case PF_INET:
693 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
694 if (type == SOCK_STREAM) {
695 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
696 } else {
697 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
698 }
699 break;
700 case PF_ROUTE:
701 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
702 break;
703 case PF_NDRV:
704 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
705 break;
706 case PF_KEY:
707 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
708 break;
709 case PF_INET6:
710 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
711 if (type == SOCK_STREAM) {
712 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
713 } else {
714 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
715 }
716 break;
717 case PF_SYSTEM:
718 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
719 break;
720 case PF_MULTIPATH:
721 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
722 break;
723 default:
724 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
725 break;
726 }
727
728 if (flags & SOCF_ASYNC)
729 so->so_state |= SS_NBIO;
730
731 TAILQ_INIT(&so->so_incomp);
732 TAILQ_INIT(&so->so_comp);
733 so->so_type = type;
734 so->last_upid = proc_uniqueid(p);
735 so->last_pid = proc_pid(p);
736 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
737 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
738
739 if (ep != PROC_NULL && ep != p) {
740 so->e_upid = proc_uniqueid(ep);
741 so->e_pid = proc_pid(ep);
742 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
743 so->so_flags |= SOF_DELEGATED;
744 }
745
746 so->so_cred = kauth_cred_proc_ref(p);
747 if (!suser(kauth_cred_get(), NULL))
748 so->so_state |= SS_PRIV;
749
750 so->so_proto = prp;
751 so->so_rcv.sb_flags |= SB_RECV;
752 so->so_rcv.sb_so = so->so_snd.sb_so = so;
753 so->next_lock_lr = 0;
754 so->next_unlock_lr = 0;
755
756 #if CONFIG_MACF_SOCKET
757 mac_socket_label_associate(kauth_cred_get(), so);
758 #endif /* MAC_SOCKET */
759
760 /*
761 * Attachment will create the per-pcb lock if necessary and
762 * increase the refcount for creation; make sure this is done before
763 * the socket is inserted in the lists.
764 */
765 so->so_usecount++;
766
767 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
768 if (error != 0) {
769 /*
770 * Warning:
771 * If so_pcb is not zero, the socket will be leaked,
772 * so the protocol attachment handler must be coded carefully.
773 */
774 so->so_state |= SS_NOFDREF;
775 VERIFY(so->so_usecount > 0);
776 so->so_usecount--;
777 sofreelastref(so, 1); /* will deallocate the socket */
778 return (error);
779 }
780
781 atomic_add_32(&prp->pr_domain->dom_refs, 1);
782 TAILQ_INIT(&so->so_evlist);
783
784 /* Attach socket filters for this protocol */
785 sflt_initsock(so);
786 #if TCPDEBUG
787 if (tcpconsdebug == 2)
788 so->so_options |= SO_DEBUG;
789 #endif
790 so_set_default_traffic_class(so);
791
792 /*
793 * If this thread or task is marked to create backgrounded sockets,
794 * mark the socket as background.
795 */
796 if (proc_get_effective_thread_policy(current_thread(),
797 TASK_POLICY_NEW_SOCKETS_BG)) {
798 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
799 so->so_background_thread = current_thread();
800 }
801
802 switch (dom) {
803 /*
804 * Don't mark Unix domain, system or multipath sockets as
805 * eligible for defunct by default.
806 */
807 case PF_LOCAL:
808 case PF_SYSTEM:
809 case PF_MULTIPATH:
810 so->so_flags |= SOF_NODEFUNCT;
811 break;
812 default:
813 break;
814 }
815
816 /*
817 * Entitlements can't be checked at socket creation time except if the
818 * application requested a feature guarded by a privilege (cf. socket
819 * delegation).
820 * The priv(9) and the Sandboxing APIs are designed with the idea that
821 * a privilege check should only be triggered by a userland request.
822 * A privilege check at socket creation time is time-consuming and
823 * could trigger many authorisation error messages from the security
824 * APIs.
825 */
826
827 *aso = so;
828
829 return (0);
830 }
831
832 /*
833 * Returns: 0 Success
834 * EAFNOSUPPORT
835 * EPROTOTYPE
836 * EPROTONOSUPPORT
837 * ENOBUFS
838 * <pru_attach>:ENOBUFS[AF_UNIX]
839 * <pru_attach>:ENOBUFS[TCP]
840 * <pru_attach>:ENOMEM[TCP]
841 * <pru_attach>:??? [other protocol families, IPSEC]
842 */
843 int
844 socreate(int dom, struct socket **aso, int type, int proto)
845 {
846 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
847 PROC_NULL));
848 }
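/*
 * Illustrative in-kernel usage (sketch only; kernel extensions normally go
 * through the sock_socket()/sock_close() KPI rather than calling these
 * directly):
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_DGRAM, IPPROTO_UDP);
 *	if (error == 0) {
 *		... sobindlock()/soconnectlock()/sosend() on 'so' ...
 *		soclose(so);
 *	}
 */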
849
850 int
851 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
852 {
853 int error = 0;
854 struct proc *ep = PROC_NULL;
855
856 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
857 error = ESRCH;
858 goto done;
859 }
860
861 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
862
863 /*
864 * It might not be wise to hold the proc reference when calling
865 * socreate_internal since it calls soalloc with M_WAITOK
866 */
867 done:
868 if (ep != PROC_NULL)
869 proc_rele(ep);
870
871 return (error);
872 }
873
874 /*
875 * Returns: 0 Success
876 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
877 * <pru_bind>:EAFNOSUPPORT Address family not supported
878 * <pru_bind>:EADDRNOTAVAIL Address not available.
879 * <pru_bind>:EINVAL Invalid argument
880 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
881 * <pru_bind>:EACCES Permission denied
882 * <pru_bind>:EADDRINUSE Address in use
883 * <pru_bind>:EAGAIN Resource unavailable, try again
884 * <pru_bind>:EPERM Operation not permitted
885 * <pru_bind>:???
886 * <sf_bind>:???
887 *
888 * Notes: It's not possible to fully enumerate the return codes above,
889 * since socket filter authors and protocol family authors may
890 * not choose to limit their error returns to those listed, even
891 * though this may result in some software operating incorrectly.
892 *
893 * The error codes which are enumerated above are those known to
894 * be returned by the tcp_usr_bind function supplied.
895 */
896 int
897 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
898 {
899 struct proc *p = current_proc();
900 int error = 0;
901
902 if (dolock)
903 socket_lock(so, 1);
904
905 so_update_last_owner_locked(so, p);
906 so_update_policy(so);
907
908 #if NECP
909 so_update_necp_policy(so, nam, NULL);
910 #endif /* NECP */
911
912 /*
913 * If this is a bind request on a socket that has been marked
914 * as inactive, reject it now before we go any further.
915 */
916 if (so->so_flags & SOF_DEFUNCT) {
917 error = EINVAL;
918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
919 __func__, proc_pid(p), proc_best_name(p),
920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
921 SOCK_DOM(so), SOCK_TYPE(so), error);
922 goto out;
923 }
924
925 /* Socket filter */
926 error = sflt_bind(so, nam);
927
928 if (error == 0)
929 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
930 out:
931 if (dolock)
932 socket_unlock(so, 1);
933
934 if (error == EJUSTRETURN)
935 error = 0;
936
937 return (error);
938 }
939
940 void
941 sodealloc(struct socket *so)
942 {
943 kauth_cred_unref(&so->so_cred);
944
945 /* Remove any filters */
946 sflt_termsock(so);
947
948 #if CONTENT_FILTER
949 cfil_sock_detach(so);
950 #endif /* CONTENT_FILTER */
951
952 /* Delete the state allocated for msg queues on a socket */
953 if (so->so_flags & SOF_ENABLE_MSGS) {
954 FREE(so->so_msg_state, M_TEMP);
955 so->so_msg_state = NULL;
956 }
957 VERIFY(so->so_msg_state == NULL);
958
959 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
960
961 #if CONFIG_MACF_SOCKET
962 mac_socket_label_destroy(so);
963 #endif /* MAC_SOCKET */
964
965 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
966 cached_sock_free(so);
967 } else {
968 FREE_ZONE(so, sizeof (*so), so->so_zone);
969 }
970 }
971
972 /*
973 * Returns: 0 Success
974 * EINVAL
975 * EOPNOTSUPP
976 * <pru_listen>:EINVAL[AF_UNIX]
977 * <pru_listen>:EINVAL[TCP]
978 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
979 * <pru_listen>:EINVAL[TCP] Invalid argument
980 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
981 * <pru_listen>:EACCES[TCP] Permission denied
982 * <pru_listen>:EADDRINUSE[TCP] Address in use
983 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
984 * <pru_listen>:EPERM[TCP] Operation not permitted
985 * <sf_listen>:???
986 *
987 * Notes: Other <pru_listen> returns depend on the protocol family; all
988 * <sf_listen> returns depend on what the filter author causes
989 * their filter to return.
990 */
991 int
992 solisten(struct socket *so, int backlog)
993 {
994 struct proc *p = current_proc();
995 int error = 0;
996
997 socket_lock(so, 1);
998
999 so_update_last_owner_locked(so, p);
1000 so_update_policy(so);
1001
1002 #if NECP
1003 so_update_necp_policy(so, NULL, NULL);
1004 #endif /* NECP */
1005
1006 if (so->so_proto == NULL) {
1007 error = EINVAL;
1008 goto out;
1009 }
1010 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1011 error = EOPNOTSUPP;
1012 goto out;
1013 }
1014
1015 /*
1016 * If the listen request is made on a socket that is not fully
1017 * disconnected, or on a socket that has been marked as inactive,
1018 * reject the request now.
1019 */
1020 if ((so->so_state &
1021 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1022 (so->so_flags & SOF_DEFUNCT)) {
1023 error = EINVAL;
1024 if (so->so_flags & SOF_DEFUNCT) {
1025 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1026 "(%d)\n", __func__, proc_pid(p),
1027 proc_best_name(p),
1028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1029 SOCK_DOM(so), SOCK_TYPE(so), error);
1030 }
1031 goto out;
1032 }
1033
1034 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1035 error = EPERM;
1036 goto out;
1037 }
1038
1039 error = sflt_listen(so);
1040 if (error == 0)
1041 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1042
1043 if (error) {
1044 if (error == EJUSTRETURN)
1045 error = 0;
1046 goto out;
1047 }
1048
1049 if (TAILQ_EMPTY(&so->so_comp))
1050 so->so_options |= SO_ACCEPTCONN;
1051 /*
1052 * POSIX: The implementation may have an upper limit on the length of
1053 * the listen queue, either global or per accepting socket. If backlog
1054 * exceeds this limit, the length of the listen queue is set to the
1055 * limit.
1056 *
1057 * If listen() is called with a backlog argument value that is less
1058 * than 0, the function behaves as if it had been called with a backlog
1059 * argument value of 0.
1060 *
1061 * A backlog argument of 0 may allow the socket to accept connections,
1062 * in which case the length of the listen queue may be set to an
1063 * implementation-defined minimum value.
1064 */
1065 if (backlog <= 0 || backlog > somaxconn)
1066 backlog = somaxconn;
1067
1068 so->so_qlimit = backlog;
1069 out:
1070 socket_unlock(so, 1);
1071 return (error);
1072 }
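/*
 * For example, with the default somaxconn of SOMAXCONN (128), solisten(so, 0),
 * solisten(so, -1) and solisten(so, 1000) all leave so_qlimit at 128, while
 * solisten(so, 5) keeps the requested backlog of 5.
 */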
1073
1074 /*
1075 * The "accept list lock" protects the fields related to the listener queues
1076 * because we can unlock a socket to respect the lock ordering between
1077 * the listener socket and its client sockets. The lock ordering requires
1078 * acquiring a client socket before the listener socket.
1079 *
1080 * The accept list lock serializes access to the following fields:
1081 * - of the listener socket:
1082 * - so_comp
1083 * - so_incomp
1084 * - so_qlen
1085 * - so_inqlen
1086 * - of client sockets that are in so_comp or so_incomp:
1087 * - so_head
1088 * - so_list
1089 *
1090 * As one can see, the accept list lock protects the consistency of the
1091 * linkage of the client sockets.
1092 *
1093 * Note that those fields may be read without holding the accept list lock
1094 * for a preflight, provided the accept list lock is taken when committing
1095 * to take an action based on the result of the preflight. The preflight
1096 * saves the cost of doing the unlock/lock dance.
1097 */
1098 void
1099 so_acquire_accept_list(struct socket *head, struct socket *so)
1100 {
1101 lck_mtx_t *mutex_held;
1102
1103 if (head->so_proto->pr_getlock == NULL) {
1104 return;
1105 }
1106 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1107 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1108
1109 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1110 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1111 return;
1112 }
1113 if (so != NULL) {
1114 socket_unlock(so, 0);
1115 }
1116 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1117 so_accept_list_waits += 1;
1118 msleep((caddr_t)&head->so_incomp, mutex_held,
1119 PSOCK | PCATCH, __func__, NULL);
1120 }
1121 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1122 if (so != NULL) {
1123 socket_unlock(head, 0);
1124 socket_lock(so, 0);
1125 socket_lock(head, 0);
1126 }
1127 }
1128
1129 void
1130 so_release_accept_list(struct socket *head)
1131 {
1132 if (head->so_proto->pr_getlock != NULL) {
1133 lck_mtx_t *mutex_held;
1134
1135 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1136 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1137
1138 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1139 wakeup((caddr_t)&head->so_incomp);
1140 }
1141 }
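/*
 * Illustrative usage of the accept list lock, mirroring sofreelastref() and
 * soclose_locked() below; note that so_acquire_accept_list() may itself drop
 * and re-acquire the socket locks while waiting for the list to be released:
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);
 *	... walk or modify head->so_incomp / head->so_comp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */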
1142
1143 void
1144 sofreelastref(struct socket *so, int dealloc)
1145 {
1146 struct socket *head = so->so_head;
1147
1148 /* Assume socket is locked */
1149
1150 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1151 selthreadclear(&so->so_snd.sb_sel);
1152 selthreadclear(&so->so_rcv.sb_sel);
1153 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1154 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1155 so->so_event = sonullevent;
1156 return;
1157 }
1158 if (head != NULL) {
1159 /*
1160 * Need to lock the listener when the protocol has
1161 * per socket locks
1162 */
1163 if (head->so_proto->pr_getlock != NULL) {
1164 socket_lock(head, 1);
1165 so_acquire_accept_list(head, so);
1166 }
1167 if (so->so_state & SS_INCOMP) {
1168 so->so_state &= ~SS_INCOMP;
1169 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1170 head->so_incqlen--;
1171 head->so_qlen--;
1172 so->so_head = NULL;
1173
1174 if (head->so_proto->pr_getlock != NULL) {
1175 so_release_accept_list(head);
1176 socket_unlock(head, 1);
1177 }
1178 } else if (so->so_state & SS_COMP) {
1179 if (head->so_proto->pr_getlock != NULL) {
1180 so_release_accept_list(head);
1181 socket_unlock(head, 1);
1182 }
1183 /*
1184 * We must not decommission a socket that's
1185 * on the accept(2) queue. If we do, then
1186 * accept(2) may hang after select(2) indicated
1187 * that the listening socket was ready.
1188 */
1189 selthreadclear(&so->so_snd.sb_sel);
1190 selthreadclear(&so->so_rcv.sb_sel);
1191 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1192 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1193 so->so_event = sonullevent;
1194 return;
1195 } else {
1196 if (head->so_proto->pr_getlock != NULL) {
1197 so_release_accept_list(head);
1198 socket_unlock(head, 1);
1199 }
1200 printf("sofree: not queued\n");
1201 }
1202 }
1203 sowflush(so);
1204 sorflush(so);
1205
1206 #if FLOW_DIVERT
1207 if (so->so_flags & SOF_FLOW_DIVERT) {
1208 flow_divert_detach(so);
1209 }
1210 #endif /* FLOW_DIVERT */
1211
1212 /* 3932268: disable upcall */
1213 so->so_rcv.sb_flags &= ~SB_UPCALL;
1214 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1215 so->so_event = sonullevent;
1216
1217 if (dealloc)
1218 sodealloc(so);
1219 }
1220
1221 void
1222 soclose_wait_locked(struct socket *so)
1223 {
1224 lck_mtx_t *mutex_held;
1225
1226 if (so->so_proto->pr_getlock != NULL)
1227 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1228 else
1229 mutex_held = so->so_proto->pr_domain->dom_mtx;
1230 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1231
1232 /*
1233 * Double check here and return if there's no outstanding upcall;
1234 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1235 */
1236 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1237 return;
1238 so->so_rcv.sb_flags &= ~SB_UPCALL;
1239 so->so_snd.sb_flags &= ~SB_UPCALL;
1240 so->so_flags |= SOF_CLOSEWAIT;
1241
1242 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1243 "soclose_wait_locked", NULL);
1244 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1245 so->so_flags &= ~SOF_CLOSEWAIT;
1246 }
1247
1248 /*
1249 * Close a socket on last file table reference removal.
1250 * Initiate disconnect if connected.
1251 * Free socket when disconnect complete.
1252 */
1253 int
1254 soclose_locked(struct socket *so)
1255 {
1256 int error = 0;
1257 struct timespec ts;
1258
1259 if (so->so_usecount == 0) {
1260 panic("soclose: so=%p refcount=0\n", so);
1261 /* NOTREACHED */
1262 }
1263
1264 sflt_notify(so, sock_evt_closing, NULL);
1265
1266 if (so->so_upcallusecount)
1267 soclose_wait_locked(so);
1268
1269 #if CONTENT_FILTER
1270 /*
1271 * We have to wait until the content filters are done
1272 */
1273 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1274 cfil_sock_close_wait(so);
1275 cfil_sock_is_closed(so);
1276 cfil_sock_detach(so);
1277 }
1278 #endif /* CONTENT_FILTER */
1279
1280 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1281 soresume(current_proc(), so, 1);
1282 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1283 }
1284
1285 if ((so->so_options & SO_ACCEPTCONN)) {
1286 struct socket *sp, *sonext;
1287 int persocklock = 0;
1288 int incomp_overflow_only;
1289
1290 /*
1291 * We do not want new connections to be added
1292 * to the connection queues.
1293 */
1294 so->so_options &= ~SO_ACCEPTCONN;
1295
1296 /*
1297 * We can drop the lock on the listener once
1298 * we've acquired the incoming list
1299 */
1300 if (so->so_proto->pr_getlock != NULL) {
1301 persocklock = 1;
1302 so_acquire_accept_list(so, NULL);
1303 socket_unlock(so, 0);
1304 }
1305 again:
1306 incomp_overflow_only = 1;
1307
1308 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1309 /*
1310 * Radar 5350314
1311 * Skip sockets thrown away by tcpdropdropblreq;
1312 * they will get cleaned up by the garbage collection.
1313 * Otherwise, remove the incomplete socket from the queue
1314 * and let soabort() trigger the appropriate cleanup.
1315 */
1316 if (sp->so_flags & SOF_OVERFLOW)
1317 continue;
1318
1319 if (persocklock != 0)
1320 socket_lock(sp, 1);
1321
1322 /*
1323 * Radar 27945981
1324 * The extra reference held for the list ensures the
1325 * validity of the socket pointer when we perform the
1326 * unlock of the head above.
1327 */
1328 if (sp->so_state & SS_INCOMP) {
1329 sp->so_state &= ~SS_INCOMP;
1330 sp->so_head = NULL;
1331 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1332 so->so_incqlen--;
1333 so->so_qlen--;
1334
1335 (void) soabort(sp);
1336 } else {
1337 panic("%s sp %p in so_incomp but !SS_INCOMP",
1338 __func__, sp);
1339 }
1340
1341 if (persocklock != 0)
1342 socket_unlock(sp, 1);
1343 }
1344
1345 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1346 /* Dequeue from so_comp since sofree() won't do it */
1347 if (persocklock != 0)
1348 socket_lock(sp, 1);
1349
1350 if (sp->so_state & SS_COMP) {
1351 sp->so_state &= ~SS_COMP;
1352 sp->so_head = NULL;
1353 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1354 so->so_qlen--;
1355
1356 (void) soabort(sp);
1357 } else {
1358 panic("%s sp %p in so_comp but !SS_COMP",
1359 __func__, sp);
1360 }
1361
1362 if (persocklock)
1363 socket_unlock(sp, 1);
1364 }
1365
1366 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1367 #if (DEBUG|DEVELOPMENT)
1368 panic("%s head %p so_incomp not empty\n", __func__, so);
1369 #endif /* (DEVELOPMENT || DEBUG) */
1370
1371 goto again;
1372 }
1373
1374 if (!TAILQ_EMPTY(&so->so_comp)) {
1375 #if (DEBUG|DEVELOPMENT)
1376 panic("%s head %p so_comp not empty\n", __func__, so);
1377 #endif /* (DEVELOPMENT || DEBUG) */
1378
1379 goto again;
1380 }
1381
1382 if (persocklock) {
1383 socket_lock(so, 0);
1384 so_release_accept_list(so);
1385 }
1386 }
1387 if (so->so_pcb == NULL) {
1388 /* 3915887: mark the socket as ready for dealloc */
1389 so->so_flags |= SOF_PCBCLEARING;
1390 goto discard;
1391 }
1392 if (so->so_state & SS_ISCONNECTED) {
1393 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1394 error = sodisconnectlocked(so);
1395 if (error)
1396 goto drop;
1397 }
1398 if (so->so_options & SO_LINGER) {
1399 lck_mtx_t *mutex_held;
1400
1401 if ((so->so_state & SS_ISDISCONNECTING) &&
1402 (so->so_state & SS_NBIO))
1403 goto drop;
1404 if (so->so_proto->pr_getlock != NULL)
1405 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1406 else
1407 mutex_held = so->so_proto->pr_domain->dom_mtx;
1408 while (so->so_state & SS_ISCONNECTED) {
1409 ts.tv_sec = (so->so_linger/100);
1410 ts.tv_nsec = (so->so_linger % 100) *
1411 NSEC_PER_USEC * 1000 * 10;
1412 error = msleep((caddr_t)&so->so_timeo,
1413 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1414 if (error) {
1415 /*
1416 * It's OK when the timer fires;
1417 * don't report an error.
1418 */
1419 if (error == EWOULDBLOCK)
1420 error = 0;
1421 break;
1422 }
1423 }
1424 }
1425 }
1426 drop:
1427 if (so->so_usecount == 0) {
1428 panic("soclose: usecount is zero so=%p\n", so);
1429 /* NOTREACHED */
1430 }
1431 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1432 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1433 if (error == 0)
1434 error = error2;
1435 }
1436 if (so->so_usecount <= 0) {
1437 panic("soclose: usecount is zero so=%p\n", so);
1438 /* NOTREACHED */
1439 }
1440 discard:
1441 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1442 (so->so_state & SS_NOFDREF)) {
1443 panic("soclose: NOFDREF");
1444 /* NOTREACHED */
1445 }
1446 so->so_state |= SS_NOFDREF;
1447
1448 if ((so->so_flags & SOF_KNOTE) != 0)
1449 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1450
1451 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1452 evsofree(so);
1453
1454 VERIFY(so->so_usecount > 0);
1455 so->so_usecount--;
1456 sofree(so);
1457 return (error);
1458 }
1459
1460 int
1461 soclose(struct socket *so)
1462 {
1463 int error = 0;
1464 socket_lock(so, 1);
1465
1466 if (so->so_retaincnt == 0) {
1467 error = soclose_locked(so);
1468 } else {
1469 /*
1470 * If the FD is going away but the socket is
1471 * retained in the kernel, remove its reference.
1472 */
1473 so->so_usecount--;
1474 if (so->so_usecount < 2)
1475 panic("soclose: retaincnt non null and so=%p "
1476 "usecount=%d\n", so, so->so_usecount);
1477 }
1478 socket_unlock(so, 1);
1479 return (error);
1480 }
1481
1482 /*
1483 * Must be called at splnet...
1484 */
1485 /* Should already be locked */
1486 int
1487 soabort(struct socket *so)
1488 {
1489 int error;
1490
1491 #ifdef MORE_LOCKING_DEBUG
1492 lck_mtx_t *mutex_held;
1493
1494 if (so->so_proto->pr_getlock != NULL)
1495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1496 else
1497 mutex_held = so->so_proto->pr_domain->dom_mtx;
1498 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1499 #endif
1500
1501 if ((so->so_flags & SOF_ABORTED) == 0) {
1502 so->so_flags |= SOF_ABORTED;
1503 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1504 if (error) {
1505 sofree(so);
1506 return (error);
1507 }
1508 }
1509 return (0);
1510 }
1511
1512 int
1513 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1514 {
1515 int error;
1516
1517 if (dolock)
1518 socket_lock(so, 1);
1519
1520 so_update_last_owner_locked(so, PROC_NULL);
1521 so_update_policy(so);
1522 #if NECP
1523 so_update_necp_policy(so, NULL, NULL);
1524 #endif /* NECP */
1525
1526 if ((so->so_state & SS_NOFDREF) == 0)
1527 panic("soaccept: !NOFDREF");
1528 so->so_state &= ~SS_NOFDREF;
1529 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1530
1531 if (dolock)
1532 socket_unlock(so, 1);
1533 return (error);
1534 }
1535
1536 int
1537 soaccept(struct socket *so, struct sockaddr **nam)
1538 {
1539 return (soacceptlock(so, nam, 1));
1540 }
1541
1542 int
1543 soacceptfilter(struct socket *so, struct socket *head)
1544 {
1545 struct sockaddr *local = NULL, *remote = NULL;
1546 int error = 0;
1547
1548 /*
1549 * Hold the lock even if this socket has not been made visible
1550 * to the filter(s). For sockets with global locks, this protects
1551 * against the head or peer going away
1552 */
1553 socket_lock(so, 1);
1554 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1555 sogetaddr_locked(so, &local, 0) != 0) {
1556 so->so_state &= ~SS_NOFDREF;
1557 socket_unlock(so, 1);
1558 soclose(so);
1559 /* Out of resources; try it again next time */
1560 error = ECONNABORTED;
1561 goto done;
1562 }
1563
1564 error = sflt_accept(head, so, local, remote);
1565
1566 /*
1567 * If we get EJUSTRETURN from one of the filters, mark this socket
1568 * as inactive and return it anyway. This newly accepted socket
1569 * will be disconnected later before we hand it off to the caller.
1570 */
1571 if (error == EJUSTRETURN) {
1572 error = 0;
1573 (void) sosetdefunct(current_proc(), so,
1574 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1575 }
1576
1577 if (error != 0) {
1578 /*
1579 * This may seem like a duplication to the above error
1580 * handling part when we return ECONNABORTED, except
1581 * the following is done while holding the lock since
1582 * the socket has been exposed to the filter(s) earlier.
1583 */
1584 so->so_state &= ~SS_NOFDREF;
1585 socket_unlock(so, 1);
1586 soclose(so);
1587 /* Propagate socket filter's error code to the caller */
1588 } else {
1589 socket_unlock(so, 1);
1590 }
1591 done:
1592 /* Callee checks for NULL pointer */
1593 sock_freeaddr(remote);
1594 sock_freeaddr(local);
1595 return (error);
1596 }
1597
1598 /*
1599 * Returns: 0 Success
1600 * EOPNOTSUPP Operation not supported on socket
1601 * EISCONN Socket is connected
1602 * <pru_connect>:EADDRNOTAVAIL Address not available.
1603 * <pru_connect>:EINVAL Invalid argument
1604 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1605 * <pru_connect>:EACCES Permission denied
1606 * <pru_connect>:EADDRINUSE Address in use
1607 * <pru_connect>:EAGAIN Resource unavailable, try again
1608 * <pru_connect>:EPERM Operation not permitted
1609 * <sf_connect_out>:??? [anything a filter writer might set]
1610 */
1611 int
1612 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1613 {
1614 int error;
1615 struct proc *p = current_proc();
1616
1617 if (dolock)
1618 socket_lock(so, 1);
1619
1620 so_update_last_owner_locked(so, p);
1621 so_update_policy(so);
1622
1623 #if NECP
1624 so_update_necp_policy(so, NULL, nam);
1625 #endif /* NECP */
1626
1627 /*
1628 * If this is a listening socket or if this is a previously-accepted
1629 * socket that has been marked as inactive, reject the connect request.
1630 */
1631 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1632 error = EOPNOTSUPP;
1633 if (so->so_flags & SOF_DEFUNCT) {
1634 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1635 "(%d)\n", __func__, proc_pid(p),
1636 proc_best_name(p),
1637 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1638 SOCK_DOM(so), SOCK_TYPE(so), error);
1639 }
1640 if (dolock)
1641 socket_unlock(so, 1);
1642 return (error);
1643 }
1644
1645 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1646 if (dolock)
1647 socket_unlock(so, 1);
1648 return (EPERM);
1649 }
1650
1651 /*
1652 * If protocol is connection-based, can only connect once.
1653 * Otherwise, if connected, try to disconnect first.
1654 * This allows user to disconnect by connecting to, e.g.,
1655 * a null address.
1656 */
1657 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1658 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1659 (error = sodisconnectlocked(so)))) {
1660 error = EISCONN;
1661 } else {
1662 /*
1663 * Run connect filter before calling protocol:
1664 * - non-blocking connect returns before completion;
1665 */
1666 error = sflt_connectout(so, nam);
1667 if (error != 0) {
1668 if (error == EJUSTRETURN)
1669 error = 0;
1670 } else {
1671 error = (*so->so_proto->pr_usrreqs->pru_connect)
1672 (so, nam, p);
1673 }
1674 }
1675 if (dolock)
1676 socket_unlock(so, 1);
1677 return (error);
1678 }
1679
1680 int
1681 soconnect(struct socket *so, struct sockaddr *nam)
1682 {
1683 return (soconnectlock(so, nam, 1));
1684 }
1685
1686 /*
1687 * Returns: 0 Success
1688 * <pru_connect2>:EINVAL[AF_UNIX]
1689 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1690 * <pru_connect2>:??? [other protocol families]
1691 *
1692 * Notes: <pru_connect2> is not supported by [TCP].
1693 */
1694 int
1695 soconnect2(struct socket *so1, struct socket *so2)
1696 {
1697 int error;
1698
1699 socket_lock(so1, 1);
1700 if (so2->so_proto->pr_lock)
1701 socket_lock(so2, 1);
1702
1703 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1704
1705 socket_unlock(so1, 1);
1706 if (so2->so_proto->pr_lock)
1707 socket_unlock(so2, 1);
1708 return (error);
1709 }
1710
1711 int
1712 soconnectxlocked(struct socket *so, struct sockaddr *src,
1713 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1714 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1715 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1716 {
1717 int error;
1718
1719 so_update_last_owner_locked(so, p);
1720 so_update_policy(so);
1721
1722 /*
1723 * If this is a listening socket or if this is a previously-accepted
1724 * socket that has been marked as inactive, reject the connect request.
1725 */
1726 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1727 error = EOPNOTSUPP;
1728 if (so->so_flags & SOF_DEFUNCT) {
1729 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1730 "(%d)\n", __func__, proc_pid(p),
1731 proc_best_name(p),
1732 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1733 SOCK_DOM(so), SOCK_TYPE(so), error);
1734 }
1735 return (error);
1736 }
1737
1738 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1739 return (EPERM);
1740
1741 /*
1742 * If protocol is connection-based, can only connect once
1743 * unless PR_MULTICONN is set. Otherwise, if connected,
1744 * try to disconnect first. This allows user to disconnect
1745 * by connecting to, e.g., a null address.
1746 */
1747 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1748 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1749 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1750 (error = sodisconnectlocked(so)) != 0)) {
1751 error = EISCONN;
1752 } else {
1753 /*
1754 * Run connect filter before calling protocol:
1755 * - non-blocking connect returns before completion;
1756 */
1757 error = sflt_connectout(so, dst);
1758 if (error != 0) {
1759 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1760 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1761 if (error == EJUSTRETURN)
1762 error = 0;
1763 } else {
1764 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1765 (so, src, dst, p, ifscope, aid, pcid,
1766 flags, arg, arglen, auio, bytes_written);
1767 }
1768 }
1769
1770 return (error);
1771 }
1772
1773 int
1774 sodisconnectlocked(struct socket *so)
1775 {
1776 int error;
1777
1778 if ((so->so_state & SS_ISCONNECTED) == 0) {
1779 error = ENOTCONN;
1780 goto bad;
1781 }
1782 if (so->so_state & SS_ISDISCONNECTING) {
1783 error = EALREADY;
1784 goto bad;
1785 }
1786
1787 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1788 if (error == 0)
1789 sflt_notify(so, sock_evt_disconnected, NULL);
1790
1791 bad:
1792 return (error);
1793 }
1794
1795 /* Locking version */
1796 int
1797 sodisconnect(struct socket *so)
1798 {
1799 int error;
1800
1801 socket_lock(so, 1);
1802 error = sodisconnectlocked(so);
1803 socket_unlock(so, 1);
1804 return (error);
1805 }
1806
1807 int
1808 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1809 {
1810 int error;
1811
1812 /*
1813 * Call the protocol disconnectx handler; let it handle all
1814 * matters related to the connection state of this session.
1815 */
1816 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1817 if (error == 0) {
1818 /*
1819 * The event applies only for the session, not for
1820 * the disconnection of individual subflows.
1821 */
1822 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1823 sflt_notify(so, sock_evt_disconnected, NULL);
1824 }
1825 return (error);
1826 }
1827
1828 int
1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 int error;
1832
1833 socket_lock(so, 1);
1834 error = sodisconnectxlocked(so, aid, cid);
1835 socket_unlock(so, 1);
1836 return (error);
1837 }
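
/*
 * Illustrative sketch, not part of the original file: the user-space
 * disconnectx(2) call that reaches sodisconnectxlocked() above, assuming
 * the prototype declared in the Darwin <sys/socket.h>.  The helper name
 * is hypothetical; "s" would come from a prior connectx() as sketched
 * earlier.
 */
#if 0	/* example only */
#include <sys/socket.h>

static int
example_disconnectx(int s)
{
	/* Tear down every connection on the socket. */
	return (disconnectx(s, SAE_ASSOCID_ANY, SAE_CONNID_ANY));
}
#endif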
1838
1839 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840
1841 /*
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
1844 *
1845 * Returns: 0 Success
1846 * EPIPE
1847 * sblock:EWOULDBLOCK
1848 * sblock:EINTR
1849 * sbwait:EBADF
1850 * sbwait:EINTR
1851 * [so_error]:???
1852 */
1853 int
1854 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1855 int32_t clen, int32_t atomic, int flags, int *sblocked,
1856 struct mbuf *control)
1857 {
1858 int error = 0;
1859 int32_t space;
1860 int assumelock = 0;
1861
1862 restart:
1863 if (*sblocked == 0) {
1864 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1865 so->so_send_filt_thread != 0 &&
1866 so->so_send_filt_thread == current_thread()) {
1867 /*
1868 * We're being called recursively from a filter,
1869 * allow this to continue. Radar 4150520.
1870 * Don't set sblocked because we don't want
1871 * to perform an unlock later.
1872 */
1873 assumelock = 1;
1874 } else {
1875 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1876 if (error) {
1877 if (so->so_flags & SOF_DEFUNCT)
1878 goto defunct;
1879 return (error);
1880 }
1881 *sblocked = 1;
1882 }
1883 }
1884
1885 /*
1886 * If a send attempt is made on a socket that has been marked
1887 * as inactive (disconnected), reject the request.
1888 */
1889 if (so->so_flags & SOF_DEFUNCT) {
1890 defunct:
1891 error = EPIPE;
1892 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1893 __func__, proc_selfpid(), proc_best_name(current_proc()),
1894 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1895 SOCK_DOM(so), SOCK_TYPE(so), error);
1896 return (error);
1897 }
1898
1899 if (so->so_state & SS_CANTSENDMORE) {
1900 #if CONTENT_FILTER
1901 /*
1902 * Can re-inject data of half closed connections
1903 */
1904 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1905 so->so_snd.sb_cfil_thread == current_thread() &&
1906 cfil_sock_data_pending(&so->so_snd) != 0)
1907 CFIL_LOG(LOG_INFO,
1908 "so %llx ignore SS_CANTSENDMORE",
1909 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1910 else
1911 #endif /* CONTENT_FILTER */
1912 return (EPIPE);
1913 }
1914 if (so->so_error) {
1915 error = so->so_error;
1916 so->so_error = 0;
1917 return (error);
1918 }
1919
1920 if ((so->so_state & SS_ISCONNECTED) == 0) {
1921 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1922 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1923 (resid != 0 || clen == 0) &&
1924 !(so->so_flags1 & SOF1_PRECONNECT_DATA))
1925 return (ENOTCONN);
1926
1927 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1928 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1929 ENOTCONN : EDESTADDRREQ);
1930 }
1931 }
1932
1933 if (so->so_flags & SOF_ENABLE_MSGS)
1934 space = msgq_sbspace(so, control);
1935 else
1936 space = sbspace(&so->so_snd);
1937
1938 if (flags & MSG_OOB)
1939 space += 1024;
1940 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1941 clen > so->so_snd.sb_hiwat)
1942 return (EMSGSIZE);
1943
1944 if ((space < resid + clen &&
1945 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1946 space < clen)) ||
1947 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1948 /*
1949 * don't block the connectx call when there's more data
1950 * than can be copied.
1951 */
1952 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1953 if (space == 0) {
1954 return (EWOULDBLOCK);
1955 }
1956 if (space < (int32_t)so->so_snd.sb_lowat) {
1957 return (0);
1958 }
1959 }
1960 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1961 assumelock) {
1962 return (EWOULDBLOCK);
1963 }
1964 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1965 *sblocked = 0;
1966 error = sbwait(&so->so_snd);
1967 if (error) {
1968 if (so->so_flags & SOF_DEFUNCT)
1969 goto defunct;
1970 return (error);
1971 }
1972 goto restart;
1973 }
1974 return (0);
1975 }
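
/*
 * Illustrative sketch, not part of the original file: the calling pattern
 * sosendcheck() expects, mirroring what sosend() does below.  The function
 * name is hypothetical and error handling is reduced to the essentials.
 */
#if 0	/* example only */
static int
example_send_loop(struct socket *so, user_ssize_t resid, int flags)
{
	int sblocked = 0;	/* set by sosendcheck() once so_snd is locked */
	int error;

	socket_lock(so, 1);
	do {
		/* May sleep in sbwait() for space unless SS_NBIO/MSG_NBIO. */
		error = sosendcheck(so, NULL, resid, 0, 0, flags,
		    &sblocked, NULL);
		if (error != 0)
			break;
		/* ... build an mbuf chain and hand it to pru_send ... */
		resid = 0;
	} while (resid > 0);

	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* also unlocks the socket */
	else
		socket_unlock(so, 1);
	return (error);
}
#endif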
1976
1977 /*
1978 * Send on a socket.
1979 * If send must go all at once and message is larger than
1980 * send buffering, then hard error.
1981 * Lock against other senders.
1982 * If must go all at once and not enough room now, then
1983 * inform user that this would block and do nothing.
1984 * Otherwise, if nonblocking, send as much as possible.
1985 * The data to be sent is described by "uio" if nonzero,
1986 * otherwise by the mbuf chain "top" (which must be null
1987 * if uio is not). Data provided in mbuf chain must be small
1988 * enough to send all at once.
1989 *
1990 * Returns nonzero on error, timeout or signal; callers
1991 * must check for short counts if EINTR/ERESTART are returned.
1992 * Data and control buffers are freed on return.
1993 * Experiment:
1994 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1995 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1996 * point at the mbuf chain being constructed and go from there.
1997 *
1998 * Returns: 0 Success
1999 * EOPNOTSUPP
2000 * EINVAL
2001 * ENOBUFS
2002 * uiomove:EFAULT
2003 * sosendcheck:EPIPE
2004 * sosendcheck:EWOULDBLOCK
2005 * sosendcheck:EINTR
2006 * sosendcheck:EBADF
2007 * sosendcheck:EINTR
2008 * sosendcheck:??? [value from so_error]
2009 * <pru_send>:ECONNRESET[TCP]
2010 * <pru_send>:EINVAL[TCP]
2011 * <pru_send>:ENOBUFS[TCP]
2012 * <pru_send>:EADDRINUSE[TCP]
2013 * <pru_send>:EADDRNOTAVAIL[TCP]
2014 * <pru_send>:EAFNOSUPPORT[TCP]
2015 * <pru_send>:EACCES[TCP]
2016 * <pru_send>:EAGAIN[TCP]
2017 * <pru_send>:EPERM[TCP]
2018 * <pru_send>:EMSGSIZE[TCP]
2019 * <pru_send>:EHOSTUNREACH[TCP]
2020 * <pru_send>:ENETUNREACH[TCP]
2021 * <pru_send>:ENETDOWN[TCP]
2022 * <pru_send>:ENOMEM[TCP]
2023 * <pru_send>:ENOBUFS[TCP]
2024 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2025 * <pru_send>:EINVAL[AF_UNIX]
2026 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2027 * <pru_send>:EPIPE[AF_UNIX]
2028 * <pru_send>:ENOTCONN[AF_UNIX]
2029 * <pru_send>:EISCONN[AF_UNIX]
2030 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2031 * <sf_data_out>:??? [whatever a filter author chooses]
2032 *
2033 * Notes: Other <pru_send> returns depend on the protocol family; all
2034 * <sf_data_out> returns depend on what the filter author causes
2035 * their filter to return.
2036 */
2037 int
2038 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2039 struct mbuf *top, struct mbuf *control, int flags)
2040 {
2041 struct mbuf **mp;
2042 struct mbuf *m, *freelist = NULL;
2043 user_ssize_t space, len, resid, orig_resid;
2044 int clen = 0, error, dontroute, mlen, sendflags;
2045 int atomic = sosendallatonce(so) || top;
2046 int sblocked = 0;
2047 struct proc *p = current_proc();
2048 struct mbuf *control_copy = NULL;
2049 uint16_t headroom = 0;
2050 boolean_t en_tracing = FALSE;
2051
2052 if (uio != NULL)
2053 resid = uio_resid(uio);
2054 else
2055 resid = top->m_pkthdr.len;
2056
2057 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2058 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2059
2060 socket_lock(so, 1);
2061
2062 /*
2063 * trace only if tracing is enabled, this is a network (vs. unix)
2064 * socket, and it is non-loopback
2065 */
2066 if (ENTR_SHOULDTRACE &&
2067 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2068 struct inpcb *inp = sotoinpcb(so);
2069 if (inp->inp_last_outifp != NULL &&
2070 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2071 en_tracing = TRUE;
2072 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2073 VM_KERNEL_ADDRPERM(so),
2074 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2075 (int64_t)resid);
2076 orig_resid = resid;
2077 }
2078 }
2079
2080 /*
2081 * Re-injection should not affect process accounting
2082 */
2083 if ((flags & MSG_SKIPCFIL) == 0) {
2084 so_update_last_owner_locked(so, p);
2085 so_update_policy(so);
2086
2087 #if NECP
2088 so_update_necp_policy(so, NULL, addr);
2089 #endif /* NECP */
2090 }
2091
2092 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2093 error = EOPNOTSUPP;
2094 goto out_locked;
2095 }
2096
2097 /*
2098 * In theory resid should be unsigned.
2099 * However, space must be signed, as it might be less than 0
2100 * if we over-committed, and we must use a signed comparison
2101 * of space and resid. On the other hand, a negative resid
2102 * causes us to loop sending 0-length segments to the protocol.
2103 *
2104 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2105 * But it will be used by sockets doing message delivery.
2106 *
2107 * Note: We limit resid to be a positive int value as we use
2108 * imin() to set bytes_to_copy -- radr://14558484
2109 */
2110 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2111 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2112 error = EINVAL;
2113 goto out_locked;
2114 }
2115
2116 dontroute = (flags & MSG_DONTROUTE) &&
2117 (so->so_options & SO_DONTROUTE) == 0 &&
2118 (so->so_proto->pr_flags & PR_ATOMIC);
2119 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2120
2121 if (control != NULL)
2122 clen = control->m_len;
2123
2124 if (soreserveheadroom != 0)
2125 headroom = so->so_pktheadroom;
2126
2127 do {
2128 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2129 &sblocked, control);
2130 if (error)
2131 goto out_locked;
2132
2133 mp = &top;
2134 if (so->so_flags & SOF_ENABLE_MSGS)
2135 space = msgq_sbspace(so, control);
2136 else
2137 space = sbspace(&so->so_snd) - clen;
2138 space += ((flags & MSG_OOB) ? 1024 : 0);
2139
2140 do {
2141 if (uio == NULL) {
2142 /*
2143 * Data is prepackaged in "top".
2144 */
2145 resid = 0;
2146 if (flags & MSG_EOR)
2147 top->m_flags |= M_EOR;
2148 } else {
2149 int chainlength;
2150 int bytes_to_copy;
2151 boolean_t jumbocl;
2152 boolean_t bigcl;
2153 int bytes_to_alloc;
2154
2155 bytes_to_copy = imin(resid, space);
2156
2157 bytes_to_alloc = bytes_to_copy;
2158 if (top == NULL)
2159 bytes_to_alloc += headroom;
2160
2161 if (sosendminchain > 0)
2162 chainlength = 0;
2163 else
2164 chainlength = sosendmaxchain;
2165
2166 /*
2167 * Use big 4 KB clusters when the outgoing interface
2168 * does not prefer 2 KB clusters
2169 */
2170 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2171 sosendbigcl_ignore_capab;
2172
2173 /*
2174 * Attempt to use larger than system page-size
2175 * clusters for large writes only if there is
2176 * a jumbo cluster pool and if the socket is
2177 * marked accordingly.
2178 */
2179 jumbocl = sosendjcl && njcl > 0 &&
2180 ((so->so_flags & SOF_MULTIPAGES) ||
2181 sosendjcl_ignore_capab) &&
2182 bigcl;
2183
2184 socket_unlock(so, 0);
2185
2186 do {
2187 int num_needed;
2188 int hdrs_needed = (top == NULL) ? 1 : 0;
2189
2190 /*
2191 * Try to maintain a local cache of the
2192 * mbuf clusters needed to complete this
2193 * write. The list is further limited to
2194 * the number that are currently needed
2195 * to fill the socket. This mechanism
2196 * allows a large number of mbufs/
2197 * clusters to be grabbed under a single
2198 * mbuf lock... if we can't get any
2199 * clusters, then fall back to trying
2200 * for mbufs. If we fail early (or
2201 * miscalculate the number needed), make
2202 * sure to release any clusters we
2203 * haven't yet consumed.
2204 */
2205 if (freelist == NULL &&
2206 bytes_to_alloc > MBIGCLBYTES &&
2207 jumbocl) {
2208 num_needed =
2209 bytes_to_alloc / M16KCLBYTES;
2210
2211 if ((bytes_to_alloc -
2212 (num_needed * M16KCLBYTES))
2213 >= MINCLSIZE)
2214 num_needed++;
2215
2216 freelist =
2217 m_getpackets_internal(
2218 (unsigned int *)&num_needed,
2219 hdrs_needed, M_WAIT, 0,
2220 M16KCLBYTES);
2221 /*
2222 * Fall back to 4K cluster size
2223 * if allocation failed
2224 */
2225 }
2226
2227 if (freelist == NULL &&
2228 bytes_to_alloc > MCLBYTES &&
2229 bigcl) {
2230 num_needed =
2231 bytes_to_alloc / MBIGCLBYTES;
2232
2233 if ((bytes_to_alloc -
2234 (num_needed * MBIGCLBYTES)) >=
2235 MINCLSIZE)
2236 num_needed++;
2237
2238 freelist =
2239 m_getpackets_internal(
2240 (unsigned int *)&num_needed,
2241 hdrs_needed, M_WAIT, 0,
2242 MBIGCLBYTES);
2243 /*
2244 * Fall back to cluster size
2245 * if allocation failed
2246 */
2247 }
2248
2249 /*
2250 * Allocate a cluster, as we want to
2251 * avoid splitting the data into more
2252 * than one segment; using MINCLSIZE
2253 * would lead us to allocate two mbufs.
2254 */
2255 if (soreserveheadroom != 0 &&
2256 freelist == NULL &&
2257 ((top == NULL &&
2258 bytes_to_alloc > _MHLEN) ||
2259 bytes_to_alloc > _MLEN)) {
2260 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2261 MCLBYTES;
2262 freelist =
2263 m_getpackets_internal(
2264 (unsigned int *)&num_needed,
2265 hdrs_needed, M_WAIT, 0,
2266 MCLBYTES);
2267 /*
2268 * Fall back to a single mbuf
2269 * if allocation failed
2270 */
2271 } else if (freelist == NULL &&
2272 bytes_to_alloc > MINCLSIZE) {
2273 num_needed =
2274 bytes_to_alloc / MCLBYTES;
2275
2276 if ((bytes_to_alloc -
2277 (num_needed * MCLBYTES)) >=
2278 MINCLSIZE)
2279 num_needed++;
2280
2281 freelist =
2282 m_getpackets_internal(
2283 (unsigned int *)&num_needed,
2284 hdrs_needed, M_WAIT, 0,
2285 MCLBYTES);
2286 /*
2287 * Fall back to a single mbuf
2288 * if allocation failed
2289 */
2290 }
2291 /*
2292 * For datagram protocols, leave
2293 * headroom for protocol headers
2294 * in the first cluster of the chain
2295 */
2296 if (freelist != NULL && atomic &&
2297 top == NULL && headroom > 0) {
2298 freelist->m_data += headroom;
2299 }
2300
2301 /*
2302 * Fall back to regular mbufs without
2303 * reserving the socket headroom
2304 */
2305 if (freelist == NULL) {
2306 if (top == NULL)
2307 MGETHDR(freelist,
2308 M_WAIT, MT_DATA);
2309 else
2310 MGET(freelist,
2311 M_WAIT, MT_DATA);
2312
2313 if (freelist == NULL) {
2314 error = ENOBUFS;
2315 socket_lock(so, 0);
2316 goto out_locked;
2317 }
2318 /*
2319 * For datagram protocols,
2320 * leave room for protocol
2321 * headers in first mbuf.
2322 */
2323 if (atomic && top == NULL &&
2324 bytes_to_copy < MHLEN) {
2325 MH_ALIGN(freelist,
2326 bytes_to_copy);
2327 }
2328 }
2329 m = freelist;
2330 freelist = m->m_next;
2331 m->m_next = NULL;
2332
2333 if ((m->m_flags & M_EXT))
2334 mlen = m->m_ext.ext_size -
2335 m_leadingspace(m);
2336 else if ((m->m_flags & M_PKTHDR))
2337 mlen =
2338 MHLEN - m_leadingspace(m);
2339 else
2340 mlen = MLEN - m_leadingspace(m);
2341 len = imin(mlen, bytes_to_copy);
2342
2343 chainlength += len;
2344
2345 space -= len;
2346
2347 error = uiomove(mtod(m, caddr_t),
2348 len, uio);
2349
2350 resid = uio_resid(uio);
2351
2352 m->m_len = len;
2353 *mp = m;
2354 top->m_pkthdr.len += len;
2355 if (error)
2356 break;
2357 mp = &m->m_next;
2358 if (resid <= 0) {
2359 if (flags & MSG_EOR)
2360 top->m_flags |= M_EOR;
2361 break;
2362 }
2363 bytes_to_copy = min(resid, space);
2364
2365 } while (space > 0 &&
2366 (chainlength < sosendmaxchain || atomic ||
2367 resid < MINCLSIZE));
2368
2369 socket_lock(so, 0);
2370
2371 if (error)
2372 goto out_locked;
2373 }
2374
2375 if (flags & (MSG_HOLD|MSG_SEND)) {
2376 /* Enqueue for later, go away if HOLD */
2377 struct mbuf *mb1;
2378 if (so->so_temp && (flags & MSG_FLUSH)) {
2379 m_freem(so->so_temp);
2380 so->so_temp = NULL;
2381 }
2382 if (so->so_temp)
2383 so->so_tail->m_next = top;
2384 else
2385 so->so_temp = top;
2386 mb1 = top;
2387 while (mb1->m_next)
2388 mb1 = mb1->m_next;
2389 so->so_tail = mb1;
2390 if (flags & MSG_HOLD) {
2391 top = NULL;
2392 goto out_locked;
2393 }
2394 top = so->so_temp;
2395 }
2396 if (dontroute)
2397 so->so_options |= SO_DONTROUTE;
2398
2399 /*
2400 * Compute flags here, for pru_send and NKEs
2401 *
2402 * If the user set MSG_EOF, the protocol
2403 * understands this flag, and there is nothing left to
2404 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2405 */
2406 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2407 ((flags & MSG_EOF) &&
2408 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2409 (resid <= 0)) ? PRUS_EOF :
2410 /* If there is more to send set PRUS_MORETOCOME */
2411 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2412
2413 if ((flags & MSG_SKIPCFIL) == 0) {
2414 /*
2415 * Socket filter processing
2416 */
2417 error = sflt_data_out(so, addr, &top,
2418 &control, (sendflags & MSG_OOB) ?
2419 sock_data_filt_flag_oob : 0);
2420 if (error) {
2421 if (error == EJUSTRETURN) {
2422 error = 0;
2423 clen = 0;
2424 control = NULL;
2425 top = NULL;
2426 }
2427 goto out_locked;
2428 }
2429 #if CONTENT_FILTER
2430 /*
2431 * Content filter processing
2432 */
2433 error = cfil_sock_data_out(so, addr, top,
2434 control, (sendflags & MSG_OOB) ?
2435 sock_data_filt_flag_oob : 0);
2436 if (error) {
2437 if (error == EJUSTRETURN) {
2438 error = 0;
2439 clen = 0;
2440 control = NULL;
2441 top = NULL;
2442 }
2443 goto out_locked;
2444 }
2445 #endif /* CONTENT_FILTER */
2446 }
2447 if (so->so_flags & SOF_ENABLE_MSGS) {
2448 /*
2449 * Make a copy of control mbuf,
2450 * so that msg priority can be
2451 * passed to subsequent mbufs.
2452 */
2453 control_copy = m_dup(control, M_NOWAIT);
2454 }
2455 error = (*so->so_proto->pr_usrreqs->pru_send)
2456 (so, sendflags, top, addr, control, p);
2457
2458 if (flags & MSG_SEND)
2459 so->so_temp = NULL;
2460
2461 if (dontroute)
2462 so->so_options &= ~SO_DONTROUTE;
2463
2464 clen = 0;
2465 control = control_copy;
2466 control_copy = NULL;
2467 top = NULL;
2468 mp = &top;
2469 if (error)
2470 goto out_locked;
2471 } while (resid && space > 0);
2472 } while (resid);
2473
2474 out_locked:
2475 if (sblocked)
2476 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2477 else
2478 socket_unlock(so, 1);
2479 if (top != NULL)
2480 m_freem(top);
2481 if (control != NULL)
2482 m_freem(control);
2483 if (freelist != NULL)
2484 m_freem_list(freelist);
2485 if (control_copy != NULL)
2486 m_freem(control_copy);
2487
2488 soclearfastopen(so);
2489
2490 if (en_tracing) {
2491 /* resid passed here is the bytes left in uio */
2492 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2493 VM_KERNEL_ADDRPERM(so),
2494 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2495 (int64_t)(orig_resid - resid));
2496 }
2497 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2498 so->so_snd.sb_cc, space, error);
2499
2500 return (error);
2501 }
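
/*
 * Illustrative sketch, not part of the original file: how the
 * EWOULDBLOCK/EPIPE returns documented above surface to a user-space
 * caller doing a non-blocking send(2).  Plain POSIX; the helper name is
 * hypothetical.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t
example_send_all(int s, const char *buf, size_t len)
{
	size_t off = 0;

	/* O_NONBLOCK sets SS_NBIO, which sosendcheck() tests above. */
	fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK);

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, 0);
		if (n >= 0) {
			off += (size_t)n;
			continue;
		}
		if (errno == EWOULDBLOCK || errno == EAGAIN)
			continue;	/* a real caller would poll()/kqueue */
		return (-1);		/* e.g. EPIPE once SS_CANTSENDMORE */
	}
	return ((ssize_t)off);
}
#endif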
2502
2503 /*
2504 * Supports only connected sockets (no address) without ancillary data
2505 * (control mbuf), for atomic protocols
2506 */
2507 int
2508 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2509 {
2510 struct mbuf *m, *freelist = NULL;
2511 user_ssize_t len, resid;
2512 int error, dontroute, mlen;
2513 int atomic = sosendallatonce(so);
2514 int sblocked = 0;
2515 struct proc *p = current_proc();
2516 u_int uiofirst = 0;
2517 u_int uiolast = 0;
2518 struct mbuf *top = NULL;
2519 uint16_t headroom = 0;
2520 boolean_t bigcl;
2521
2522 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2523 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2524
2525 if (so->so_type != SOCK_DGRAM) {
2526 error = EINVAL;
2527 goto out;
2528 }
2529 if (atomic == 0) {
2530 error = EINVAL;
2531 goto out;
2532 }
2533 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2534 error = EPROTONOSUPPORT;
2535 goto out;
2536 }
2537 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2538 error = EINVAL;
2539 goto out;
2540 }
2541 resid = uio_array_resid(uioarray, uiocnt);
2542
2543 /*
2544 * In theory resid should be unsigned.
2545 * However, space must be signed, as it might be less than 0
2546 * if we over-committed, and we must use a signed comparison
2547 * of space and resid. On the other hand, a negative resid
2548 * causes us to loop sending 0-length segments to the protocol.
2549 *
2550 * Note: We limit resid to be a positive int value as we use
2551 * imin() to set bytes_to_copy -- radr://14558484
2552 */
2553 if (resid < 0 || resid > INT_MAX) {
2554 error = EINVAL;
2555 goto out;
2556 }
2557
2558 socket_lock(so, 1);
2559 so_update_last_owner_locked(so, p);
2560 so_update_policy(so);
2561
2562 #if NECP
2563 so_update_necp_policy(so, NULL, NULL);
2564 #endif /* NECP */
2565
2566 dontroute = (flags & MSG_DONTROUTE) &&
2567 (so->so_options & SO_DONTROUTE) == 0 &&
2568 (so->so_proto->pr_flags & PR_ATOMIC);
2569 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2570
2571 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2572 &sblocked, NULL);
2573 if (error)
2574 goto release;
2575
2576 /*
2577 * Use big 4 KB clusters when the outgoing interface does not prefer
2578 * 2 KB clusters
2579 */
2580 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2581
2582 if (soreserveheadroom != 0)
2583 headroom = so->so_pktheadroom;
2584
2585 do {
2586 int i;
2587 int num_needed = 0;
2588 int chainlength;
2589 size_t maxpktlen = 0;
2590 int bytes_to_alloc;
2591
2592 if (sosendminchain > 0)
2593 chainlength = 0;
2594 else
2595 chainlength = sosendmaxchain;
2596
2597 socket_unlock(so, 0);
2598
2599 /*
2600 * Find a set of uio that fit in a reasonable number
2601 * of mbuf packets
2602 */
2603 for (i = uiofirst; i < uiocnt; i++) {
2604 struct uio *auio = uioarray[i];
2605
2606 len = uio_resid(auio);
2607
2608 /* Do nothing for empty messages */
2609 if (len == 0)
2610 continue;
2611
2612 num_needed += 1;
2613 uiolast += 1;
2614
2615 if (len > maxpktlen)
2616 maxpktlen = len;
2617
2618 chainlength += len;
2619 if (chainlength > sosendmaxchain)
2620 break;
2621 }
2622 /*
2623 * Nothing left to send
2624 */
2625 if (num_needed == 0) {
2626 socket_lock(so, 0);
2627 break;
2628 }
2629 /*
2630 * Allocate a buffer large enough to include headroom space for
2631 * the network and link headers
2632 *
2633 */
2634 bytes_to_alloc = maxpktlen + headroom;
2635
2636 /*
2637 * Allocate a single contiguous buffer of the smallest available
2638 * size when possible
2639 */
2640 if (bytes_to_alloc > MCLBYTES &&
2641 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2642 freelist = m_getpackets_internal(
2643 (unsigned int *)&num_needed,
2644 num_needed, M_WAIT, 1,
2645 MBIGCLBYTES);
2646 } else if (bytes_to_alloc > _MHLEN &&
2647 bytes_to_alloc <= MCLBYTES) {
2648 freelist = m_getpackets_internal(
2649 (unsigned int *)&num_needed,
2650 num_needed, M_WAIT, 1,
2651 MCLBYTES);
2652 } else {
2653 freelist = m_allocpacket_internal(
2654 (unsigned int *)&num_needed,
2655 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2656 }
2657
2658 if (freelist == NULL) {
2659 socket_lock(so, 0);
2660 error = ENOMEM;
2661 goto release;
2662 }
2663 /*
2664 * Copy each uio of the set into its own mbuf packet
2665 */
2666 for (i = uiofirst, m = freelist;
2667 i < uiolast && m != NULL;
2668 i++) {
2669 int bytes_to_copy;
2670 struct mbuf *n;
2671 struct uio *auio = uioarray[i];
2672
2673 bytes_to_copy = uio_resid(auio);
2674
2675 /* Do nothing for empty messages */
2676 if (bytes_to_copy == 0)
2677 continue;
2678 /*
2679 * Leave headroom for protocol headers
2680 * in the first mbuf of the chain
2681 */
2682 m->m_data += headroom;
2683
2684 for (n = m; n != NULL; n = n->m_next) {
2685 if ((m->m_flags & M_EXT))
2686 mlen = m->m_ext.ext_size -
2687 m_leadingspace(m);
2688 else if ((m->m_flags & M_PKTHDR))
2689 mlen =
2690 MHLEN - m_leadingspace(m);
2691 else
2692 mlen = MLEN - m_leadingspace(m);
2693 len = imin(mlen, bytes_to_copy);
2694
2695 /*
2696 * Note: uiomove() decrements the iovec
2697 * length
2698 */
2699 error = uiomove(mtod(n, caddr_t),
2700 len, auio);
2701 if (error != 0)
2702 break;
2703 n->m_len = len;
2704 m->m_pkthdr.len += len;
2705
2706 VERIFY(m->m_pkthdr.len <= maxpktlen);
2707
2708 bytes_to_copy -= len;
2709 resid -= len;
2710 }
2711 if (m->m_pkthdr.len == 0) {
2712 printf(
2713 "%s:%d so %llx pkt %llx type %u len null\n",
2714 __func__, __LINE__,
2715 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2716 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2717 m->m_type);
2718 }
2719 if (error != 0)
2720 break;
2721 m = m->m_nextpkt;
2722 }
2723
2724 socket_lock(so, 0);
2725
2726 if (error)
2727 goto release;
2728 top = freelist;
2729 freelist = NULL;
2730
2731 if (dontroute)
2732 so->so_options |= SO_DONTROUTE;
2733
2734 if ((flags & MSG_SKIPCFIL) == 0) {
2735 struct mbuf **prevnextp = NULL;
2736
2737 for (i = uiofirst, m = top;
2738 i < uiolast && m != NULL;
2739 i++) {
2740 struct mbuf *nextpkt = m->m_nextpkt;
2741
2742 /*
2743 * Socket filter processing
2744 */
2745 error = sflt_data_out(so, NULL, &m,
2746 NULL, 0);
2747 if (error != 0 && error != EJUSTRETURN)
2748 goto release;
2749
2750 #if CONTENT_FILTER
2751 if (error == 0) {
2752 /*
2753 * Content filter processing
2754 */
2755 error = cfil_sock_data_out(so, NULL, m,
2756 NULL, 0);
2757 if (error != 0 && error != EJUSTRETURN)
2758 goto release;
2759 }
2760 #endif /* CONTENT_FILTER */
2761 /*
2762 * Remove packet from the list when
2763 * swallowed by a filter
2764 */
2765 if (error == EJUSTRETURN) {
2766 error = 0;
2767 if (prevnextp != NULL)
2768 *prevnextp = nextpkt;
2769 else
2770 top = nextpkt;
2771 }
2772
2773 m = nextpkt;
2774 if (m != NULL)
2775 prevnextp = &m->m_nextpkt;
2776 }
2777 }
2778 if (top != NULL)
2779 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2780 (so, 0, top, NULL, NULL, p);
2781
2782 if (dontroute)
2783 so->so_options &= ~SO_DONTROUTE;
2784
2785 top = NULL;
2786 uiofirst = uiolast;
2787 } while (resid > 0 && error == 0);
2788 release:
2789 if (sblocked)
2790 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2791 else
2792 socket_unlock(so, 1);
2793 out:
2794 if (top != NULL)
2795 m_freem(top);
2796 if (freelist != NULL)
2797 m_freem_list(freelist);
2798
2799 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2800 so->so_snd.sb_cc, 0, error);
2801
2802 return (error);
2803 }
2804
2805 /*
2806 * May return ERESTART when packet is dropped by MAC policy check
2807 */
2808 static int
2809 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2810 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2811 {
2812 int error = 0;
2813 struct mbuf *m = *mp;
2814 struct mbuf *nextrecord = *nextrecordp;
2815
2816 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2817 #if CONFIG_MACF_SOCKET_SUBSET
2818 /*
2819 * Call the MAC framework for policy checking if we're in
2820 * the user process context and the socket isn't connected.
2821 */
2822 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2823 struct mbuf *m0 = m;
2824 /*
2825 * Dequeue this record (temporarily) from the receive
2826 * list since we're about to drop the socket's lock,
2827 * during which a new record may arrive and be appended to
2828 * the list. Upon MAC policy failure, the record
2829 * will be freed. Otherwise, we'll add it back to
2830 * the head of the list. We cannot rely on SB_LOCK
2831 * because append operation uses the socket's lock.
2832 */
2833 do {
2834 m->m_nextpkt = NULL;
2835 sbfree(&so->so_rcv, m);
2836 m = m->m_next;
2837 } while (m != NULL);
2838 m = m0;
2839 so->so_rcv.sb_mb = nextrecord;
2840 SB_EMPTY_FIXUP(&so->so_rcv);
2841 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2842 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2843 socket_unlock(so, 0);
2844
2845 if (mac_socket_check_received(proc_ucred(p), so,
2846 mtod(m, struct sockaddr *)) != 0) {
2847 /*
2848 * MAC policy failure; free this record and
2849 * process the next record (or block until
2850 * one is available). We have adjusted sb_cc
2851 * and sb_mbcnt above so there is no need to
2852 * call sbfree() again.
2853 */
2854 m_freem(m);
2855 /*
2856 * Clear SB_LOCK but don't unlock the socket.
2857 * Process the next record or wait for one.
2858 */
2859 socket_lock(so, 0);
2860 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2861 error = ERESTART;
2862 goto done;
2863 }
2864 socket_lock(so, 0);
2865 /*
2866 * If the socket has been defunct'd, drop it.
2867 */
2868 if (so->so_flags & SOF_DEFUNCT) {
2869 m_freem(m);
2870 error = ENOTCONN;
2871 goto done;
2872 }
2873 /*
2874 * Re-adjust the socket receive list and re-enqueue
2875 * the record in front of any packets which may have
2876 * been appended while we dropped the lock.
2877 */
2878 for (m = m0; m->m_next != NULL; m = m->m_next)
2879 sballoc(&so->so_rcv, m);
2880 sballoc(&so->so_rcv, m);
2881 if (so->so_rcv.sb_mb == NULL) {
2882 so->so_rcv.sb_lastrecord = m0;
2883 so->so_rcv.sb_mbtail = m;
2884 }
2885 m = m0;
2886 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2887 so->so_rcv.sb_mb = m;
2888 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2889 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2890 }
2891 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2892 if (psa != NULL) {
2893 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2894 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2895 error = EWOULDBLOCK;
2896 goto done;
2897 }
2898 }
2899 if (flags & MSG_PEEK) {
2900 m = m->m_next;
2901 } else {
2902 sbfree(&so->so_rcv, m);
2903 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2904 panic("%s: about to create invalid socketbuf",
2905 __func__);
2906 /* NOTREACHED */
2907 }
2908 MFREE(m, so->so_rcv.sb_mb);
2909 m = so->so_rcv.sb_mb;
2910 if (m != NULL) {
2911 m->m_nextpkt = nextrecord;
2912 } else {
2913 so->so_rcv.sb_mb = nextrecord;
2914 SB_EMPTY_FIXUP(&so->so_rcv);
2915 }
2916 }
2917 done:
2918 *mp = m;
2919 *nextrecordp = nextrecord;
2920
2921 return (error);
2922 }
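
/*
 * Illustrative sketch, not part of the original file: the MT_SONAME record
 * that soreceive_addr() peels off is what user space gets back through the
 * address arguments of recvfrom(2).  Plain POSIX; the helper name is
 * hypothetical.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <netinet/in.h>

static ssize_t
example_recvfrom(int udp_sock, void *buf, size_t len)
{
	struct sockaddr_in from;
	socklen_t fromlen = sizeof (from);

	/* "from" is filled from the record's MT_SONAME mbuf. */
	return (recvfrom(udp_sock, buf, len, 0,
	    (struct sockaddr *)&from, &fromlen));
}
#endif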
2923
2924 /*
2925 * Process one or more MT_CONTROL mbufs present before any data mbufs
2926 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2927 * just copy the data; if !MSG_PEEK, we call into the protocol to
2928 * perform externalization.
2929 */
2930 static int
2931 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2932 struct mbuf **mp, struct mbuf **nextrecordp)
2933 {
2934 int error = 0;
2935 struct mbuf *cm = NULL, *cmn;
2936 struct mbuf **cme = &cm;
2937 struct sockbuf *sb_rcv = &so->so_rcv;
2938 struct mbuf **msgpcm = NULL;
2939 struct mbuf *m = *mp;
2940 struct mbuf *nextrecord = *nextrecordp;
2941 struct protosw *pr = so->so_proto;
2942
2943 /*
2944 * Externalizing the control messages would require us to
2945 * drop the socket's lock below. Once we re-acquire the
2946 * lock, the mbuf chain might change. In order to preserve
2947 * consistency, we unlink all control messages from the
2948 * first mbuf chain in one shot and link them separately
2949 * onto a different chain.
2950 */
2951 do {
2952 if (flags & MSG_PEEK) {
2953 if (controlp != NULL) {
2954 if (*controlp == NULL) {
2955 msgpcm = controlp;
2956 }
2957 *controlp = m_copy(m, 0, m->m_len);
2958
2959 /*
2960 * If we failed to allocate an mbuf,
2961 * release any previously allocated
2962 * mbufs for control data. Return
2963 * an error. Keep the mbufs in the
2964 * socket as this is using
2965 * MSG_PEEK flag.
2966 */
2967 if (*controlp == NULL) {
2968 m_freem(*msgpcm);
2969 error = ENOBUFS;
2970 goto done;
2971 }
2972 controlp = &(*controlp)->m_next;
2973 }
2974 m = m->m_next;
2975 } else {
2976 m->m_nextpkt = NULL;
2977 sbfree(sb_rcv, m);
2978 sb_rcv->sb_mb = m->m_next;
2979 m->m_next = NULL;
2980 *cme = m;
2981 cme = &(*cme)->m_next;
2982 m = sb_rcv->sb_mb;
2983 }
2984 } while (m != NULL && m->m_type == MT_CONTROL);
2985
2986 if (!(flags & MSG_PEEK)) {
2987 if (sb_rcv->sb_mb != NULL) {
2988 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2989 } else {
2990 sb_rcv->sb_mb = nextrecord;
2991 SB_EMPTY_FIXUP(sb_rcv);
2992 }
2993 if (nextrecord == NULL)
2994 sb_rcv->sb_lastrecord = m;
2995 }
2996
2997 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2998 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2999
3000 while (cm != NULL) {
3001 int cmsg_type;
3002
3003 cmn = cm->m_next;
3004 cm->m_next = NULL;
3005 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3006
3007 /*
3008 * Call the protocol to externalize SCM_RIGHTS message
3009 * and return the modified message to the caller upon
3010 * success. Otherwise, all other control messages are
3011 * returned unmodified to the caller. Note that we
3012 * only get into this loop if MSG_PEEK is not set.
3013 */
3014 if (pr->pr_domain->dom_externalize != NULL &&
3015 cmsg_type == SCM_RIGHTS) {
3016 /*
3017 * Release socket lock: see 3903171. This
3018 * would also allow more records to be appended
3019 * to the socket buffer. We still have SB_LOCK
3020 * set on it, so we can be sure that the head
3021 * of the mbuf chain won't change.
3022 */
3023 socket_unlock(so, 0);
3024 error = (*pr->pr_domain->dom_externalize)(cm);
3025 socket_lock(so, 0);
3026 } else {
3027 error = 0;
3028 }
3029
3030 if (controlp != NULL && error == 0) {
3031 *controlp = cm;
3032 controlp = &(*controlp)->m_next;
3033 } else {
3034 (void) m_free(cm);
3035 }
3036 cm = cmn;
3037 }
3038 /*
3039 * Update the value of nextrecord in case we received new
3040 * records when the socket was unlocked above for
3041 * externalizing SCM_RIGHTS.
3042 */
3043 if (m != NULL)
3044 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3045 else
3046 nextrecord = sb_rcv->sb_mb;
3047
3048 done:
3049 *mp = m;
3050 *nextrecordp = nextrecord;
3051
3052 return (error);
3053 }
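
/*
 * Illustrative sketch, not part of the original file: receiving a file
 * descriptor passed over an AF_UNIX socket.  The SCM_RIGHTS control mbuf
 * handled by soreceive_ctl() is externalized via the domain's
 * dom_externalize handler and shows up to user space as a cmsghdr.
 * Plain POSIX; the helper name is hypothetical.
 */
#if 0	/* example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
example_recv_fd(int unix_sock)
{
	char data[1];
	char cbuf[CMSG_SPACE(sizeof (int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
	struct msghdr msg;
	struct cmsghdr *cm;
	int fd = -1;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(unix_sock, &msg, 0) < 0)
		return (-1);

	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cm), sizeof (fd));
			break;
		}
	}
	return (fd);
}
#endif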
3054
3055 /*
3056 * Implement receive operations on a socket.
3057 * We depend on the way that records are added to the sockbuf
3058 * by sbappend*. In particular, each record (mbufs linked through m_next)
3059 * must begin with an address if the protocol so specifies,
3060 * followed by an optional mbuf or mbufs containing ancillary data,
3061 * and then zero or more mbufs of data.
3062 * In order to avoid blocking network interrupts for the entire time here,
3063 * we splx() while doing the actual copy to user space.
3064 * Although the sockbuf is locked, new data may still be appended,
3065 * and thus we must maintain consistency of the sockbuf during that time.
3066 *
3067 * The caller may receive the data as a single mbuf chain by supplying
3068 * an mbuf **mp0 for use in returning the chain. The uio is then used
3069 * only for the count in uio_resid.
3070 *
3071 * Returns: 0 Success
3072 * ENOBUFS
3073 * ENOTCONN
3074 * EWOULDBLOCK
3075 * uiomove:EFAULT
3076 * sblock:EWOULDBLOCK
3077 * sblock:EINTR
3078 * sbwait:EBADF
3079 * sbwait:EINTR
3080 * sodelayed_copy:EFAULT
3081 * <pru_rcvoob>:EINVAL[TCP]
3082 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3083 * <pru_rcvoob>:???
3084 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3085 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3086 * <pr_domain->dom_externalize>:???
3087 *
3088 * Notes: Additional return values from calls through <pru_rcvoob> and
3089 * <pr_domain->dom_externalize> depend on protocols other than
3090 * TCP or AF_UNIX, which are documented above.
3091 */
3092 int
3093 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3094 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3095 {
3096 struct mbuf *m, **mp, *ml = NULL;
3097 struct mbuf *nextrecord, *free_list;
3098 int flags, error, offset;
3099 user_ssize_t len;
3100 struct protosw *pr = so->so_proto;
3101 int moff, type = 0;
3102 user_ssize_t orig_resid = uio_resid(uio);
3103 user_ssize_t delayed_copy_len;
3104 int can_delay;
3105 int need_event;
3106 struct proc *p = current_proc();
3107 boolean_t en_tracing = FALSE;
3108
3109 /*
3110 * Sanity check on the length passed by caller as we are making 'int'
3111 * comparisons
3112 */
3113 if (orig_resid < 0 || orig_resid > INT_MAX)
3114 return (EINVAL);
3115
3116 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3117 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3118 so->so_rcv.sb_hiwat);
3119
3120 socket_lock(so, 1);
3121 so_update_last_owner_locked(so, p);
3122 so_update_policy(so);
3123
3124 #ifdef MORE_LOCKING_DEBUG
3125 if (so->so_usecount == 1) {
3126 panic("%s: so=%x no other reference on socket\n", __func__, so);
3127 /* NOTREACHED */
3128 }
3129 #endif
3130 mp = mp0;
3131 if (psa != NULL)
3132 *psa = NULL;
3133 if (controlp != NULL)
3134 *controlp = NULL;
3135 if (flagsp != NULL)
3136 flags = *flagsp &~ MSG_EOR;
3137 else
3138 flags = 0;
3139
3140 /*
3141 * If a recv attempt is made on a previously-accepted socket
3142 * that has been marked as inactive (disconnected), reject
3143 * the request.
3144 */
3145 if (so->so_flags & SOF_DEFUNCT) {
3146 struct sockbuf *sb = &so->so_rcv;
3147
3148 error = ENOTCONN;
3149 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3150 __func__, proc_pid(p), proc_best_name(p),
3151 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3152 SOCK_DOM(so), SOCK_TYPE(so), error);
3153 /*
3154 * This socket should have been disconnected and flushed
3155 * prior to being returned from sodefunct(); there should
3156 * be no data on its receive list, so panic otherwise.
3157 */
3158 if (so->so_state & SS_DEFUNCT)
3159 sb_empty_assert(sb, __func__);
3160 socket_unlock(so, 1);
3161 return (error);
3162 }
3163
3164 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3165 pr->pr_usrreqs->pru_preconnect) {
3166 /*
3167 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag without
3168 * calling write() right after this. *If* the app calls a read,
3169 * we do not want to block that read indefinitely. Thus,
3170 * we trigger a connect so that the session gets initiated.
3171 */
3172 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3173
3174 if (error) {
3175 socket_unlock(so, 1);
3176 return (error);
3177 }
3178 }
3179
3180 if (ENTR_SHOULDTRACE &&
3181 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3182 /*
3183 * enable energy tracing for inet sockets that go over
3184 * non-loopback interfaces only.
3185 */
3186 struct inpcb *inp = sotoinpcb(so);
3187 if (inp->inp_last_outifp != NULL &&
3188 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3189 en_tracing = TRUE;
3190 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3191 VM_KERNEL_ADDRPERM(so),
3192 ((so->so_state & SS_NBIO) ?
3193 kEnTrFlagNonBlocking : 0),
3194 (int64_t)orig_resid);
3195 }
3196 }
3197
3198 /*
3199 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3200 * regardless of the flags argument. Here is the case where
3201 * out-of-band data is not inline.
3202 */
3203 if ((flags & MSG_OOB) ||
3204 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3205 (so->so_options & SO_OOBINLINE) == 0 &&
3206 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3207 m = m_get(M_WAIT, MT_DATA);
3208 if (m == NULL) {
3209 socket_unlock(so, 1);
3210 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3211 ENOBUFS, 0, 0, 0, 0);
3212 return (ENOBUFS);
3213 }
3214 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3215 if (error)
3216 goto bad;
3217 socket_unlock(so, 0);
3218 do {
3219 error = uiomove(mtod(m, caddr_t),
3220 imin(uio_resid(uio), m->m_len), uio);
3221 m = m_free(m);
3222 } while (uio_resid(uio) && error == 0 && m != NULL);
3223 socket_lock(so, 0);
3224 bad:
3225 if (m != NULL)
3226 m_freem(m);
3227
3228 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3229 if (error == EWOULDBLOCK || error == EINVAL) {
3230 /*
3231 * Let's try to get normal data:
3232 * EWOULDBLOCK: out-of-band data not
3233 * received yet. EINVAL: out-of-band data
3234 * already read.
3235 */
3236 error = 0;
3237 goto nooob;
3238 } else if (error == 0 && flagsp != NULL) {
3239 *flagsp |= MSG_OOB;
3240 }
3241 }
3242 socket_unlock(so, 1);
3243 if (en_tracing) {
3244 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3245 VM_KERNEL_ADDRPERM(so), 0,
3246 (int64_t)(orig_resid - uio_resid(uio)));
3247 }
3248 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3249 0, 0, 0, 0);
3250
3251 return (error);
3252 }
3253 nooob:
3254 if (mp != NULL)
3255 *mp = NULL;
3256
3257 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3258 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3259 }
3260
3261 free_list = NULL;
3262 delayed_copy_len = 0;
3263 restart:
3264 #ifdef MORE_LOCKING_DEBUG
3265 if (so->so_usecount <= 1)
3266 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3267 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3268 #endif
3269 /*
3270 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3271 * and if so just return to the caller. This could happen when
3272 * soreceive() is called by a socket upcall function during the
3273 * time the socket is freed. The socket buffer would have been
3274 * locked across the upcall, therefore we cannot put this thread
3275 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3276 * we may livelock), because the lock on the socket buffer will
3277 * only be released when the upcall routine returns to its caller.
3278 * Because the socket has been officially closed, there can be
3279 * no further read on it.
3280 *
3281 * A multipath subflow socket would have its SS_NOFDREF set by
3282 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3283 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3284 */
3285 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3286 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3287 socket_unlock(so, 1);
3288 return (0);
3289 }
3290
3291 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3292 if (error) {
3293 socket_unlock(so, 1);
3294 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3295 0, 0, 0, 0);
3296 if (en_tracing) {
3297 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3298 VM_KERNEL_ADDRPERM(so), 0,
3299 (int64_t)(orig_resid - uio_resid(uio)));
3300 }
3301 return (error);
3302 }
3303
3304 m = so->so_rcv.sb_mb;
3305 /*
3306 * If we have less data than requested, block awaiting more
3307 * (subject to any timeout) if:
3308 * 1. the current count is less than the low water mark, or
3309 * 2. MSG_WAITALL is set, and it is possible to do the entire
3310 * receive operation at once if we block (resid <= hiwat), and
3311 * 3. MSG_DONTWAIT is not set.
3312 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3313 * we have to do the receive in sections, and thus risk returning
3314 * a short count if a timeout or signal occurs after we start.
3315 */
3316 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3317 so->so_rcv.sb_cc < uio_resid(uio)) &&
3318 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3319 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3320 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3321 /*
3322 * Panic if we notice inconsistencies in the socket's
3323 * receive list; both sb_mb and sb_cc should correctly
3324 * reflect the contents of the list, otherwise we may
3325 * end up with false positives during select() or poll()
3326 * which could put the application in a bad state.
3327 */
3328 SB_MB_CHECK(&so->so_rcv);
3329
3330 if (so->so_error) {
3331 if (m != NULL)
3332 goto dontblock;
3333 error = so->so_error;
3334 if ((flags & MSG_PEEK) == 0)
3335 so->so_error = 0;
3336 goto release;
3337 }
3338 if (so->so_state & SS_CANTRCVMORE) {
3339 #if CONTENT_FILTER
3340 /*
3341 * Deal with half closed connections
3342 */
3343 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3344 cfil_sock_data_pending(&so->so_rcv) != 0)
3345 CFIL_LOG(LOG_INFO,
3346 "so %llx ignore SS_CANTRCVMORE",
3347 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3348 else
3349 #endif /* CONTENT_FILTER */
3350 if (m != NULL)
3351 goto dontblock;
3352 else
3353 goto release;
3354 }
3355 for (; m != NULL; m = m->m_next)
3356 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3357 m = so->so_rcv.sb_mb;
3358 goto dontblock;
3359 }
3360 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3361 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3362 error = ENOTCONN;
3363 goto release;
3364 }
3365 if (uio_resid(uio) == 0)
3366 goto release;
3367
3368 if ((so->so_state & SS_NBIO) ||
3369 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3370 error = EWOULDBLOCK;
3371 goto release;
3372 }
3373 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3374 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3375 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3376 #if EVEN_MORE_LOCKING_DEBUG
3377 if (socket_debug)
3378 printf("Waiting for socket data\n");
3379 #endif
3380
3381 error = sbwait(&so->so_rcv);
3382 #if EVEN_MORE_LOCKING_DEBUG
3383 if (socket_debug)
3384 printf("SORECEIVE - sbwait returned %d\n", error);
3385 #endif
3386 if (so->so_usecount < 1) {
3387 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3388 __func__, so, so->so_usecount);
3389 /* NOTREACHED */
3390 }
3391 if (error) {
3392 socket_unlock(so, 1);
3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3394 0, 0, 0, 0);
3395 if (en_tracing) {
3396 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3397 VM_KERNEL_ADDRPERM(so), 0,
3398 (int64_t)(orig_resid - uio_resid(uio)));
3399 }
3400 return (error);
3401 }
3402 goto restart;
3403 }
3404 dontblock:
3405 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3406 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3407 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3408 nextrecord = m->m_nextpkt;
3409
3410 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3411 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3412 mp0 == NULL);
3413 if (error == ERESTART)
3414 goto restart;
3415 else if (error != 0)
3416 goto release;
3417 orig_resid = 0;
3418 }
3419
3420 /*
3421 * Process one or more MT_CONTROL mbufs present before any data mbufs
3422 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3423 * just copy the data; if !MSG_PEEK, we call into the protocol to
3424 * perform externalization.
3425 */
3426 if (m != NULL && m->m_type == MT_CONTROL) {
3427 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3428 if (error != 0)
3429 goto release;
3430 orig_resid = 0;
3431 }
3432
3433 /*
3434 * If the socket is a TCP socket with message delivery
3435 * enabled, then create a control msg to deliver the
3436 * relative TCP sequence number for this data. Waiting
3437 * until this point will protect against failures to
3438 * allocate an mbuf for control msgs.
3439 */
3440 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3441 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3442 struct mbuf *seq_cm;
3443
3444 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3445 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3446 if (seq_cm == NULL) {
3447 /* unable to allocate a control mbuf */
3448 error = ENOBUFS;
3449 goto release;
3450 }
3451 *controlp = seq_cm;
3452 controlp = &seq_cm->m_next;
3453 }
3454
3455 if (m != NULL) {
3456 if (!(flags & MSG_PEEK)) {
3457 /*
3458 * We get here because m points to an mbuf following
3459 * any MT_SONAME or MT_CONTROL mbufs which have been
3460 * processed above. In any case, m should be pointing
3461 * to the head of the mbuf chain, and the nextrecord
3462 * should be either NULL or equal to m->m_nextpkt.
3463 * See comments above about SB_LOCK.
3464 */
3465 if (m != so->so_rcv.sb_mb ||
3466 m->m_nextpkt != nextrecord) {
3467 panic("%s: post-control !sync so=%p m=%p "
3468 "nextrecord=%p\n", __func__, so, m,
3469 nextrecord);
3470 /* NOTREACHED */
3471 }
3472 if (nextrecord == NULL)
3473 so->so_rcv.sb_lastrecord = m;
3474 }
3475 type = m->m_type;
3476 if (type == MT_OOBDATA)
3477 flags |= MSG_OOB;
3478 } else {
3479 if (!(flags & MSG_PEEK)) {
3480 SB_EMPTY_FIXUP(&so->so_rcv);
3481 }
3482 }
3483 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3484 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3485
3486 moff = 0;
3487 offset = 0;
3488
3489 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3490 can_delay = 1;
3491 else
3492 can_delay = 0;
3493
3494 need_event = 0;
3495
3496 while (m != NULL &&
3497 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3498 if (m->m_type == MT_OOBDATA) {
3499 if (type != MT_OOBDATA)
3500 break;
3501 } else if (type == MT_OOBDATA) {
3502 break;
3503 }
3504 /*
3505 * Make sure to always set the MSG_OOB event when getting
3506 * out-of-band data inline.
3507 */
3508 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3509 (so->so_options & SO_OOBINLINE) != 0 &&
3510 (so->so_state & SS_RCVATMARK) != 0) {
3511 flags |= MSG_OOB;
3512 }
3513 so->so_state &= ~SS_RCVATMARK;
3514 len = uio_resid(uio) - delayed_copy_len;
3515 if (so->so_oobmark && len > so->so_oobmark - offset)
3516 len = so->so_oobmark - offset;
3517 if (len > m->m_len - moff)
3518 len = m->m_len - moff;
3519 /*
3520 * If mp is set, just pass back the mbufs.
3521 * Otherwise copy them out via the uio, then free.
3522 * Sockbuf must be consistent here (points to current mbuf,
3523 * it points to next record) when we drop priority;
3524 * we must note any additions to the sockbuf when we
3525 * block interrupts again.
3526 */
3527 if (mp == NULL) {
3528 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3529 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3530 if (can_delay && len == m->m_len) {
3531 /*
3532 * Only delay the copy if we're consuming the
3533 * mbuf, we're NOT in MSG_PEEK mode, and we
3534 * have enough data to make it worthwhile to
3535 * drop and retake the lock... can_delay
3536 * reflects the state of the two latter
3537 * constraints; moff should always be zero
3538 * in these cases.
3539 */
3540 delayed_copy_len += len;
3541 } else {
3542 if (delayed_copy_len) {
3543 error = sodelayed_copy(so, uio,
3544 &free_list, &delayed_copy_len);
3545
3546 if (error) {
3547 goto release;
3548 }
3549 /*
3550 * We can only get here if MSG_PEEK is not
3551 * set; therefore, m should point at the
3552 * head of the rcv queue. If it doesn't,
3553 * it means something drastically
3554 * changed while we were out from behind
3555 * the lock in sodelayed_copy, perhaps
3556 * a RST on the stream. In any event,
3557 * the stream has been interrupted. It's
3558 * probably best just to return whatever
3559 * data we've moved and let the caller
3560 * sort it out...
3561 */
3562 if (m != so->so_rcv.sb_mb) {
3563 break;
3564 }
3565 }
3566 socket_unlock(so, 0);
3567 error = uiomove(mtod(m, caddr_t) + moff,
3568 (int)len, uio);
3569 socket_lock(so, 0);
3570
3571 if (error)
3572 goto release;
3573 }
3574 } else {
3575 uio_setresid(uio, (uio_resid(uio) - len));
3576 }
3577 if (len == m->m_len - moff) {
3578 if (m->m_flags & M_EOR)
3579 flags |= MSG_EOR;
3580 if (flags & MSG_PEEK) {
3581 m = m->m_next;
3582 moff = 0;
3583 } else {
3584 nextrecord = m->m_nextpkt;
3585 sbfree(&so->so_rcv, m);
3586 m->m_nextpkt = NULL;
3587
3588 /*
3589 * If this packet is an unordered packet
3590 * (indicated by M_UNORDERED_DATA flag), remove
3591 * the additional bytes added to the
3592 * receive socket buffer size.
3593 */
3594 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3595 m->m_len &&
3596 (m->m_flags & M_UNORDERED_DATA) &&
3597 sbreserve(&so->so_rcv,
3598 so->so_rcv.sb_hiwat - m->m_len)) {
3599 if (so->so_msg_state->msg_uno_bytes >
3600 m->m_len) {
3601 so->so_msg_state->
3602 msg_uno_bytes -= m->m_len;
3603 } else {
3604 so->so_msg_state->
3605 msg_uno_bytes = 0;
3606 }
3607 m->m_flags &= ~M_UNORDERED_DATA;
3608 }
3609
3610 if (mp != NULL) {
3611 *mp = m;
3612 mp = &m->m_next;
3613 so->so_rcv.sb_mb = m = m->m_next;
3614 *mp = NULL;
3615 } else {
3616 if (free_list == NULL)
3617 free_list = m;
3618 else
3619 ml->m_next = m;
3620 ml = m;
3621 so->so_rcv.sb_mb = m = m->m_next;
3622 ml->m_next = NULL;
3623 }
3624 if (m != NULL) {
3625 m->m_nextpkt = nextrecord;
3626 if (nextrecord == NULL)
3627 so->so_rcv.sb_lastrecord = m;
3628 } else {
3629 so->so_rcv.sb_mb = nextrecord;
3630 SB_EMPTY_FIXUP(&so->so_rcv);
3631 }
3632 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3633 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3634 }
3635 } else {
3636 if (flags & MSG_PEEK) {
3637 moff += len;
3638 } else {
3639 if (mp != NULL) {
3640 int copy_flag;
3641
3642 if (flags & MSG_DONTWAIT)
3643 copy_flag = M_DONTWAIT;
3644 else
3645 copy_flag = M_WAIT;
3646 *mp = m_copym(m, 0, len, copy_flag);
3647 /*
3648 * Failed to allocate an mbuf?
3649 * Adjust uio_resid back, it was
3650 * adjusted down by len bytes which
3651 * we didn't copy over.
3652 */
3653 if (*mp == NULL) {
3654 uio_setresid(uio,
3655 (uio_resid(uio) + len));
3656 break;
3657 }
3658 }
3659 m->m_data += len;
3660 m->m_len -= len;
3661 so->so_rcv.sb_cc -= len;
3662 }
3663 }
3664 if (so->so_oobmark) {
3665 if ((flags & MSG_PEEK) == 0) {
3666 so->so_oobmark -= len;
3667 if (so->so_oobmark == 0) {
3668 so->so_state |= SS_RCVATMARK;
3669 /*
3670 * delay posting the actual event until
3671 * after any delayed copy processing
3672 * has finished
3673 */
3674 need_event = 1;
3675 break;
3676 }
3677 } else {
3678 offset += len;
3679 if (offset == so->so_oobmark)
3680 break;
3681 }
3682 }
3683 if (flags & MSG_EOR)
3684 break;
3685 /*
3686 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3687 * (for non-atomic socket), we must not quit until
3688 * "uio->uio_resid == 0" or an error termination.
3689 * If a signal/timeout occurs, return with a short
3690 * count but without error. Keep sockbuf locked
3691 * against other readers.
3692 */
3693 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3694 (uio_resid(uio) - delayed_copy_len) > 0 &&
3695 !sosendallatonce(so) && !nextrecord) {
3696 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3697 #if CONTENT_FILTER
3698 && cfil_sock_data_pending(&so->so_rcv) == 0
3699 #endif /* CONTENT_FILTER */
3700 ))
3701 goto release;
3702
3703 /*
3704 * Depending on the protocol (e.g. TCP), the following
3705 * might cause the socket lock to be dropped and later
3706 * be reacquired, and more data could have arrived and
3707 * have been appended to the receive socket buffer by
3708 * the time it returns. Therefore, we sleep in
3709 * sbwait() below only if the socket buffer is
3710 * empty, in order to avoid a false sleep.
3711 */
3712 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3713 (((struct inpcb *)so->so_pcb)->inp_state !=
3714 INPCB_STATE_DEAD))
3715 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3716
3717 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3718 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3719
3720 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3721 error = 0;
3722 goto release;
3723 }
3724 /*
3725 * We have to wait until after we get back from the sbwait
3726 * to do the copy, because we will drop the lock if we
3727 * have enough data that has been delayed... By dropping
3728 * the lock we open up a window allowing the netisr
3729 * thread to process the incoming packets and to change
3730 * the state of this socket... We're issuing the sbwait
3731 * because the socket is empty and we're expecting the
3732 * netisr thread to wake us up when more packets arrive;
3733 * if we allow that processing to happen and then sbwait,
3734 * we could stall forever with packets sitting in the
3735 * socket if no further packets arrive from the remote
3736 * side.
3737 *
3738 * We want to copy before we've collected all the data
3739 * to satisfy this request, to allow the copy to overlap
3740 * the incoming packet processing on an MP system.
3741 */
3742 if (delayed_copy_len > sorecvmincopy &&
3743 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3744 error = sodelayed_copy(so, uio,
3745 &free_list, &delayed_copy_len);
3746
3747 if (error)
3748 goto release;
3749 }
3750 m = so->so_rcv.sb_mb;
3751 if (m != NULL) {
3752 nextrecord = m->m_nextpkt;
3753 }
3754 SB_MB_CHECK(&so->so_rcv);
3755 }
3756 }
3757 #ifdef MORE_LOCKING_DEBUG
3758 if (so->so_usecount <= 1) {
3759 panic("%s: after big while so=%p ref=%d on socket\n",
3760 __func__, so, so->so_usecount);
3761 /* NOTREACHED */
3762 }
3763 #endif
3764
3765 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3766 if (so->so_options & SO_DONTTRUNC) {
3767 flags |= MSG_RCVMORE;
3768 } else {
3769 flags |= MSG_TRUNC;
3770 if ((flags & MSG_PEEK) == 0)
3771 (void) sbdroprecord(&so->so_rcv);
3772 }
3773 }
3774
3775 /*
3776 * pru_rcvd below (for TCP) may cause more data to be received
3777 * if the socket lock is dropped prior to sending the ACK; some
3778 * legacy OpenTransport applications don't handle this well
3779 * (if it receives less data than requested while MSG_HAVEMORE
3780 * is set), and so we set the flag now based on what we know
3781 * prior to calling pru_rcvd.
3782 */
3783 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3784 flags |= MSG_HAVEMORE;
3785
3786 if ((flags & MSG_PEEK) == 0) {
3787 if (m == NULL) {
3788 so->so_rcv.sb_mb = nextrecord;
3789 /*
3790 * First part is an inline SB_EMPTY_FIXUP(). Second
3791 * part makes sure sb_lastrecord is up-to-date if
3792 * there is still data in the socket buffer.
3793 */
3794 if (so->so_rcv.sb_mb == NULL) {
3795 so->so_rcv.sb_mbtail = NULL;
3796 so->so_rcv.sb_lastrecord = NULL;
3797 } else if (nextrecord->m_nextpkt == NULL) {
3798 so->so_rcv.sb_lastrecord = nextrecord;
3799 }
3800 SB_MB_CHECK(&so->so_rcv);
3801 }
3802 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3803 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3804 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3805 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3806 }
3807
3808 if (delayed_copy_len) {
3809 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3810 if (error)
3811 goto release;
3812 }
3813 if (free_list != NULL) {
3814 m_freem_list(free_list);
3815 free_list = NULL;
3816 }
3817 if (need_event)
3818 postevent(so, 0, EV_OOB);
3819
3820 if (orig_resid == uio_resid(uio) && orig_resid &&
3821 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3822 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3823 goto restart;
3824 }
3825
3826 if (flagsp != NULL)
3827 *flagsp |= flags;
3828 release:
3829 #ifdef MORE_LOCKING_DEBUG
3830 if (so->so_usecount <= 1) {
3831 panic("%s: release so=%p ref=%d on socket\n", __func__,
3832 so, so->so_usecount);
3833 /* NOTREACHED */
3834 }
3835 #endif
3836 if (delayed_copy_len)
3837 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3838
3839 if (free_list != NULL)
3840 m_freem_list(free_list);
3841
3842 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3843
3844 if (en_tracing) {
3845 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3846 VM_KERNEL_ADDRPERM(so),
3847 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3848 (int64_t)(orig_resid - uio_resid(uio)));
3849 }
3850 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3851 so->so_rcv.sb_cc, 0, error);
3852
3853 return (error);
3854 }
3855
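/*
 * Illustrative sketch of the delayed-copy heuristic used by soreceive()
 * above: mbufs are parked on free_list and only copied out (via
 * sodelayed_copy()) once the amount of delayed data exceeds both the
 * sorecvmincopy tunable and half of the receive buffer's high-water
 * mark, so the uiomove() can overlap further packet processing.  The
 * helper name below is hypothetical and exists only for exposition.
 */
#if 0	/* illustrative only; not compiled */
static __inline int
soreceive_should_flush_delayed(user_ssize_t delayed_copy_len,
    u_int32_t mincopy, u_int32_t hiwat)
{
	/* Flush once the delayed data is both "large" and buffer-filling */
	return (delayed_copy_len > mincopy &&
	    delayed_copy_len > (hiwat / 2));
}
#endif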
3856 /*
3857 * Returns: 0 Success
3858 * uiomove:EFAULT
3859 */
3860 static int
3861 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3862 user_ssize_t *resid)
3863 {
3864 int error = 0;
3865 struct mbuf *m;
3866
3867 m = *free_list;
3868
3869 socket_unlock(so, 0);
3870
3871 while (m != NULL && error == 0) {
3872 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3873 m = m->m_next;
3874 }
3875 m_freem_list(*free_list);
3876
3877 *free_list = NULL;
3878 *resid = 0;
3879
3880 socket_lock(so, 0);
3881
3882 return (error);
3883 }
3884
3885 static int
3886 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3887 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3888 {
3889 #pragma unused(so)
3890 int error = 0;
3891 struct mbuf *ml, *m;
3892 int i = 0;
3893 struct uio *auio;
3894
3895 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3896 ml = ml->m_nextpkt, i++) {
3897 auio = msgarray[i].uio;
3898 for (m = ml; m != NULL; m = m->m_next) {
3899 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3900 if (error != 0)
3901 goto out;
3902 }
3903 }
3904 out:
3905 m_freem_list(*free_list);
3906
3907 *free_list = NULL;
3908 *resid = 0;
3909
3910 return (error);
3911 }
3912
3913 int
3914 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3915 int *flagsp)
3916 {
3917 struct mbuf *m;
3918 struct mbuf *nextrecord;
3919 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3920 int error;
3921 user_ssize_t len, pktlen, delayed_copy_len = 0;
3922 struct protosw *pr = so->so_proto;
3923 user_ssize_t resid;
3924 struct proc *p = current_proc();
3925 struct uio *auio = NULL;
3926 int npkts = 0;
3927 int sblocked = 0;
3928 struct sockaddr **psa = NULL;
3929 struct mbuf **controlp = NULL;
3930 int can_delay;
3931 int flags;
3932 struct mbuf *free_others = NULL;
3933
3934 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3935 so, uiocnt,
3936 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3937
3938 /*
3939 * Sanity checks:
3940 * - Only the "don't wait" style flags are supported
3941 * - Only datagram sockets are supported (could be extended to raw)
3942 * - The receive must be atomic
3943 * - The protocol must support packet chains
3944 * - The uio array must not be NULL (should we panic instead?)
3945 */
3946 if (flagsp != NULL)
3947 flags = *flagsp;
3948 else
3949 flags = 0;
3950 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3951 MSG_NBIO)) {
3952 printf("%s invalid flags 0x%x\n", __func__, flags);
3953 error = EINVAL;
3954 goto out;
3955 }
3956 if (so->so_type != SOCK_DGRAM) {
3957 error = EINVAL;
3958 goto out;
3959 }
3960 if (sosendallatonce(so) == 0) {
3961 error = EINVAL;
3962 goto out;
3963 }
3964 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3965 error = EPROTONOSUPPORT;
3966 goto out;
3967 }
3968 if (msgarray == NULL) {
3969 printf("%s uioarray is NULL\n", __func__);
3970 error = EINVAL;
3971 goto out;
3972 }
3973 if (uiocnt == 0) {
3974 printf("%s uiocnt is 0\n", __func__);
3975 error = EINVAL;
3976 goto out;
3977 }
3978 /*
3979 * Sanity check on the length passed by caller as we are making 'int'
3980 * comparisons
3981 */
3982 resid = recv_msg_array_resid(msgarray, uiocnt);
3983 if (resid < 0 || resid > INT_MAX) {
3984 error = EINVAL;
3985 goto out;
3986 }
3987
3988 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3989 can_delay = 1;
3990 else
3991 can_delay = 0;
3992
3993 socket_lock(so, 1);
3994 so_update_last_owner_locked(so, p);
3995 so_update_policy(so);
3996
3997 #if NECP
3998 so_update_necp_policy(so, NULL, NULL);
3999 #endif /* NECP */
4000
4001 /*
4002 * If a recv attempt is made on a previously-accepted socket
4003 * that has been marked as inactive (disconnected), reject
4004 * the request.
4005 */
4006 if (so->so_flags & SOF_DEFUNCT) {
4007 struct sockbuf *sb = &so->so_rcv;
4008
4009 error = ENOTCONN;
4010 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4011 __func__, proc_pid(p), proc_best_name(p),
4012 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4013 SOCK_DOM(so), SOCK_TYPE(so), error);
4014 /*
4015 * This socket should have been disconnected and flushed
4016 * prior to being returned from sodefunct(); there should
4017 * be no data on its receive list, so panic otherwise.
4018 */
4019 if (so->so_state & SS_DEFUNCT)
4020 sb_empty_assert(sb, __func__);
4021 goto release;
4022 }
4023
4024 next:
4025 /*
4026 * Stop once every element of the uio array has been consumed
4027 */
4028 if (npkts >= uiocnt) {
4029 error = 0;
4030 goto release;
4031 }
4032 restart:
4033 /*
4034 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4035 * and if so just return to the caller. This could happen when
4036 * soreceive() is called by a socket upcall function during the
4037 * time the socket is freed. The socket buffer would have been
4038 * locked across the upcall, therefore we cannot put this thread
4039 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4040 * we may livelock), because the lock on the socket buffer will
4041 * only be released when the upcall routine returns to its caller.
4042 * Because the socket has been officially closed, there can be
4043 * no further read on it.
4044 */
4045 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4046 (SS_NOFDREF | SS_CANTRCVMORE)) {
4047 error = 0;
4048 goto release;
4049 }
4050
4051 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4052 if (error) {
4053 goto release;
4054 }
4055 sblocked = 1;
4056
4057 m = so->so_rcv.sb_mb;
4058 /*
4059 * Block awaiting more datagrams if needed
4060 */
4061 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4062 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4063 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4064 /*
4065 * Panic if we notice inconsistencies in the socket's
4066 * receive list; both sb_mb and sb_cc should correctly
4067 * reflect the contents of the list, otherwise we may
4068 * end up with false positives during select() or poll()
4069 * which could put the application in a bad state.
4070 */
4071 SB_MB_CHECK(&so->so_rcv);
4072
4073 if (so->so_error) {
4074 error = so->so_error;
4075 if ((flags & MSG_PEEK) == 0)
4076 so->so_error = 0;
4077 goto release;
4078 }
4079 if (so->so_state & SS_CANTRCVMORE) {
4080 goto release;
4081 }
4082 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4083 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4084 error = ENOTCONN;
4085 goto release;
4086 }
4087 if ((so->so_state & SS_NBIO) ||
4088 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4089 error = EWOULDBLOCK;
4090 goto release;
4091 }
4092 /*
4093 * Do not block if we got some data
4094 */
4095 if (free_list != NULL) {
4096 error = 0;
4097 goto release;
4098 }
4099
4100 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4101 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4102
4103 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4104 sblocked = 0;
4105
4106 error = sbwait(&so->so_rcv);
4107 if (error) {
4108 goto release;
4109 }
4110 goto restart;
4111 }
4112
4113 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4114 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4115 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4116
4117 /*
4118 * Consume the current uio index as we have a datagram
4119 */
4120 auio = msgarray[npkts].uio;
4121 resid = uio_resid(auio);
4122 msgarray[npkts].which |= SOCK_MSG_DATA;
4123 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4124 &msgarray[npkts].psa : NULL;
4125 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4126 &msgarray[npkts].controlp : NULL;
4127 npkts += 1;
4128 nextrecord = m->m_nextpkt;
4129
4130 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4131 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4132 if (error == ERESTART)
4133 goto restart;
4134 else if (error != 0)
4135 goto release;
4136 }
4137
4138 if (m != NULL && m->m_type == MT_CONTROL) {
4139 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4140 if (error != 0)
4141 goto release;
4142 }
4143
4144 if (m->m_pkthdr.len == 0) {
4145 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4146 __func__, __LINE__,
4147 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4148 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4149 m->m_type);
4150 }
4151
4152 /*
4153 * Loop to copy the mbufs of the current record
4154 * Support zero length packets
4155 */
4156 ml = NULL;
4157 pktlen = 0;
4158 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4159 if (m->m_len == 0)
4160 panic("%p m_len zero", m);
4161 if (m->m_type == 0)
4162 panic("%p m_type zero", m);
4163 /*
4164 * Clip to the residual length
4165 */
4166 if (len > m->m_len)
4167 len = m->m_len;
4168 pktlen += len;
4169 /*
4170 * Copy the mbufs out via the uio, or delay the copy.
4171 * The sockbuf must be consistent here (sb_mb points to the
4172 * current mbuf, nextrecord to the next record) when we drop
4173 * the socket lock; we must note any additions to the sockbuf
4174 * when we reacquire the lock.
4175 */
4176 if (len > 0 && can_delay == 0) {
4177 socket_unlock(so, 0);
4178 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4179 socket_lock(so, 0);
4180 if (error)
4181 goto release;
4182 } else {
4183 delayed_copy_len += len;
4184 }
4185
4186 if (len == m->m_len) {
4187 /*
4188 * m was entirely copied
4189 */
4190 sbfree(&so->so_rcv, m);
4191 nextrecord = m->m_nextpkt;
4192 m->m_nextpkt = NULL;
4193
4194 /*
4195 * Set the first packet to the head of the free list
4196 */
4197 if (free_list == NULL)
4198 free_list = m;
4199 /*
4200 * Link current packet to tail of free list
4201 */
4202 if (ml == NULL) {
4203 if (free_tail != NULL)
4204 free_tail->m_nextpkt = m;
4205 free_tail = m;
4206 }
4207 /*
4208 * Link current mbuf to last mbuf of current packet
4209 */
4210 if (ml != NULL)
4211 ml->m_next = m;
4212 ml = m;
4213
4214 /*
4215 * Move next buf to head of socket buffer
4216 */
4217 so->so_rcv.sb_mb = m = ml->m_next;
4218 ml->m_next = NULL;
4219
4220 if (m != NULL) {
4221 m->m_nextpkt = nextrecord;
4222 if (nextrecord == NULL)
4223 so->so_rcv.sb_lastrecord = m;
4224 } else {
4225 so->so_rcv.sb_mb = nextrecord;
4226 SB_EMPTY_FIXUP(&so->so_rcv);
4227 }
4228 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4229 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4230 } else {
4231 /*
4232 * Stop the loop on partial copy
4233 */
4234 break;
4235 }
4236 }
4237 #ifdef MORE_LOCKING_DEBUG
4238 if (so->so_usecount <= 1) {
4239 panic("%s: after big while so=%llx ref=%d on socket\n",
4240 __func__,
4241 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4242 /* NOTREACHED */
4243 }
4244 #endif
4245 /*
4246 * Tell the caller we made a partial copy
4247 */
4248 if (m != NULL) {
4249 if (so->so_options & SO_DONTTRUNC) {
4250 /*
4251 * Copyout first the freelist then the partial mbuf
4252 */
4253 socket_unlock(so, 0);
4254 if (delayed_copy_len)
4255 error = sodelayed_copy_list(so, msgarray,
4256 uiocnt, &free_list, &delayed_copy_len);
4257
4258 if (error == 0) {
4259 error = uiomove(mtod(m, caddr_t), (int)len,
4260 auio);
4261 }
4262 socket_lock(so, 0);
4263 if (error)
4264 goto release;
4265
4266 m->m_data += len;
4267 m->m_len -= len;
4268 so->so_rcv.sb_cc -= len;
4269 flags |= MSG_RCVMORE;
4270 } else {
4271 (void) sbdroprecord(&so->so_rcv);
4272 nextrecord = so->so_rcv.sb_mb;
4273 m = NULL;
4274 flags |= MSG_TRUNC;
4275 }
4276 }
4277
4278 if (m == NULL) {
4279 so->so_rcv.sb_mb = nextrecord;
4280 /*
4281 * First part is an inline SB_EMPTY_FIXUP(). Second
4282 * part makes sure sb_lastrecord is up-to-date if
4283 * there is still data in the socket buffer.
4284 */
4285 if (so->so_rcv.sb_mb == NULL) {
4286 so->so_rcv.sb_mbtail = NULL;
4287 so->so_rcv.sb_lastrecord = NULL;
4288 } else if (nextrecord->m_nextpkt == NULL) {
4289 so->so_rcv.sb_lastrecord = nextrecord;
4290 }
4291 SB_MB_CHECK(&so->so_rcv);
4292 }
4293 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4294 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4295
4296 /*
4297 * We can continue to the next packet as long as:
4298 * - We haven't exhausted the uio array
4299 * - There was no error
4300 * - A packet was not truncated
4301 * - We can still receive more data
4302 */
4303 if (npkts < uiocnt && error == 0 &&
4304 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4305 (so->so_state & SS_CANTRCVMORE) == 0) {
4306 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4307 sblocked = 0;
4308
4309 goto next;
4310 }
4311 if (flagsp != NULL)
4312 *flagsp |= flags;
4313
4314 release:
4315 /*
4316 * pru_rcvd may cause more data to be received if the socket lock
4317 * is dropped so we set MSG_HAVEMORE now based on what we know.
4318 * That way the caller won't be surprised if it receives less data
4319 * than requested.
4320 */
4321 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4322 flags |= MSG_HAVEMORE;
4323
4324 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4325 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4326
4327 if (sblocked)
4328 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4329 else
4330 socket_unlock(so, 1);
4331
4332 if (delayed_copy_len)
4333 error = sodelayed_copy_list(so, msgarray, uiocnt,
4334 &free_list, &delayed_copy_len);
4335 out:
4336 /*
4337 * Amortize the cost of freeing the mbufs
4338 */
4339 if (free_list != NULL)
4340 m_freem_list(free_list);
4341 if (free_others != NULL)
4342 m_freem_list(free_others);
4343
4344 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4345 0, 0, 0, 0);
4346 return (error);
4347 }
4348
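/*
 * Illustrative sketch, assuming a caller such as the recvmsg_x(2)
 * syscall path, of how soreceive_list() above might be driven for a
 * datagram socket: one struct recv_msg_elem per expected packet, each
 * carrying its own uio, with SOCK_MSG_SA and SOCK_MSG_CONTROL asking
 * for the source address and control mbufs.  Array allocation and uio
 * construction are elided and the function name is hypothetical.
 */
#if 0	/* illustrative only; not compiled */
static int
example_receive_batch(struct socket *so, struct recv_msg_elem *elems,
    u_int cnt)
{
	int flags = MSG_DONTWAIT;	/* never sleep once a packet is queued */
	u_int i;

	for (i = 0; i < cnt; i++) {
		/* elems[i].uio is assumed to have been set up already */
		elems[i].which = SOCK_MSG_SA | SOCK_MSG_CONTROL;
	}

	/* On return, elems[i].psa and elems[i].controlp hold the results */
	return (soreceive_list(so, elems, cnt, &flags));
}
#endif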
4349 /*
4350 * Returns: 0 Success
4351 * EINVAL
4352 * ENOTCONN
4353 * <pru_shutdown>:EINVAL
4354 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4355 * <pru_shutdown>:ENOBUFS[TCP]
4356 * <pru_shutdown>:EMSGSIZE[TCP]
4357 * <pru_shutdown>:EHOSTUNREACH[TCP]
4358 * <pru_shutdown>:ENETUNREACH[TCP]
4359 * <pru_shutdown>:ENETDOWN[TCP]
4360 * <pru_shutdown>:ENOMEM[TCP]
4361 * <pru_shutdown>:EACCES[TCP]
4362 * <pru_shutdown>:EMSGSIZE[TCP]
4363 * <pru_shutdown>:ENOBUFS[TCP]
4364 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4365 * <pru_shutdown>:??? [other protocol families]
4366 */
4367 int
4368 soshutdown(struct socket *so, int how)
4369 {
4370 int error;
4371
4372 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4373
4374 switch (how) {
4375 case SHUT_RD:
4376 case SHUT_WR:
4377 case SHUT_RDWR:
4378 socket_lock(so, 1);
4379 if ((so->so_state &
4380 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4381 error = ENOTCONN;
4382 } else {
4383 error = soshutdownlock(so, how);
4384 }
4385 socket_unlock(so, 1);
4386 break;
4387 default:
4388 error = EINVAL;
4389 break;
4390 }
4391
4392 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4393
4394 return (error);
4395 }
4396
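/*
 * From user space, soshutdown() above is reached through shutdown(2).
 * A minimal sketch of the common half-close pattern on a connected
 * stream socket: SHUT_WR pushes a FIN via pru_shutdown while the read
 * side keeps draining until the peer closes; an unconnected socket
 * gets ENOTCONN, matching the state check above.  "fd" is assumed to
 * be a connected stream socket and the helper name is hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <sys/socket.h>
#include <unistd.h>
#include <stdio.h>

static int
half_close_and_drain(int fd)
{
	char buf[4096];
	ssize_t n;

	if (shutdown(fd, SHUT_WR) == -1) {
		perror("shutdown");	/* e.g. ENOTCONN */
		return (-1);
	}
	/* Keep reading until the peer closes its side (read() returns 0) */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;
	return (n == 0 ? 0 : -1);
}
#endif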
4397 int
4398 soshutdownlock_final(struct socket *so, int how)
4399 {
4400 struct protosw *pr = so->so_proto;
4401 int error = 0;
4402
4403 sflt_notify(so, sock_evt_shutdown, &how);
4404
4405 if (how != SHUT_WR) {
4406 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4407 /* read already shut down */
4408 error = ENOTCONN;
4409 goto done;
4410 }
4411 sorflush(so);
4412 postevent(so, 0, EV_RCLOSED);
4413 }
4414 if (how != SHUT_RD) {
4415 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4416 /* write already shut down */
4417 error = ENOTCONN;
4418 goto done;
4419 }
4420 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4421 postevent(so, 0, EV_WCLOSED);
4422 }
4423 done:
4424 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4425 return (error);
4426 }
4427
4428 int
4429 soshutdownlock(struct socket *so, int how)
4430 {
4431 int error = 0;
4432
4433 #if CONTENT_FILTER
4434 /*
4435 * A content filter may delay the actual shutdown until it
4436 * has processed the pending data
4437 */
4438 if (so->so_flags & SOF_CONTENT_FILTER) {
4439 error = cfil_sock_shutdown(so, &how);
4440 if (error == EJUSTRETURN) {
4441 error = 0;
4442 goto done;
4443 } else if (error != 0) {
4444 goto done;
4445 }
4446 }
4447 #endif /* CONTENT_FILTER */
4448
4449 error = soshutdownlock_final(so, how);
4450
4451 done:
4452 return (error);
4453 }
4454
4455 void
4456 sowflush(struct socket *so)
4457 {
4458 struct sockbuf *sb = &so->so_snd;
4459
4460 /*
4461 * Obtain lock on the socket buffer (SB_LOCK). This is required
4462 * to prevent the socket buffer from being unexpectedly altered
4463 * while it is used by another thread in socket send/receive.
4464 *
4465 * sblock() must not fail here, hence the assertion.
4466 */
4467 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4468 VERIFY(sb->sb_flags & SB_LOCK);
4469
4470 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4471 sb->sb_flags |= SB_DROP;
4472 sb->sb_upcall = NULL;
4473 sb->sb_upcallarg = NULL;
4474
4475 sbunlock(sb, TRUE); /* keep socket locked */
4476
4477 selthreadclear(&sb->sb_sel);
4478 sbrelease(sb);
4479 }
4480
4481 void
4482 sorflush(struct socket *so)
4483 {
4484 struct sockbuf *sb = &so->so_rcv;
4485 struct protosw *pr = so->so_proto;
4486 struct sockbuf asb;
4487 #ifdef notyet
4488 lck_mtx_t *mutex_held;
4489 /*
4490 * XXX: This code is currently commented out, because we may get here
4491 * as part of sofreelastref(), and at that time, pr_getlock() may no
4492 * longer be able to return us the lock; this will be fixed in future.
4493 */
4494 if (so->so_proto->pr_getlock != NULL)
4495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4496 else
4497 mutex_held = so->so_proto->pr_domain->dom_mtx;
4498
4499 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4500 #endif /* notyet */
4501
4502 sflt_notify(so, sock_evt_flush_read, NULL);
4503
4504 socantrcvmore(so);
4505
4506 /*
4507 * Obtain lock on the socket buffer (SB_LOCK). This is required
4508 * to prevent the socket buffer from being unexpectedly altered
4509 * while it is used by another thread in socket send/receive.
4510 *
4511 * sblock() must not fail here, hence the assertion.
4512 */
4513 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4514 VERIFY(sb->sb_flags & SB_LOCK);
4515
4516 /*
4517 * Copy only the relevant fields from "sb" to "asb" which we
4518 * need for sbrelease() to function. In particular, skip
4519 * sb_sel as it contains the wait queue linkage, which would
4520 * wreak havoc if we were to issue selthreadclear() on "asb".
4521 * Make sure to not carry over SB_LOCK in "asb", as we need
4522 * to acquire it later as part of sbrelease().
4523 */
4524 bzero(&asb, sizeof (asb));
4525 asb.sb_cc = sb->sb_cc;
4526 asb.sb_hiwat = sb->sb_hiwat;
4527 asb.sb_mbcnt = sb->sb_mbcnt;
4528 asb.sb_mbmax = sb->sb_mbmax;
4529 asb.sb_ctl = sb->sb_ctl;
4530 asb.sb_lowat = sb->sb_lowat;
4531 asb.sb_mb = sb->sb_mb;
4532 asb.sb_mbtail = sb->sb_mbtail;
4533 asb.sb_lastrecord = sb->sb_lastrecord;
4534 asb.sb_so = sb->sb_so;
4535 asb.sb_flags = sb->sb_flags;
4536 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4537 asb.sb_flags |= SB_DROP;
4538
4539 /*
4540 * Ideally we'd bzero() these and preserve the ones we need;
4541 * but to do that we'd need to shuffle things around in the
4542 * sockbuf, and we can't do it now because there are KEXTS
4543 * that are directly referring to the socket structure.
4544 *
4545 * Setting SB_DROP acts as a barrier to prevent further appends.
4546 * Clearing SB_SEL is done for selthreadclear() below.
4547 */
4548 sb->sb_cc = 0;
4549 sb->sb_hiwat = 0;
4550 sb->sb_mbcnt = 0;
4551 sb->sb_mbmax = 0;
4552 sb->sb_ctl = 0;
4553 sb->sb_lowat = 0;
4554 sb->sb_mb = NULL;
4555 sb->sb_mbtail = NULL;
4556 sb->sb_lastrecord = NULL;
4557 sb->sb_timeo.tv_sec = 0;
4558 sb->sb_timeo.tv_usec = 0;
4559 sb->sb_upcall = NULL;
4560 sb->sb_upcallarg = NULL;
4561 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4562 sb->sb_flags |= SB_DROP;
4563
4564 sbunlock(sb, TRUE); /* keep socket locked */
4565
4566 /*
4567 * Note that selthreadclear() is called on the original "sb" and
4568 * not the local "asb" because of the way wait queue linkage is
4569 * implemented. Given that selwakeup() may be triggered, SB_SEL
4570 * should no longer be set (cleared above.)
4571 */
4572 selthreadclear(&sb->sb_sel);
4573
4574 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4575 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4576
4577 sbrelease(&asb);
4578 }
4579
4580 /*
4581 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4582 * an additional variant to handle the case where the option value needs
4583 * to be some kind of integer, but not a specific size.
4584 * In addition to their use here, these functions are also called by the
4585 * protocol-level pr_ctloutput() routines.
4586 *
4587 * Returns: 0 Success
4588 * EINVAL
4589 * copyin:EFAULT
4590 */
4591 int
4592 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4593 {
4594 size_t valsize;
4595
4596 /*
4597 * If the user gives us more than we wanted, we ignore it,
4598 * but if we don't get the minimum length the caller
4599 * wants, we return EINVAL. On success, sopt->sopt_valsize
4600 * is set to however much we actually retrieved.
4601 */
4602 if ((valsize = sopt->sopt_valsize) < minlen)
4603 return (EINVAL);
4604 if (valsize > len)
4605 sopt->sopt_valsize = valsize = len;
4606
4607 if (sopt->sopt_p != kernproc)
4608 return (copyin(sopt->sopt_val, buf, valsize));
4609
4610 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4611 return (0);
4612 }
4613
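/*
 * Illustrative sketch of the way this file and protocol pr_ctloutput()
 * handlers typically consume sooptcopyin(): pass sizeof(int) as both
 * the buffer length and the minimum, so a short option value fails
 * with EINVAL while a longer one is silently truncated to the int.
 * This mirrors the SO_NOSIGPIPE handling in sosetoptlock() below;
 * only the function name is hypothetical.
 */
#if 0	/* illustrative only; not compiled */
static int
example_set_int_option(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof (optval),
	    sizeof (optval));
	if (error != 0)
		return (error);

	if (optval != 0)
		so->so_flags |= SOF_NOSIGPIPE;
	else
		so->so_flags &= ~SOF_NOSIGPIPE;
	return (0);
}
#endif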
4614 /*
4615 * sooptcopyin_timeval
4616 * Copy a timeval value into tv_p, taking into account whether the
4617 * calling process is 64-bit or 32-bit. Moved the sanity checking
4618 * code here so that we can verify the 64-bit tv_sec value before we lose
4619 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4620 */
4621 static int
4622 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4623 {
4624 int error;
4625
4626 if (proc_is64bit(sopt->sopt_p)) {
4627 struct user64_timeval tv64;
4628
4629 if (sopt->sopt_valsize < sizeof (tv64))
4630 return (EINVAL);
4631
4632 sopt->sopt_valsize = sizeof (tv64);
4633 if (sopt->sopt_p != kernproc) {
4634 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4635 if (error != 0)
4636 return (error);
4637 } else {
4638 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4639 sizeof (tv64));
4640 }
4641 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4642 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4643 return (EDOM);
4644
4645 tv_p->tv_sec = tv64.tv_sec;
4646 tv_p->tv_usec = tv64.tv_usec;
4647 } else {
4648 struct user32_timeval tv32;
4649
4650 if (sopt->sopt_valsize < sizeof (tv32))
4651 return (EINVAL);
4652
4653 sopt->sopt_valsize = sizeof (tv32);
4654 if (sopt->sopt_p != kernproc) {
4655 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4656 if (error != 0) {
4657 return (error);
4658 }
4659 } else {
4660 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4661 sizeof (tv32));
4662 }
4663 #ifndef __LP64__
4664 /*
4665 * K64todo "comparison is always false due to
4666 * limited range of data type"
4667 */
4668 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4669 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4670 return (EDOM);
4671 #endif
4672 tv_p->tv_sec = tv32.tv_sec;
4673 tv_p->tv_usec = tv32.tv_usec;
4674 }
4675 return (0);
4676 }
4677
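/*
 * From user space, sooptcopyin_timeval() above is exercised by
 * setsockopt(2) with SO_RCVTIMEO or SO_SNDTIMEO.  A minimal sketch:
 * tv_sec must be non-negative and tv_usec in [0, 1000000), otherwise
 * the kernel returns EDOM as checked above.  "fd" is assumed to be an
 * open socket and the helper name is hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static int
set_recv_timeout(int fd, time_t sec, suseconds_t usec)
{
	struct timeval tv = { .tv_sec = sec, .tv_usec = usec };

	/* A negative field or usec >= 1000000 is rejected with EDOM */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1) {
		perror("setsockopt(SO_RCVTIMEO)");
		return (-1);
	}
	return (0);
}
#endif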
4678 int
4679 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
4680 {
4681 kauth_cred_t cred = NULL;
4682 proc_t ep = PROC_NULL;
4683 uid_t uid;
4684 int error = 0;
4685
4686 if (so->so_flags & SOF_DELEGATED) {
4687 ep = proc_find(so->e_pid);
4688 if (ep)
4689 cred = kauth_cred_proc_ref(ep);
4690 }
4691
4692 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4693
4694 /* uid is 0 for root */
4695 if (uid != 0 || !allow_root)
4696 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4697 if (cred)
4698 kauth_cred_unref(&cred);
4699 if (ep != PROC_NULL)
4700 proc_rele(ep);
4701
4702 return (error);
4703 }
4704
4705 /*
4706 * Returns: 0 Success
4707 * EINVAL
4708 * ENOPROTOOPT
4709 * ENOBUFS
4710 * EDOM
4711 * sooptcopyin:EINVAL
4712 * sooptcopyin:EFAULT
4713 * sooptcopyin_timeval:EINVAL
4714 * sooptcopyin_timeval:EFAULT
4715 * sooptcopyin_timeval:EDOM
4716 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4717 * <pr_ctloutput>:???
4718 * sflt_attach_private:??? [whatever a filter author chooses]
4719 * <sf_setoption>:??? [whatever a filter author chooses]
4720 *
4721 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4722 * <sf_setoption> returns depend on what the filter author causes
4723 * their filter to return.
4724 */
4725 int
4726 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4727 {
4728 int error, optval;
4729 struct linger l;
4730 struct timeval tv;
4731 #if CONFIG_MACF_SOCKET
4732 struct mac extmac;
4733 #endif /* MAC_SOCKET */
4734
4735 if (sopt->sopt_dir != SOPT_SET)
4736 sopt->sopt_dir = SOPT_SET;
4737
4738 if (dolock)
4739 socket_lock(so, 1);
4740
4741 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4742 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4743 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4744 /* the socket has been shutdown, no more sockopt's */
4745 error = EINVAL;
4746 goto out;
4747 }
4748
4749 error = sflt_setsockopt(so, sopt);
4750 if (error != 0) {
4751 if (error == EJUSTRETURN)
4752 error = 0;
4753 goto out;
4754 }
4755
4756 if (sopt->sopt_level != SOL_SOCKET) {
4757 if (so->so_proto != NULL &&
4758 so->so_proto->pr_ctloutput != NULL) {
4759 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4760 goto out;
4761 }
4762 error = ENOPROTOOPT;
4763 } else {
4764 /*
4765 * Allow socket-level (SOL_SOCKET) options to be filtered by
4766 * the protocol layer, if needed. A zero value returned from
4767 * the handler means use default socket-level processing as
4768 * done by the rest of this routine. Otherwise, any other
4769 * return value indicates that the option is unsupported.
4770 */
4771 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4772 pru_socheckopt(so, sopt)) != 0)
4773 goto out;
4774
4775 error = 0;
4776 switch (sopt->sopt_name) {
4777 case SO_LINGER:
4778 case SO_LINGER_SEC:
4779 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4780 if (error != 0)
4781 goto out;
4782
4783 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4784 l.l_linger : l.l_linger * hz;
4785 if (l.l_onoff != 0)
4786 so->so_options |= SO_LINGER;
4787 else
4788 so->so_options &= ~SO_LINGER;
4789 break;
4790
4791 case SO_DEBUG:
4792 case SO_KEEPALIVE:
4793 case SO_DONTROUTE:
4794 case SO_USELOOPBACK:
4795 case SO_BROADCAST:
4796 case SO_REUSEADDR:
4797 case SO_REUSEPORT:
4798 case SO_OOBINLINE:
4799 case SO_TIMESTAMP:
4800 case SO_TIMESTAMP_MONOTONIC:
4801 case SO_DONTTRUNC:
4802 case SO_WANTMORE:
4803 case SO_WANTOOBFLAG:
4804 case SO_NOWAKEFROMSLEEP:
4805 case SO_NOAPNFALLBK:
4806 error = sooptcopyin(sopt, &optval, sizeof (optval),
4807 sizeof (optval));
4808 if (error != 0)
4809 goto out;
4810 if (optval)
4811 so->so_options |= sopt->sopt_name;
4812 else
4813 so->so_options &= ~sopt->sopt_name;
4814 break;
4815
4816 case SO_SNDBUF:
4817 case SO_RCVBUF:
4818 case SO_SNDLOWAT:
4819 case SO_RCVLOWAT:
4820 error = sooptcopyin(sopt, &optval, sizeof (optval),
4821 sizeof (optval));
4822 if (error != 0)
4823 goto out;
4824
4825 /*
4826 * Values < 1 make no sense for any of these
4827 * options, so disallow them.
4828 */
4829 if (optval < 1) {
4830 error = EINVAL;
4831 goto out;
4832 }
4833
4834 switch (sopt->sopt_name) {
4835 case SO_SNDBUF:
4836 case SO_RCVBUF: {
4837 struct sockbuf *sb =
4838 (sopt->sopt_name == SO_SNDBUF) ?
4839 &so->so_snd : &so->so_rcv;
4840 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4841 error = ENOBUFS;
4842 goto out;
4843 }
4844 sb->sb_flags |= SB_USRSIZE;
4845 sb->sb_flags &= ~SB_AUTOSIZE;
4846 sb->sb_idealsize = (u_int32_t)optval;
4847 break;
4848 }
4849 /*
4850 * Make sure the low-water is never greater than
4851 * the high-water.
4852 */
4853 case SO_SNDLOWAT: {
4854 int space = sbspace(&so->so_snd);
4855 u_int32_t hiwat = so->so_snd.sb_hiwat;
4856
4857 if (so->so_snd.sb_flags & SB_UNIX) {
4858 struct unpcb *unp =
4859 (struct unpcb *)(so->so_pcb);
4860 if (unp != NULL &&
4861 unp->unp_conn != NULL) {
4862 hiwat += unp->unp_conn->unp_cc;
4863 }
4864 }
4865
4866 so->so_snd.sb_lowat =
4867 (optval > hiwat) ?
4868 hiwat : optval;
4869
4870 if (space >= so->so_snd.sb_lowat) {
4871 sowwakeup(so);
4872 }
4873 break;
4874 }
4875 case SO_RCVLOWAT: {
4876 int64_t data_len;
4877 so->so_rcv.sb_lowat =
4878 (optval > so->so_rcv.sb_hiwat) ?
4879 so->so_rcv.sb_hiwat : optval;
4880 data_len = so->so_rcv.sb_cc
4881 - so->so_rcv.sb_ctl;
4882 if (data_len >= so->so_rcv.sb_lowat)
4883 sorwakeup(so);
4884 break;
4885 }
4886 }
4887 break;
4888
4889 case SO_SNDTIMEO:
4890 case SO_RCVTIMEO:
4891 error = sooptcopyin_timeval(sopt, &tv);
4892 if (error != 0)
4893 goto out;
4894
4895 switch (sopt->sopt_name) {
4896 case SO_SNDTIMEO:
4897 so->so_snd.sb_timeo = tv;
4898 break;
4899 case SO_RCVTIMEO:
4900 so->so_rcv.sb_timeo = tv;
4901 break;
4902 }
4903 break;
4904
4905 case SO_NKE: {
4906 struct so_nke nke;
4907
4908 error = sooptcopyin(sopt, &nke, sizeof (nke),
4909 sizeof (nke));
4910 if (error != 0)
4911 goto out;
4912
4913 error = sflt_attach_internal(so, nke.nke_handle);
4914 break;
4915 }
4916
4917 case SO_NOSIGPIPE:
4918 error = sooptcopyin(sopt, &optval, sizeof (optval),
4919 sizeof (optval));
4920 if (error != 0)
4921 goto out;
4922 if (optval != 0)
4923 so->so_flags |= SOF_NOSIGPIPE;
4924 else
4925 so->so_flags &= ~SOF_NOSIGPIPE;
4926 break;
4927
4928 case SO_NOADDRERR:
4929 error = sooptcopyin(sopt, &optval, sizeof (optval),
4930 sizeof (optval));
4931 if (error != 0)
4932 goto out;
4933 if (optval != 0)
4934 so->so_flags |= SOF_NOADDRAVAIL;
4935 else
4936 so->so_flags &= ~SOF_NOADDRAVAIL;
4937 break;
4938
4939 case SO_REUSESHAREUID:
4940 error = sooptcopyin(sopt, &optval, sizeof (optval),
4941 sizeof (optval));
4942 if (error != 0)
4943 goto out;
4944 if (optval != 0)
4945 so->so_flags |= SOF_REUSESHAREUID;
4946 else
4947 so->so_flags &= ~SOF_REUSESHAREUID;
4948 break;
4949
4950 case SO_NOTIFYCONFLICT:
4951 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4952 error = EPERM;
4953 goto out;
4954 }
4955 error = sooptcopyin(sopt, &optval, sizeof (optval),
4956 sizeof (optval));
4957 if (error != 0)
4958 goto out;
4959 if (optval != 0)
4960 so->so_flags |= SOF_NOTIFYCONFLICT;
4961 else
4962 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4963 break;
4964
4965 case SO_RESTRICTIONS:
4966 error = sooptcopyin(sopt, &optval, sizeof (optval),
4967 sizeof (optval));
4968 if (error != 0)
4969 goto out;
4970
4971 error = so_set_restrictions(so, optval);
4972 break;
4973
4974 case SO_AWDL_UNRESTRICTED:
4975 if (SOCK_DOM(so) != PF_INET &&
4976 SOCK_DOM(so) != PF_INET6) {
4977 error = EOPNOTSUPP;
4978 goto out;
4979 }
4980 error = sooptcopyin(sopt, &optval, sizeof(optval),
4981 sizeof(optval));
4982 if (error != 0)
4983 goto out;
4984 if (optval != 0) {
4985 error = soopt_cred_check(so,
4986 PRIV_NET_RESTRICTED_AWDL, false);
4987 if (error == 0)
4988 inp_set_awdl_unrestricted(
4989 sotoinpcb(so));
4990 } else
4991 inp_clear_awdl_unrestricted(sotoinpcb(so));
4992 break;
4993 case SO_INTCOPROC_ALLOW:
4994 if (SOCK_DOM(so) != PF_INET6) {
4995 error = EOPNOTSUPP;
4996 goto out;
4997 }
4998 error = sooptcopyin(sopt, &optval, sizeof(optval),
4999 sizeof(optval));
5000 if (error != 0)
5001 goto out;
5002 if (optval != 0 &&
5003 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5004 error = soopt_cred_check(so,
5005 PRIV_NET_RESTRICTED_INTCOPROC, false);
5006 if (error == 0)
5007 inp_set_intcoproc_allowed(
5008 sotoinpcb(so));
5009 } else if (optval == 0)
5010 inp_clear_intcoproc_allowed(sotoinpcb(so));
5011 break;
5012
5013 case SO_LABEL:
5014 #if CONFIG_MACF_SOCKET
5015 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5016 sizeof (extmac))) != 0)
5017 goto out;
5018
5019 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5020 so, &extmac);
5021 #else
5022 error = EOPNOTSUPP;
5023 #endif /* MAC_SOCKET */
5024 break;
5025
5026 case SO_UPCALLCLOSEWAIT:
5027 error = sooptcopyin(sopt, &optval, sizeof (optval),
5028 sizeof (optval));
5029 if (error != 0)
5030 goto out;
5031 if (optval != 0)
5032 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5033 else
5034 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5035 break;
5036
5037 case SO_RANDOMPORT:
5038 error = sooptcopyin(sopt, &optval, sizeof (optval),
5039 sizeof (optval));
5040 if (error != 0)
5041 goto out;
5042 if (optval != 0)
5043 so->so_flags |= SOF_BINDRANDOMPORT;
5044 else
5045 so->so_flags &= ~SOF_BINDRANDOMPORT;
5046 break;
5047
5048 case SO_NP_EXTENSIONS: {
5049 struct so_np_extensions sonpx;
5050
5051 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5052 sizeof (sonpx));
5053 if (error != 0)
5054 goto out;
5055 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5056 error = EINVAL;
5057 goto out;
5058 }
5059 /*
5060 * Only one bit defined for now
5061 */
5062 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5063 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5064 so->so_flags |= SOF_NPX_SETOPTSHUT;
5065 else
5066 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5067 }
5068 break;
5069 }
5070
5071 case SO_TRAFFIC_CLASS: {
5072 error = sooptcopyin(sopt, &optval, sizeof (optval),
5073 sizeof (optval));
5074 if (error != 0)
5075 goto out;
5076 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5077 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5078 error = so_set_net_service_type(so, netsvc);
5079 goto out;
5080 }
5081 error = so_set_traffic_class(so, optval);
5082 if (error != 0)
5083 goto out;
5084 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5085 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5086 break;
5087 }
5088
5089 case SO_RECV_TRAFFIC_CLASS: {
5090 error = sooptcopyin(sopt, &optval, sizeof (optval),
5091 sizeof (optval));
5092 if (error != 0)
5093 goto out;
5094 if (optval == 0)
5095 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5096 else
5097 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5098 break;
5099 }
5100
5101 #if (DEVELOPMENT || DEBUG)
5102 case SO_TRAFFIC_CLASS_DBG: {
5103 struct so_tcdbg so_tcdbg;
5104
5105 error = sooptcopyin(sopt, &so_tcdbg,
5106 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5107 if (error != 0)
5108 goto out;
5109 error = so_set_tcdbg(so, &so_tcdbg);
5110 if (error != 0)
5111 goto out;
5112 break;
5113 }
5114 #endif /* (DEVELOPMENT || DEBUG) */
5115
5116 case SO_PRIVILEGED_TRAFFIC_CLASS:
5117 error = priv_check_cred(kauth_cred_get(),
5118 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5119 if (error != 0)
5120 goto out;
5121 error = sooptcopyin(sopt, &optval, sizeof (optval),
5122 sizeof (optval));
5123 if (error != 0)
5124 goto out;
5125 if (optval == 0)
5126 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5127 else
5128 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5129 break;
5130
5131 #if (DEVELOPMENT || DEBUG)
5132 case SO_DEFUNCTIT:
5133 error = sosetdefunct(current_proc(), so, 0, FALSE);
5134 if (error == 0)
5135 error = sodefunct(current_proc(), so, 0);
5136
5137 break;
5138 #endif /* (DEVELOPMENT || DEBUG) */
5139
5140 case SO_DEFUNCTOK:
5141 error = sooptcopyin(sopt, &optval, sizeof (optval),
5142 sizeof (optval));
5143 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5144 if (error == 0)
5145 error = EBADF;
5146 goto out;
5147 }
5148 /*
5149 * Any process can set SO_DEFUNCTOK (clear
5150 * SOF_NODEFUNCT), but only root can clear
5151 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5152 */
5153 if (optval == 0 &&
5154 kauth_cred_issuser(kauth_cred_get()) == 0) {
5155 error = EPERM;
5156 goto out;
5157 }
5158 if (optval)
5159 so->so_flags &= ~SOF_NODEFUNCT;
5160 else
5161 so->so_flags |= SOF_NODEFUNCT;
5162
5163 if (SOCK_DOM(so) == PF_INET ||
5164 SOCK_DOM(so) == PF_INET6) {
5165 char s[MAX_IPv6_STR_LEN];
5166 char d[MAX_IPv6_STR_LEN];
5167 struct inpcb *inp = sotoinpcb(so);
5168
5169 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5170 "[%s %s:%d -> %s:%d] is now marked "
5171 "as %seligible for "
5172 "defunct\n", __func__, proc_selfpid(),
5173 proc_best_name(current_proc()),
5174 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5175 (SOCK_TYPE(so) == SOCK_STREAM) ?
5176 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5177 ((SOCK_DOM(so) == PF_INET) ?
5178 (void *)&inp->inp_laddr.s_addr :
5179 (void *)&inp->in6p_laddr), s, sizeof (s)),
5180 ntohs(inp->in6p_lport),
5181 inet_ntop(SOCK_DOM(so),
5182 (SOCK_DOM(so) == PF_INET) ?
5183 (void *)&inp->inp_faddr.s_addr :
5184 (void *)&inp->in6p_faddr, d, sizeof (d)),
5185 ntohs(inp->in6p_fport),
5186 (so->so_flags & SOF_NODEFUNCT) ?
5187 "not " : "");
5188 } else {
5189 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5190 "is now marked as %seligible for "
5191 "defunct\n",
5192 __func__, proc_selfpid(),
5193 proc_best_name(current_proc()),
5194 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5195 SOCK_DOM(so), SOCK_TYPE(so),
5196 (so->so_flags & SOF_NODEFUNCT) ?
5197 "not " : "");
5198 }
5199 break;
5200
5201 case SO_ISDEFUNCT:
5202 /* This option is not settable */
5203 error = EINVAL;
5204 break;
5205
5206 case SO_OPPORTUNISTIC:
5207 error = sooptcopyin(sopt, &optval, sizeof (optval),
5208 sizeof (optval));
5209 if (error == 0)
5210 error = so_set_opportunistic(so, optval);
5211 break;
5212
5213 case SO_FLUSH:
5214 /* This option is handled by lower layer(s) */
5215 error = 0;
5216 break;
5217
5218 case SO_RECV_ANYIF:
5219 error = sooptcopyin(sopt, &optval, sizeof (optval),
5220 sizeof (optval));
5221 if (error == 0)
5222 error = so_set_recv_anyif(so, optval);
5223 break;
5224
5225 case SO_TRAFFIC_MGT_BACKGROUND: {
5226 /* This option is handled by lower layer(s) */
5227 error = 0;
5228 break;
5229 }
5230
5231 #if FLOW_DIVERT
5232 case SO_FLOW_DIVERT_TOKEN:
5233 error = flow_divert_token_set(so, sopt);
5234 break;
5235 #endif /* FLOW_DIVERT */
5236
5237
5238 case SO_DELEGATED:
5239 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5240 sizeof (optval))) != 0)
5241 break;
5242
5243 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5244 break;
5245
5246 case SO_DELEGATED_UUID: {
5247 uuid_t euuid;
5248
5249 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5250 sizeof (euuid))) != 0)
5251 break;
5252
5253 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5254 break;
5255 }
5256
5257 #if NECP
5258 case SO_NECP_ATTRIBUTES:
5259 error = necp_set_socket_attributes(so, sopt);
5260 break;
5261
5262 case SO_NECP_CLIENTUUID:
5263 if (SOCK_DOM(so) == PF_MULTIPATH) {
5264 /* Handled by MPTCP itself */
5265 break;
5266 }
5267
5268 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5269 error = EINVAL;
5270 goto out;
5271 }
5272
5273 struct inpcb *inp = sotoinpcb(so);
5274 if (!uuid_is_null(inp->necp_client_uuid)) {
5275 // Clear out the old client UUID if present
5276 necp_inpcb_remove_cb(inp);
5277 }
5278
5279 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5280 sizeof(uuid_t), sizeof(uuid_t));
5281 if (error != 0) {
5282 goto out;
5283 }
5284
5285 if (uuid_is_null(inp->necp_client_uuid)) {
5286 error = EINVAL;
5287 goto out;
5288 }
5289
5290 error = necp_client_register_socket_flow(so->last_pid,
5291 inp->necp_client_uuid, inp);
5292 if (error != 0) {
5293 uuid_clear(inp->necp_client_uuid);
5294 goto out;
5295 }
5296
5297 if (inp->inp_lport != 0) {
5298 // There is a bound local port, so this is not
5299 // a fresh socket. Assign to the client.
5300 necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5301 }
5302
5303 break;
5304 #endif /* NECP */
5305
5306 case SO_EXTENDED_BK_IDLE:
5307 error = sooptcopyin(sopt, &optval, sizeof (optval),
5308 sizeof (optval));
5309 if (error == 0)
5310 error = so_set_extended_bk_idle(so, optval);
5311 break;
5312
5313 case SO_MARK_CELLFALLBACK:
5314 error = sooptcopyin(sopt, &optval, sizeof(optval),
5315 sizeof(optval));
5316 if (error != 0)
5317 goto out;
5318 if (optval < 0) {
5319 error = EINVAL;
5320 goto out;
5321 }
5322 if (optval == 0)
5323 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5324 else
5325 so->so_flags1 |= SOF1_CELLFALLBACK;
5326 break;
5327
5328 case SO_NET_SERVICE_TYPE: {
5329 error = sooptcopyin(sopt, &optval, sizeof(optval),
5330 sizeof(optval));
5331 if (error != 0)
5332 goto out;
5333 error = so_set_net_service_type(so, optval);
5334 break;
5335 }
5336
5337 case SO_QOSMARKING_POLICY_OVERRIDE:
5338 error = priv_check_cred(kauth_cred_get(),
5339 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5340 if (error != 0)
5341 goto out;
5342 error = sooptcopyin(sopt, &optval, sizeof(optval),
5343 sizeof(optval));
5344 if (error != 0)
5345 goto out;
5346 if (optval == 0)
5347 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5348 else
5349 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5350 break;
5351
5352 default:
5353 error = ENOPROTOOPT;
5354 break;
5355 }
5356 if (error == 0 && so->so_proto != NULL &&
5357 so->so_proto->pr_ctloutput != NULL) {
5358 (void) so->so_proto->pr_ctloutput(so, sopt);
5359 }
5360 }
5361 out:
5362 if (dolock)
5363 socket_unlock(so, 1);
5364 return (error);
5365 }
5366
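/*
 * A minimal user-space sketch of the socket-level SET path handled by
 * sosetoptlock() above: SO_REUSEADDR simply toggles a bit in
 * so_options, while SO_RCVBUF goes through sbreserve() and fails with
 * EINVAL for values below 1 or ENOBUFS when the reservation is
 * refused.  "fd" is assumed to be an open socket; the buffer size is
 * an arbitrary example and the helper name is hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <sys/socket.h>
#include <stdio.h>

static int
tune_socket(int fd)
{
	int on = 1;
	int rcvbuf = 256 * 1024;	/* arbitrary example size */

	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) == -1) {
		perror("setsockopt(SO_REUSEADDR)");
		return (-1);
	}
	/* Rejected with EINVAL if < 1, ENOBUFS if sbreserve() refuses it */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
	    sizeof (rcvbuf)) == -1) {
		perror("setsockopt(SO_RCVBUF)");
		return (-1);
	}
	return (0);
}
#endif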
5367 /* Helper routines for getsockopt */
5368 int
5369 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5370 {
5371 int error;
5372 size_t valsize;
5373
5374 error = 0;
5375
5376 /*
5377 * Documented get behavior is that we always return a value,
5378 * possibly truncated to fit in the user's buffer.
5379 * Traditional behavior is that we always tell the user
5380 * precisely how much we copied, rather than something useful
5381 * like the total amount we had available for her.
5382 * Note that this interface is not idempotent; the entire answer must
5383 * generated ahead of time.
5384 */
5385 valsize = min(len, sopt->sopt_valsize);
5386 sopt->sopt_valsize = valsize;
5387 if (sopt->sopt_val != USER_ADDR_NULL) {
5388 if (sopt->sopt_p != kernproc)
5389 error = copyout(buf, sopt->sopt_val, valsize);
5390 else
5391 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5392 }
5393 return (error);
5394 }
5395
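/*
 * The get-side contract described above, seen from user space:
 * getsockopt(2) always copies a value out, possibly truncated to fit
 * the caller's buffer, and rewrites the length argument with the
 * number of bytes actually copied.  "fd" is assumed to be an open
 * socket and the helper name is hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <sys/socket.h>
#include <stdio.h>

static void
show_value_result_length(int fd)
{
	int sndbuf = 0;
	socklen_t len = sizeof (sndbuf);	/* value-result argument */

	if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, &len) == 0) {
		/* len now holds how many bytes the kernel copied out */
		printf("SO_SNDBUF=%d (%u bytes copied)\n", sndbuf,
		    (unsigned int)len);
	}
}
#endif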
5396 static int
5397 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5398 {
5399 int error;
5400 size_t len;
5401 struct user64_timeval tv64 = {};
5402 struct user32_timeval tv32 = {};
5403 const void * val;
5404 size_t valsize;
5405
5406 error = 0;
5407 if (proc_is64bit(sopt->sopt_p)) {
5408 len = sizeof (tv64);
5409 tv64.tv_sec = tv_p->tv_sec;
5410 tv64.tv_usec = tv_p->tv_usec;
5411 val = &tv64;
5412 } else {
5413 len = sizeof (tv32);
5414 tv32.tv_sec = tv_p->tv_sec;
5415 tv32.tv_usec = tv_p->tv_usec;
5416 val = &tv32;
5417 }
5418 valsize = min(len, sopt->sopt_valsize);
5419 sopt->sopt_valsize = valsize;
5420 if (sopt->sopt_val != USER_ADDR_NULL) {
5421 if (sopt->sopt_p != kernproc)
5422 error = copyout(val, sopt->sopt_val, valsize);
5423 else
5424 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5425 }
5426 return (error);
5427 }
5428
5429 /*
5430 * Return: 0 Success
5431 * ENOPROTOOPT
5432 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5433 * <pr_ctloutput>:???
5434 * <sf_getoption>:???
5435 */
5436 int
5437 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5438 {
5439 int error, optval;
5440 struct linger l;
5441 struct timeval tv;
5442 #if CONFIG_MACF_SOCKET
5443 struct mac extmac;
5444 #endif /* MAC_SOCKET */
5445
5446 if (sopt->sopt_dir != SOPT_GET)
5447 sopt->sopt_dir = SOPT_GET;
5448
5449 if (dolock)
5450 socket_lock(so, 1);
5451
5452 error = sflt_getsockopt(so, sopt);
5453 if (error != 0) {
5454 if (error == EJUSTRETURN)
5455 error = 0;
5456 goto out;
5457 }
5458
5459 if (sopt->sopt_level != SOL_SOCKET) {
5460 if (so->so_proto != NULL &&
5461 so->so_proto->pr_ctloutput != NULL) {
5462 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5463 goto out;
5464 }
5465 error = ENOPROTOOPT;
5466 } else {
5467 /*
5468 * Allow socket-level (SOL_SOCKET) options to be filtered by
5469 * the protocol layer, if needed. A zero value returned from
5470 * the handler means use default socket-level processing as
5471 * done by the rest of this routine. Otherwise, any other
5472 * return value indicates that the option is unsupported.
5473 */
5474 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5475 pru_socheckopt(so, sopt)) != 0)
5476 goto out;
5477
5478 error = 0;
5479 switch (sopt->sopt_name) {
5480 case SO_LINGER:
5481 case SO_LINGER_SEC:
5482 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5483 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5484 so->so_linger : so->so_linger / hz;
5485 error = sooptcopyout(sopt, &l, sizeof (l));
5486 break;
5487
5488 case SO_USELOOPBACK:
5489 case SO_DONTROUTE:
5490 case SO_DEBUG:
5491 case SO_KEEPALIVE:
5492 case SO_REUSEADDR:
5493 case SO_REUSEPORT:
5494 case SO_BROADCAST:
5495 case SO_OOBINLINE:
5496 case SO_TIMESTAMP:
5497 case SO_TIMESTAMP_MONOTONIC:
5498 case SO_DONTTRUNC:
5499 case SO_WANTMORE:
5500 case SO_WANTOOBFLAG:
5501 case SO_NOWAKEFROMSLEEP:
5502 case SO_NOAPNFALLBK:
5503 optval = so->so_options & sopt->sopt_name;
5504 integer:
5505 error = sooptcopyout(sopt, &optval, sizeof (optval));
5506 break;
5507
5508 case SO_TYPE:
5509 optval = so->so_type;
5510 goto integer;
5511
5512 case SO_NREAD:
5513 if (so->so_proto->pr_flags & PR_ATOMIC) {
5514 int pkt_total;
5515 struct mbuf *m1;
5516
5517 pkt_total = 0;
5518 m1 = so->so_rcv.sb_mb;
5519 while (m1 != NULL) {
5520 if (m1->m_type == MT_DATA ||
5521 m1->m_type == MT_HEADER ||
5522 m1->m_type == MT_OOBDATA)
5523 pkt_total += m1->m_len;
5524 m1 = m1->m_next;
5525 }
5526 optval = pkt_total;
5527 } else {
5528 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5529 }
5530 goto integer;
5531
5532 case SO_NUMRCVPKT:
5533 if (so->so_proto->pr_flags & PR_ATOMIC) {
5534 int cnt = 0;
5535 struct mbuf *m1;
5536
5537 m1 = so->so_rcv.sb_mb;
5538 while (m1 != NULL) {
5539 if (m1->m_type == MT_DATA ||
5540 m1->m_type == MT_HEADER ||
5541 m1->m_type == MT_OOBDATA)
5542 cnt += 1;
5543 m1 = m1->m_nextpkt;
5544 }
5545 optval = cnt;
5546 goto integer;
5547 } else {
5548 error = EINVAL;
5549 break;
5550 }
5551
5552 case SO_NWRITE:
5553 optval = so->so_snd.sb_cc;
5554 goto integer;
5555
5556 case SO_ERROR:
5557 optval = so->so_error;
5558 so->so_error = 0;
5559 goto integer;
5560
5561 case SO_SNDBUF: {
5562 u_int32_t hiwat = so->so_snd.sb_hiwat;
5563
5564 if (so->so_snd.sb_flags & SB_UNIX) {
5565 struct unpcb *unp =
5566 (struct unpcb *)(so->so_pcb);
5567 if (unp != NULL && unp->unp_conn != NULL) {
5568 hiwat += unp->unp_conn->unp_cc;
5569 }
5570 }
5571
5572 optval = hiwat;
5573 goto integer;
5574 }
5575 case SO_RCVBUF:
5576 optval = so->so_rcv.sb_hiwat;
5577 goto integer;
5578
5579 case SO_SNDLOWAT:
5580 optval = so->so_snd.sb_lowat;
5581 goto integer;
5582
5583 case SO_RCVLOWAT:
5584 optval = so->so_rcv.sb_lowat;
5585 goto integer;
5586
5587 case SO_SNDTIMEO:
5588 case SO_RCVTIMEO:
5589 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5590 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5591
5592 error = sooptcopyout_timeval(sopt, &tv);
5593 break;
5594
5595 case SO_NOSIGPIPE:
5596 optval = (so->so_flags & SOF_NOSIGPIPE);
5597 goto integer;
5598
5599 case SO_NOADDRERR:
5600 optval = (so->so_flags & SOF_NOADDRAVAIL);
5601 goto integer;
5602
5603 case SO_REUSESHAREUID:
5604 optval = (so->so_flags & SOF_REUSESHAREUID);
5605 goto integer;
5606
5607
5608 case SO_NOTIFYCONFLICT:
5609 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5610 goto integer;
5611
5612 case SO_RESTRICTIONS:
5613 optval = so_get_restrictions(so);
5614 goto integer;
5615
5616 case SO_AWDL_UNRESTRICTED:
5617 if (SOCK_DOM(so) == PF_INET ||
5618 SOCK_DOM(so) == PF_INET6) {
5619 optval = inp_get_awdl_unrestricted(
5620 sotoinpcb(so));
5621 goto integer;
5622 } else
5623 error = EOPNOTSUPP;
5624 break;
5625
5626 case SO_INTCOPROC_ALLOW:
5627 if (SOCK_DOM(so) == PF_INET6) {
5628 optval = inp_get_intcoproc_allowed(
5629 sotoinpcb(so));
5630 goto integer;
5631 } else
5632 error = EOPNOTSUPP;
5633 break;
5634
5635 case SO_LABEL:
5636 #if CONFIG_MACF_SOCKET
5637 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5638 sizeof (extmac))) != 0 ||
5639 (error = mac_socket_label_get(proc_ucred(
5640 sopt->sopt_p), so, &extmac)) != 0)
5641 break;
5642
5643 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5644 #else
5645 error = EOPNOTSUPP;
5646 #endif /* MAC_SOCKET */
5647 break;
5648
5649 case SO_PEERLABEL:
5650 #if CONFIG_MACF_SOCKET
5651 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5652 sizeof (extmac))) != 0 ||
5653 (error = mac_socketpeer_label_get(proc_ucred(
5654 sopt->sopt_p), so, &extmac)) != 0)
5655 break;
5656
5657 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5658 #else
5659 error = EOPNOTSUPP;
5660 #endif /* MAC_SOCKET */
5661 break;
5662
5663 #ifdef __APPLE_API_PRIVATE
5664 case SO_UPCALLCLOSEWAIT:
5665 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5666 goto integer;
5667 #endif
5668 case SO_RANDOMPORT:
5669 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5670 goto integer;
5671
5672 case SO_NP_EXTENSIONS: {
5673 struct so_np_extensions sonpx = {};
5674
5675 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5676 SONPX_SETOPTSHUT : 0;
5677 sonpx.npx_mask = SONPX_MASK_VALID;
5678
5679 error = sooptcopyout(sopt, &sonpx,
5680 sizeof (struct so_np_extensions));
5681 break;
5682 }
5683
5684 case SO_TRAFFIC_CLASS:
5685 optval = so->so_traffic_class;
5686 goto integer;
5687
5688 case SO_RECV_TRAFFIC_CLASS:
5689 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5690 goto integer;
5691
5692 case SO_TRAFFIC_CLASS_STATS:
5693 error = sooptcopyout(sopt, &so->so_tc_stats,
5694 sizeof (so->so_tc_stats));
5695 break;
5696
5697 #if (DEVELOPMENT || DEBUG)
5698 case SO_TRAFFIC_CLASS_DBG:
5699 error = sogetopt_tcdbg(so, sopt);
5700 break;
5701 #endif /* (DEVELOPMENT || DEBUG) */
5702
5703 case SO_PRIVILEGED_TRAFFIC_CLASS:
5704 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5705 goto integer;
5706
5707 case SO_DEFUNCTOK:
5708 optval = !(so->so_flags & SOF_NODEFUNCT);
5709 goto integer;
5710
5711 case SO_ISDEFUNCT:
5712 optval = (so->so_flags & SOF_DEFUNCT);
5713 goto integer;
5714
5715 case SO_OPPORTUNISTIC:
5716 optval = so_get_opportunistic(so);
5717 goto integer;
5718
5719 case SO_FLUSH:
5720 /* This option is not gettable */
5721 error = EINVAL;
5722 break;
5723
5724 case SO_RECV_ANYIF:
5725 optval = so_get_recv_anyif(so);
5726 goto integer;
5727
5728 case SO_TRAFFIC_MGT_BACKGROUND:
5729 /* This option is handled by lower layer(s) */
5730 if (so->so_proto != NULL &&
5731 so->so_proto->pr_ctloutput != NULL) {
5732 (void) so->so_proto->pr_ctloutput(so, sopt);
5733 }
5734 break;
5735
5736 #if FLOW_DIVERT
5737 case SO_FLOW_DIVERT_TOKEN:
5738 error = flow_divert_token_get(so, sopt);
5739 break;
5740 #endif /* FLOW_DIVERT */
5741
5742 #if NECP
5743 case SO_NECP_ATTRIBUTES:
5744 error = necp_get_socket_attributes(so, sopt);
5745 break;
5746
5747 case SO_NECP_CLIENTUUID:
5748 {
5749 uuid_t *ncu;
5750
5751 if (SOCK_DOM(so) == PF_MULTIPATH) {
5752 ncu = &mpsotomppcb(so)->necp_client_uuid;
5753 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5754 ncu = &sotoinpcb(so)->necp_client_uuid;
5755 } else {
5756 error = EINVAL;
5757 goto out;
5758 }
5759
5760 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
5761 break;
5762 }
5763 #endif /* NECP */
5764
5765 #if CONTENT_FILTER
5766 case SO_CFIL_SOCK_ID: {
5767 cfil_sock_id_t sock_id;
5768
5769 sock_id = cfil_sock_id_from_socket(so);
5770
5771 error = sooptcopyout(sopt, &sock_id,
5772 sizeof(cfil_sock_id_t));
5773 break;
5774 }
5775 #endif /* CONTENT_FILTER */
5776
5777 case SO_EXTENDED_BK_IDLE:
5778 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5779 goto integer;
5780 case SO_MARK_CELLFALLBACK:
5781 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5782 ? 1 : 0;
5783 goto integer;
5784 case SO_NET_SERVICE_TYPE: {
5785 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5786 optval = so->so_netsvctype;
5787 else
5788 optval = NET_SERVICE_TYPE_BE;
5789 goto integer;
5790 }
5791 case SO_NETSVC_MARKING_LEVEL:
5792 optval = so_get_netsvc_marking_level(so);
5793 goto integer;
5794
5795 default:
5796 error = ENOPROTOOPT;
5797 break;
5798 }
5799 }
5800 out:
5801 if (dolock)
5802 socket_unlock(so, 1);
5803 return (error);
5804 }
5805
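/*
 * A minimal user-space sketch of one of the Darwin-specific GET
 * options handled by sogetoptlock() above: SO_NREAD reports the size
 * of the first buffered datagram for atomic (PR_ATOMIC) protocols and
 * the total buffered data minus control bytes otherwise.  "fd" is
 * assumed to be an open socket and the helper name is hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <sys/socket.h>
#include <stdio.h>

static void
print_pending_bytes(int fd)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
		printf("pending: %d bytes\n", nread);
}
#endif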
5806 /*
5807 * The size limit on our soopt_getm() is different from that on FreeBSD.
5808 * We limit the size of options to MCLBYTES. This will have to change
5809 * if we need to define options that need more space than MCLBYTES.
5810 */
5811 int
5812 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5813 {
5814 struct mbuf *m, *m_prev;
5815 int sopt_size = sopt->sopt_valsize;
5816 int how;
5817
5818 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5819 return (EMSGSIZE);
5820
5821 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5822 MGET(m, how, MT_DATA);
5823 if (m == NULL)
5824 return (ENOBUFS);
5825 if (sopt_size > MLEN) {
5826 MCLGET(m, how);
5827 if ((m->m_flags & M_EXT) == 0) {
5828 m_free(m);
5829 return (ENOBUFS);
5830 }
5831 m->m_len = min(MCLBYTES, sopt_size);
5832 } else {
5833 m->m_len = min(MLEN, sopt_size);
5834 }
5835 sopt_size -= m->m_len;
5836 *mp = m;
5837 m_prev = m;
5838
5839 while (sopt_size > 0) {
5840 MGET(m, how, MT_DATA);
5841 if (m == NULL) {
5842 m_freem(*mp);
5843 return (ENOBUFS);
5844 }
5845 if (sopt_size > MLEN) {
5846 MCLGET(m, how);
5847 if ((m->m_flags & M_EXT) == 0) {
5848 m_freem(*mp);
5849 m_freem(m);
5850 return (ENOBUFS);
5851 }
5852 m->m_len = min(MCLBYTES, sopt_size);
5853 } else {
5854 m->m_len = min(MLEN, sopt_size);
5855 }
5856 sopt_size -= m->m_len;
5857 m_prev->m_next = m;
5858 m_prev = m;
5859 }
5860 return (0);
5861 }
5862
5863 /* copyin sopt data into mbuf chain */
5864 int
5865 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5866 {
5867 struct mbuf *m0 = m;
5868
5869 if (sopt->sopt_val == USER_ADDR_NULL)
5870 return (0);
5871 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5872 if (sopt->sopt_p != kernproc) {
5873 int error;
5874
5875 error = copyin(sopt->sopt_val, mtod(m, char *),
5876 m->m_len);
5877 if (error != 0) {
5878 m_freem(m0);
5879 return (error);
5880 }
5881 } else {
5882 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5883 mtod(m, char *), m->m_len);
5884 }
5885 sopt->sopt_valsize -= m->m_len;
5886 sopt->sopt_val += m->m_len;
5887 m = m->m_next;
5888 }
5889 /* enough space should have been allocated by ip6_sooptmcopyin() */
5890 if (m != NULL) {
5891 panic("soopt_mcopyin");
5892 /* NOTREACHED */
5893 }
5894 return (0);
5895 }
5896
5897 /* copyout mbuf chain data into soopt */
5898 int
5899 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5900 {
5901 struct mbuf *m0 = m;
5902 size_t valsize = 0;
5903
5904 if (sopt->sopt_val == USER_ADDR_NULL)
5905 return (0);
5906 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5907 if (sopt->sopt_p != kernproc) {
5908 int error;
5909
5910 error = copyout(mtod(m, char *), sopt->sopt_val,
5911 m->m_len);
5912 if (error != 0) {
5913 m_freem(m0);
5914 return (error);
5915 }
5916 } else {
5917 bcopy(mtod(m, char *),
5918 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5919 }
5920 sopt->sopt_valsize -= m->m_len;
5921 sopt->sopt_val += m->m_len;
5922 valsize += m->m_len;
5923 m = m->m_next;
5924 }
5925 if (m != NULL) {
5926 /* a large enough soopt buffer should be provided by user-land */
5927 m_freem(m0);
5928 return (EINVAL);
5929 }
5930 sopt->sopt_valsize = valsize;
5931 return (0);
5932 }
5933
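/*
 * Illustrative sketch of how the three helpers above are meant to
 * compose (the pattern used by the IPv6 option code): soopt_getm()
 * sizes an mbuf chain for at most MCLBYTES of option data,
 * soopt_mcopyin() fills it from the caller's buffer, and
 * soopt_mcopyout() later returns results the same way.  The function
 * name is hypothetical and error handling is abbreviated.
 */
#if 0	/* illustrative only; not compiled */
static int
example_sockopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	/* Size an mbuf chain large enough for the option data */
	error = soopt_getm(sopt, mp);
	if (error != 0)
		return (error);

	/* Fill the chain; soopt_mcopyin() frees it itself on failure */
	error = soopt_mcopyin(sopt, *mp);
	if (error != 0)
		*mp = NULL;
	return (error);
}
#endif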
5934 void
5935 sohasoutofband(struct socket *so)
5936 {
5937 if (so->so_pgid < 0)
5938 gsignal(-so->so_pgid, SIGURG);
5939 else if (so->so_pgid > 0)
5940 proc_signal(so->so_pgid, SIGURG);
5941 selwakeup(&so->so_rcv.sb_sel);
5942 if (so->so_rcv.sb_flags & SB_KNOTE) {
5943 KNOTE(&so->so_rcv.sb_sel.si_note,
5944 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5945 }
5946 }
5947
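/*
 * sohasoutofband() above delivers SIGURG to the process or process
 * group registered on the socket.  A minimal user-space sketch of
 * arranging for that notification: install a SIGURG handler and claim
 * ownership of the descriptor with fcntl(F_SETOWN).  "fd" is assumed
 * to be a connected stream socket and the names are hypothetical.
 */
#if 0	/* illustrative user-space sketch; not part of the kernel build */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <stdio.h>

static void
on_sigurg(int sig)
{
	(void)sig;	/* OOB data pending; recv(..., MSG_OOB) elsewhere */
}

static int
arm_oob_notification(int fd)
{
	signal(SIGURG, on_sigurg);
	/* Direct SIGURG from sohasoutofband() at this process */
	if (fcntl(fd, F_SETOWN, getpid()) == -1) {
		perror("fcntl(F_SETOWN)");
		return (-1);
	}
	return (0);
}
#endif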
5948 int
5949 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5950 {
5951 #pragma unused(cred)
5952 struct proc *p = current_proc();
5953 int revents = 0;
5954
5955 socket_lock(so, 1);
5956 so_update_last_owner_locked(so, PROC_NULL);
5957 so_update_policy(so);
5958
5959 if (events & (POLLIN | POLLRDNORM))
5960 if (soreadable(so))
5961 revents |= events & (POLLIN | POLLRDNORM);
5962
5963 if (events & (POLLOUT | POLLWRNORM))
5964 if (sowriteable(so))
5965 revents |= events & (POLLOUT | POLLWRNORM);
5966
5967 if (events & (POLLPRI | POLLRDBAND))
5968 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5969 revents |= events & (POLLPRI | POLLRDBAND);
5970
5971 if (revents == 0) {
5972 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5973 /*
5974 * Darwin sets the flag first,
5975 * BSD calls selrecord first
5976 */
5977 so->so_rcv.sb_flags |= SB_SEL;
5978 selrecord(p, &so->so_rcv.sb_sel, wql);
5979 }
5980
5981 if (events & (POLLOUT | POLLWRNORM)) {
5982 /*
5983 * Darwin sets the flag first,
5984 * BSD calls selrecord first
5985 */
5986 so->so_snd.sb_flags |= SB_SEL;
5987 selrecord(p, &so->so_snd.sb_sel, wql);
5988 }
5989 }
5990
5991 socket_unlock(so, 1);
5992 return (revents);
5993 }
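
/*
 * The readiness tests above are what poll(2) reports for a socket; in
 * particular POLLPRI/POLLRDBAND track the OOB mark (so_oobmark /
 * SS_RCVATMARK).  A minimal user-space sketch ("sock" is illustrative):
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLPRI))
 *		;	// urgent data pending or at the mark
 */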
5994
5995 int
5996 soo_kqfilter(struct fileproc *fp, struct knote *kn,
5997 struct kevent_internal_s *kev, vfs_context_t ctx)
5998 {
5999 #pragma unused(fp)
6000 #if !CONFIG_MACF_SOCKET
6001 #pragma unused(ctx)
6002 #endif /* MAC_SOCKET */
6003 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6004 int result;
6005
6006 socket_lock(so, 1);
6007 so_update_last_owner_locked(so, PROC_NULL);
6008 so_update_policy(so);
6009
6010 #if CONFIG_MACF_SOCKET
6011 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6012 kn, so) != 0) {
6013 socket_unlock(so, 1);
6014 kn->kn_flags = EV_ERROR;
6015 kn->kn_data = EPERM;
6016 return 0;
6017 }
6018 #endif /* MAC_SOCKET */
6019
6020 switch (kn->kn_filter) {
6021 case EVFILT_READ:
6022 kn->kn_filtid = EVFILTID_SOREAD;
6023 break;
6024 case EVFILT_WRITE:
6025 kn->kn_filtid = EVFILTID_SOWRITE;
6026 break;
6027 case EVFILT_SOCK:
6028 kn->kn_filtid = EVFILTID_SCK;
6029 break;
6030 case EVFILT_EXCEPT:
6031 kn->kn_filtid = EVFILTID_SOEXCEPT;
6032 break;
6033 default:
6034 socket_unlock(so, 1);
6035 kn->kn_flags = EV_ERROR;
6036 kn->kn_data = EINVAL;
6037 return 0;
6038 }
6039
6040 /*
6041 * call the appropriate sub-filter attach
6042 * with the socket still locked
6043 */
6044 result = knote_fops(kn)->f_attach(kn, kev);
6045
6046 socket_unlock(so, 1);
6047
6048 return result;
6049 }
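
/*
 * kevent() registration on a socket descriptor arrives here and is
 * dispatched to the per-filter attach routines below.  A hedged
 * user-space sketch (EVFILT_READ/EVFILT_WRITE are public; EVFILT_SOCK
 * and EVFILT_EXCEPT are XNU-private; "sock" and "kq" are illustrative):
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */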
6050
6051 static int
6052 filt_soread_common(struct knote *kn, struct socket *so)
6053 {
6054 if (so->so_options & SO_ACCEPTCONN) {
6055 int is_not_empty;
6056
6057 /*
6058 		 * Radar 6615193: handle the listen case dynamically for the
6059 		 * kqueue read filter. This allows listen() to be called after
6060 		 * the kqueue EVFILT_READ filter has been registered.
6061 */
6062
6063 kn->kn_data = so->so_qlen;
6064 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6065
6066 return (is_not_empty);
6067 }
6068
6069 /* socket isn't a listener */
6070 /*
6071 	 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6072 * the bytes of protocol data. We therefore exclude any
6073 * control bytes.
6074 */
6075 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6076
6077 if (kn->kn_sfflags & NOTE_OOB) {
6078 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6079 kn->kn_fflags |= NOTE_OOB;
6080 kn->kn_data -= so->so_oobmark;
6081 return (1);
6082 }
6083 }
6084
6085 if ((so->so_state & SS_CANTRCVMORE)
6086 #if CONTENT_FILTER
6087 && cfil_sock_data_pending(&so->so_rcv) == 0
6088 #endif /* CONTENT_FILTER */
6089 ) {
6090 kn->kn_flags |= EV_EOF;
6091 kn->kn_fflags = so->so_error;
6092 return (1);
6093 }
6094
6095 if (so->so_error) { /* temporary udp error */
6096 return (1);
6097 }
6098
6099 int64_t lowwat = so->so_rcv.sb_lowat;
6100 /*
6101 * Ensure that when NOTE_LOWAT is used, the derived
6102 * low water mark is bounded by socket's rcv buf's
6103 * high and low water mark values.
6104 */
6105 if (kn->kn_sfflags & NOTE_LOWAT) {
6106 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6107 lowwat = so->so_rcv.sb_hiwat;
6108 else if (kn->kn_sdata > lowwat)
6109 lowwat = kn->kn_sdata;
6110 }
6111
6112 /*
6113 * The order below is important. Since NOTE_LOWAT
6114 * overrides sb_lowat, check for NOTE_LOWAT case
6115 * first.
6116 */
6117 if (kn->kn_sfflags & NOTE_LOWAT)
6118 return (kn->kn_data >= lowwat);
6119
6120 return (so->so_rcv.sb_cc >= lowwat);
6121 }
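
/*
 * Sketch of driving the NOTE_LOWAT clamping above from user space: the
 * knote's kn_sdata carries the requested low water mark (the kevent
 * data field) and is bounded here by the receive buffer's hi/lo marks.
 * ("sock" and "kq" are illustrative.)
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// fires once >= 4096 bytes queued
 */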
6122
6123 static int
6124 filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6125 {
6126 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6127
6128 /* socket locked */
6129
6130 /*
6131 * If the caller explicitly asked for OOB results (e.g. poll())
6132 * from EVFILT_READ, then save that off in the hookid field
6133 * and reserve the kn_flags EV_OOBAND bit for output only.
6134 */
6135 if (kn->kn_filter == EVFILT_READ &&
6136 kn->kn_flags & EV_OOBAND) {
6137 kn->kn_flags &= ~EV_OOBAND;
6138 kn->kn_hookid = EV_OOBAND;
6139 } else {
6140 kn->kn_hookid = 0;
6141 }
6142 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6143 so->so_rcv.sb_flags |= SB_KNOTE;
6144
6145 /* indicate if event is already fired */
6146 return filt_soread_common(kn, so);
6147 }
6148
6149 static void
6150 filt_sordetach(struct knote *kn)
6151 {
6152 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6153
6154 socket_lock(so, 1);
6155 if (so->so_rcv.sb_flags & SB_KNOTE)
6156 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6157 so->so_rcv.sb_flags &= ~SB_KNOTE;
6158 socket_unlock(so, 1);
6159 }
6160
6161 /*ARGSUSED*/
6162 static int
6163 filt_soread(struct knote *kn, long hint)
6164 {
6165 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6166 int retval;
6167
6168 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6169 socket_lock(so, 1);
6170
6171 retval = filt_soread_common(kn, so);
6172
6173 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6174 socket_unlock(so, 1);
6175
6176 return retval;
6177 }
6178
6179 static int
6180 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6181 {
6182 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6183 int retval;
6184
6185 socket_lock(so, 1);
6186
6187 /* save off the new input fflags and data */
6188 kn->kn_sfflags = kev->fflags;
6189 kn->kn_sdata = kev->data;
6190 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6191 kn->kn_udata = kev->udata;
6192
6193 /* determine if changes result in fired events */
6194 retval = filt_soread_common(kn, so);
6195
6196 socket_unlock(so, 1);
6197
6198 return retval;
6199 }
6200
6201 static int
6202 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6203 {
6204 #pragma unused(data)
6205 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6206 int retval;
6207
6208 socket_lock(so, 1);
6209 retval = filt_soread_common(kn, so);
6210 if (retval) {
6211 *kev = kn->kn_kevent;
6212 if (kn->kn_flags & EV_CLEAR) {
6213 kn->kn_fflags = 0;
6214 kn->kn_data = 0;
6215 }
6216 }
6217 socket_unlock(so, 1);
6218
6219 return retval;
6220 }
6221
6222 int
6223 so_wait_for_if_feedback(struct socket *so)
6224 {
6225 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6226 (so->so_state & SS_ISCONNECTED)) {
6227 struct inpcb *inp = sotoinpcb(so);
6228 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6229 return (1);
6230 }
6231 return (0);
6232 }
6233
6234 static int
6235 filt_sowrite_common(struct knote *kn, struct socket *so)
6236 {
6237 int ret = 0;
6238
6239 kn->kn_data = sbspace(&so->so_snd);
6240 if (so->so_state & SS_CANTSENDMORE) {
6241 kn->kn_flags |= EV_EOF;
6242 kn->kn_fflags = so->so_error;
6243 return 1;
6244 }
6245 if (so->so_error) { /* temporary udp error */
6246 return 1;
6247 }
6248 if (!socanwrite(so)) {
6249 return 0;
6250 }
6251 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6252 return 1;
6253 }
6254 int64_t lowwat = so->so_snd.sb_lowat;
6255 if (kn->kn_sfflags & NOTE_LOWAT) {
6256 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6257 lowwat = so->so_snd.sb_hiwat;
6258 else if (kn->kn_sdata > lowwat)
6259 lowwat = kn->kn_sdata;
6260 }
6261 if (kn->kn_data >= lowwat) {
6262 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6263 #if (DEBUG || DEVELOPMENT)
6264 && so_notsent_lowat_check == 1
6265 #endif /* DEBUG || DEVELOPMENT */
6266 ) {
6267 if ((SOCK_DOM(so) == PF_INET ||
6268 SOCK_DOM(so) == PF_INET6) &&
6269 so->so_type == SOCK_STREAM) {
6270 ret = tcp_notsent_lowat_check(so);
6271 }
6272 #if MPTCP
6273 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6274 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6275 ret = mptcp_notsent_lowat_check(so);
6276 }
6277 #endif
6278 else {
6279 return 1;
6280 }
6281 } else {
6282 ret = 1;
6283 }
6284 }
6285 if (so_wait_for_if_feedback(so))
6286 ret = 0;
6287 return (ret);
6288 }
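
/*
 * The SOF_NOTSENT_LOWAT branch above pairs with the TCP_NOTSENT_LOWAT
 * socket option (an assumption based on the tcp_notsent_lowat_check()
 * call; the option itself is handled in the TCP code).  Sketch, with
 * "sock" and "kq" illustrative:
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(sock, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof (lowat));
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// writable only while unsent
 *						// data stays below lowat
 */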
6289
6290 static int
6291 filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6292 {
6293 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6294
6295 /* socket locked */
6296 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6297 so->so_snd.sb_flags |= SB_KNOTE;
6298
6299 	/* determine if it's already fired */
6300 return filt_sowrite_common(kn, so);
6301 }
6302
6303 static void
6304 filt_sowdetach(struct knote *kn)
6305 {
6306 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6307 socket_lock(so, 1);
6308
6309 if (so->so_snd.sb_flags & SB_KNOTE)
6310 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6311 so->so_snd.sb_flags &= ~SB_KNOTE;
6312 socket_unlock(so, 1);
6313 }
6314
6315 /*ARGSUSED*/
6316 static int
6317 filt_sowrite(struct knote *kn, long hint)
6318 {
6319 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6320 int ret;
6321
6322 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6323 socket_lock(so, 1);
6324
6325 ret = filt_sowrite_common(kn, so);
6326
6327 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6328 socket_unlock(so, 1);
6329
6330 return ret;
6331 }
6332
6333 static int
6334 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6335 {
6336 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6337 int ret;
6338
6339 socket_lock(so, 1);
6340
6341 	/* save off the new input fflags and data */
6342 kn->kn_sfflags = kev->fflags;
6343 kn->kn_sdata = kev->data;
6344 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6345 kn->kn_udata = kev->udata;
6346
6347 /* determine if these changes result in a triggered event */
6348 ret = filt_sowrite_common(kn, so);
6349
6350 socket_unlock(so, 1);
6351
6352 return ret;
6353 }
6354
6355 static int
6356 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6357 {
6358 #pragma unused(data)
6359 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6360 int ret;
6361
6362 socket_lock(so, 1);
6363 ret = filt_sowrite_common(kn, so);
6364 if (ret) {
6365 *kev = kn->kn_kevent;
6366 if (kn->kn_flags & EV_CLEAR) {
6367 kn->kn_fflags = 0;
6368 kn->kn_data = 0;
6369 }
6370 }
6371 socket_unlock(so, 1);
6372 return ret;
6373 }
6374
6375 static int
6376 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6377 {
6378 int ret = 0;
6379 uint32_t level_trigger = 0;
6380
6381 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6382 kn->kn_fflags |= NOTE_CONNRESET;
6383 }
6384 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6385 kn->kn_fflags |= NOTE_TIMEOUT;
6386 }
6387 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6388 kn->kn_fflags |= NOTE_NOSRCADDR;
6389 }
6390 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6391 kn->kn_fflags |= NOTE_IFDENIED;
6392 }
6393 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6394 kn->kn_fflags |= NOTE_KEEPALIVE;
6395 }
6396 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6397 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6398 }
6399 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6400 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6401 }
6402 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6403 (so->so_state & SS_ISCONNECTED)) {
6404 kn->kn_fflags |= NOTE_CONNECTED;
6405 level_trigger |= NOTE_CONNECTED;
6406 }
6407 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6408 (so->so_state & SS_ISDISCONNECTED)) {
6409 kn->kn_fflags |= NOTE_DISCONNECTED;
6410 level_trigger |= NOTE_DISCONNECTED;
6411 }
6412 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6413 if (so->so_proto != NULL &&
6414 (so->so_proto->pr_flags & PR_EVCONNINFO))
6415 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6416 }
6417
6418 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6419 tcp_notify_ack_active(so)) {
6420 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6421 }
6422
6423 if ((so->so_state & SS_CANTRCVMORE)
6424 #if CONTENT_FILTER
6425 && cfil_sock_data_pending(&so->so_rcv) == 0
6426 #endif /* CONTENT_FILTER */
6427 ) {
6428 kn->kn_fflags |= NOTE_READCLOSED;
6429 level_trigger |= NOTE_READCLOSED;
6430 }
6431
6432 if (so->so_state & SS_CANTSENDMORE) {
6433 kn->kn_fflags |= NOTE_WRITECLOSED;
6434 level_trigger |= NOTE_WRITECLOSED;
6435 }
6436
6437 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6438 (so->so_flags & SOF_SUSPENDED)) {
6439 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6440
6441 /* If resume event was delivered before, reset it */
6442 kn->kn_hookid &= ~NOTE_RESUME;
6443
6444 kn->kn_fflags |= NOTE_SUSPEND;
6445 level_trigger |= NOTE_SUSPEND;
6446 }
6447
6448 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6449 (so->so_flags & SOF_SUSPENDED) == 0) {
6450 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6451
6452 /* If suspend event was delivered before, reset it */
6453 kn->kn_hookid &= ~NOTE_SUSPEND;
6454
6455 kn->kn_fflags |= NOTE_RESUME;
6456 level_trigger |= NOTE_RESUME;
6457 }
6458
6459 if (so->so_error != 0) {
6460 ret = 1;
6461 kn->kn_data = so->so_error;
6462 kn->kn_flags |= EV_EOF;
6463 } else {
6464 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6465 }
6466
6467 /* Reset any events that are not requested on this knote */
6468 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6469 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6470
6471 	/* Find the level-triggered events that have already been delivered */
6472 level_trigger &= kn->kn_hookid;
6473 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6474
6475 	/* Do not deliver level-triggered events more than once */
6476 if ((kn->kn_fflags & ~level_trigger) != 0)
6477 ret = 1;
6478
6479 return (ret);
6480 }
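
/*
 * Sketch of arming this filter from user space (EVFILT_SOCK and the
 * NOTE_* socket events are XNU-private; the flag names follow the mask
 * handling above; "sock" and "kq" are illustrative):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_READCLOSED, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */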
6481
6482 static int
6483 filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6484 {
6485 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6486
6487 /* socket locked */
6488 kn->kn_hookid = 0;
6489 if (KNOTE_ATTACH(&so->so_klist, kn))
6490 so->so_flags |= SOF_KNOTE;
6491
6492 /* determine if event already fired */
6493 return filt_sockev_common(kn, so, 0);
6494 }
6495
6496 static void
6497 filt_sockdetach(struct knote *kn)
6498 {
6499 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6500 socket_lock(so, 1);
6501
6502 if ((so->so_flags & SOF_KNOTE) != 0)
6503 if (KNOTE_DETACH(&so->so_klist, kn))
6504 so->so_flags &= ~SOF_KNOTE;
6505 socket_unlock(so, 1);
6506 }
6507
6508 static int
6509 filt_sockev(struct knote *kn, long hint)
6510 {
6511 int ret = 0, locked = 0;
6512 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6513 long ev_hint = (hint & SO_FILT_HINT_EV);
6514
6515 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6516 socket_lock(so, 1);
6517 locked = 1;
6518 }
6519
6520 ret = filt_sockev_common(kn, so, ev_hint);
6521
6522 if (locked)
6523 socket_unlock(so, 1);
6524
6525 return ret;
6526 }
6527
6528
6529
6530 /*
6531 * filt_socktouch - update event state
6532 */
6533 static int
6534 filt_socktouch(
6535 struct knote *kn,
6536 struct kevent_internal_s *kev)
6537 {
6538 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6539 uint32_t changed_flags;
6540 int ret;
6541
6542 socket_lock(so, 1);
6543
6544 	/* note which previously requested fflags differ from the delivered state */
6545 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6546
6547 /* save off the new input fflags and data */
6548 kn->kn_sfflags = kev->fflags;
6549 kn->kn_sdata = kev->data;
6550 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6551 kn->kn_udata = kev->udata;
6552
6553 /* restrict the current results to the (smaller?) set of new interest */
6554 /*
6555 * For compatibility with previous implementations, we leave kn_fflags
6556 * as they were before.
6557 */
6558 //kn->kn_fflags &= kev->fflags;
6559
6560 /*
6561 * Since we keep track of events that are already
6562 * delivered, if any of those events are not requested
6563 * anymore the state related to them can be reset
6564 */
6565 kn->kn_hookid &=
6566 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6567
6568 /* determine if we have events to deliver */
6569 ret = filt_sockev_common(kn, so, 0);
6570
6571 socket_unlock(so, 1);
6572
6573 return ret;
6574 }
6575
6576 /*
6577 * filt_sockprocess - query event fired state and return data
6578 */
6579 static int
6580 filt_sockprocess(
6581 struct knote *kn,
6582 struct filt_process_s *data,
6583 struct kevent_internal_s *kev)
6584 {
6585 #pragma unused(data)
6586
6587 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6588 int ret = 0;
6589
6590 socket_lock(so, 1);
6591
6592 ret = filt_sockev_common(kn, so, 0);
6593 if (ret) {
6594 *kev = kn->kn_kevent;
6595
6596 /*
6597 		 * Store the state of the events being delivered. This
6598 		 * state can be used to deliver level-triggered events at
6599 		 * least once and still avoid waking up the application
6600 		 * multiple times as long as the event is active.
6601 */
6602 if (kn->kn_fflags != 0)
6603 kn->kn_hookid |= (kn->kn_fflags &
6604 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6605
6606 /*
6607 		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6608 		 * only one of them and remember which one was
6609 		 * delivered last.
6610 */
6611 if (kn->kn_fflags & NOTE_SUSPEND)
6612 kn->kn_hookid &= ~NOTE_RESUME;
6613 if (kn->kn_fflags & NOTE_RESUME)
6614 kn->kn_hookid &= ~NOTE_SUSPEND;
6615
6616 if (kn->kn_flags & EV_CLEAR) {
6617 kn->kn_data = 0;
6618 kn->kn_fflags = 0;
6619 }
6620 }
6621
6622 socket_unlock(so, 1);
6623
6624 return ret;
6625 }
6626
6627 void
6628 get_sockev_state(struct socket *so, u_int32_t *statep)
6629 {
6630 u_int32_t state = *(statep);
6631
6632 /*
6633 	 * If the state variable is already carrying a previous event,
6634 	 * leave it untouched.
6635 */
6636 if (state != 0)
6637 return;
6638
6639 if (so->so_state & SS_ISCONNECTED)
6640 state |= SOCKEV_CONNECTED;
6641 else
6642 state &= ~(SOCKEV_CONNECTED);
6643 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6644 *(statep) = state;
6645 }
6646
6647 #define SO_LOCK_HISTORY_STR_LEN \
6648 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6649
6650 __private_extern__ const char *
6651 solockhistory_nr(struct socket *so)
6652 {
6653 size_t n = 0;
6654 int i;
6655 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6656
6657 bzero(lock_history_str, sizeof (lock_history_str));
6658 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6659 n += snprintf(lock_history_str + n,
6660 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6661 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6662 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6663 }
6664 return (lock_history_str);
6665 }
6666
6667 void
6668 socket_lock(struct socket *so, int refcount)
6669 {
6670 void *lr_saved;
6671
6672 lr_saved = __builtin_return_address(0);
6673
6674 if (so->so_proto->pr_lock) {
6675 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6676 } else {
6677 #ifdef MORE_LOCKING_DEBUG
6678 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
6679 LCK_MTX_ASSERT_NOTOWNED);
6680 #endif
6681 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6682 if (refcount)
6683 so->so_usecount++;
6684 so->lock_lr[so->next_lock_lr] = lr_saved;
6685 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6686 }
6687 }
6688
6689 void
6690 socket_lock_assert_owned(struct socket *so)
6691 {
6692 lck_mtx_t *mutex_held;
6693
6694 if (so->so_proto->pr_getlock != NULL)
6695 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6696 else
6697 mutex_held = so->so_proto->pr_domain->dom_mtx;
6698
6699 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6700 }
6701
6702 int
6703 socket_try_lock(struct socket *so)
6704 {
6705 lck_mtx_t *mtx;
6706
6707 if (so->so_proto->pr_getlock != NULL)
6708 mtx = (*so->so_proto->pr_getlock)(so, 0);
6709 else
6710 mtx = so->so_proto->pr_domain->dom_mtx;
6711
6712 return (lck_mtx_try_lock(mtx));
6713 }
6714
6715 void
6716 socket_unlock(struct socket *so, int refcount)
6717 {
6718 void *lr_saved;
6719 lck_mtx_t *mutex_held;
6720
6721 lr_saved = __builtin_return_address(0);
6722
6723 if (so->so_proto == NULL) {
6724 panic("%s: null so_proto so=%p\n", __func__, so);
6725 /* NOTREACHED */
6726 }
6727
6728 if (so && so->so_proto->pr_unlock) {
6729 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6730 } else {
6731 mutex_held = so->so_proto->pr_domain->dom_mtx;
6732 #ifdef MORE_LOCKING_DEBUG
6733 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6734 #endif
6735 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6736 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6737
6738 if (refcount) {
6739 if (so->so_usecount <= 0) {
6740 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6741 "lrh=%s", __func__, so->so_usecount, so,
6742 SOCK_DOM(so), so->so_type,
6743 SOCK_PROTO(so), solockhistory_nr(so));
6744 /* NOTREACHED */
6745 }
6746
6747 so->so_usecount--;
6748 if (so->so_usecount == 0)
6749 sofreelastref(so, 1);
6750 }
6751 lck_mtx_unlock(mutex_held);
6752 }
6753 }
6754
6755 /* Called with socket locked, will unlock socket */
6756 void
6757 sofree(struct socket *so)
6758 {
6759 lck_mtx_t *mutex_held;
6760
6761 if (so->so_proto->pr_getlock != NULL)
6762 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6763 else
6764 mutex_held = so->so_proto->pr_domain->dom_mtx;
6765 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6766
6767 sofreelastref(so, 0);
6768 }
6769
6770 void
6771 soreference(struct socket *so)
6772 {
6773 	socket_lock(so, 1);	/* lock and take one reference on the socket */
6774 socket_unlock(so, 0); /* unlock only */
6775 }
6776
6777 void
6778 sodereference(struct socket *so)
6779 {
6780 socket_lock(so, 0);
6781 socket_unlock(so, 1);
6782 }
6783
6784 /*
6785 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6786 * possibility of using jumbo clusters. The caller must hold
6787 * the socket lock.
6788 */
6789 void
6790 somultipages(struct socket *so, boolean_t set)
6791 {
6792 if (set)
6793 so->so_flags |= SOF_MULTIPAGES;
6794 else
6795 so->so_flags &= ~SOF_MULTIPAGES;
6796 }
6797
6798 void
6799 soif2kcl(struct socket *so, boolean_t set)
6800 {
6801 if (set)
6802 so->so_flags1 |= SOF1_IF_2KCL;
6803 else
6804 so->so_flags1 &= ~SOF1_IF_2KCL;
6805 }
6806
6807 int
6808 so_isdstlocal(struct socket *so)
6809 {
6810 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6811
6812 if (SOCK_DOM(so) == PF_INET)
6813 return (inaddr_local(inp->inp_faddr));
6814 else if (SOCK_DOM(so) == PF_INET6)
6815 return (in6addr_local(&inp->in6p_faddr));
6816
6817 return (0);
6818 }
6819
6820 int
6821 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6822 {
6823 struct sockbuf *rcv, *snd;
6824 int err = 0, defunct;
6825
6826 rcv = &so->so_rcv;
6827 snd = &so->so_snd;
6828
6829 defunct = (so->so_flags & SOF_DEFUNCT);
6830 if (defunct) {
6831 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6832 panic("%s: SB_DROP not set", __func__);
6833 /* NOTREACHED */
6834 }
6835 goto done;
6836 }
6837
6838 if (so->so_flags & SOF_NODEFUNCT) {
6839 if (noforce) {
6840 err = EOPNOTSUPP;
6841 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6842 "name %s level %d) so 0x%llx [%d,%d] "
6843 "is not eligible for defunct "
6844 "(%d)\n", __func__, proc_selfpid(),
6845 proc_best_name(current_proc()), proc_pid(p),
6846 proc_best_name(p), level,
6847 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6848 SOCK_DOM(so), SOCK_TYPE(so), err);
6849 return (err);
6850 }
6851 so->so_flags &= ~SOF_NODEFUNCT;
6852 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6853 "so 0x%llx [%d,%d] defunct by force\n", __func__,
6854 proc_selfpid(), proc_best_name(current_proc()),
6855 proc_pid(p), proc_best_name(p), level,
6856 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6857 SOCK_DOM(so), SOCK_TYPE(so));
6858 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6859 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6860 struct ifnet *ifp = inp->inp_last_outifp;
6861
6862 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6863 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6864 } else if (so->so_flags & SOF_DELEGATED) {
6865 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6866 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6867 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6868 } else if (noforce) {
6869 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6870
6871 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6872 so->so_extended_bk_start = net_uptime();
6873 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6874
6875 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6876
6877 err = EOPNOTSUPP;
6878 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6879 "level %d) extend bk idle so 0x%llx rcv hw %d "
6880 "cc %d\n",
6881 __func__, proc_selfpid(),
6882 proc_best_name(current_proc()), proc_pid(p),
6883 proc_best_name(p), level,
6884 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6885 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6886 return (err);
6887 } else {
6888 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6889 }
6890 }
6891
6892 so->so_flags |= SOF_DEFUNCT;
6893
6894 /* Prevent further data from being appended to the socket buffers */
6895 snd->sb_flags |= SB_DROP;
6896 rcv->sb_flags |= SB_DROP;
6897
6898 /* Flush any existing data in the socket buffers */
6899 if (rcv->sb_cc != 0) {
6900 rcv->sb_flags &= ~SB_SEL;
6901 selthreadclear(&rcv->sb_sel);
6902 sbrelease(rcv);
6903 }
6904 if (snd->sb_cc != 0) {
6905 snd->sb_flags &= ~SB_SEL;
6906 selthreadclear(&snd->sb_sel);
6907 sbrelease(snd);
6908 }
6909
6910 done:
6911 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6912 "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6913 proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6914 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6915 SOCK_TYPE(so), defunct ? "is already" : "marked as",
6916 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6917
6918 return (err);
6919 }
6920
6921 int
6922 sodefunct(struct proc *p, struct socket *so, int level)
6923 {
6924 struct sockbuf *rcv, *snd;
6925
6926 if (!(so->so_flags & SOF_DEFUNCT)) {
6927 panic("%s improperly called", __func__);
6928 /* NOTREACHED */
6929 }
6930 if (so->so_state & SS_DEFUNCT)
6931 goto done;
6932
6933 rcv = &so->so_rcv;
6934 snd = &so->so_snd;
6935
6936 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6937 char s[MAX_IPv6_STR_LEN];
6938 char d[MAX_IPv6_STR_LEN];
6939 struct inpcb *inp = sotoinpcb(so);
6940
6941 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6942 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6943 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6944 __func__, proc_selfpid(), proc_best_name(current_proc()),
6945 proc_pid(p), proc_best_name(p), level,
6946 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6947 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6948 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6949 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6950 s, sizeof (s)), ntohs(inp->in6p_lport),
6951 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6952 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6953 d, sizeof (d)), ntohs(inp->in6p_fport),
6954 (uint32_t)rcv->sb_sel.si_flags,
6955 (uint32_t)snd->sb_sel.si_flags,
6956 rcv->sb_flags, snd->sb_flags);
6957 } else {
6958 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6959 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6960 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6961 proc_selfpid(), proc_best_name(current_proc()),
6962 proc_pid(p), proc_best_name(p), level,
6963 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6964 SOCK_DOM(so), SOCK_TYPE(so),
6965 (uint32_t)rcv->sb_sel.si_flags,
6966 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6967 snd->sb_flags);
6968 }
6969
6970 /*
6971 * Unwedge threads blocked on sbwait() and sb_lock().
6972 */
6973 sbwakeup(rcv);
6974 sbwakeup(snd);
6975
6976 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6977 if (rcv->sb_flags & SB_LOCK)
6978 sbunlock(rcv, TRUE); /* keep socket locked */
6979 if (snd->sb_flags & SB_LOCK)
6980 sbunlock(snd, TRUE); /* keep socket locked */
6981
6982 /*
6983 * Flush the buffers and disconnect. We explicitly call shutdown
6984 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6985 * states are set for the socket. This would also flush out data
6986 * hanging off the receive list of this socket.
6987 */
6988 (void) soshutdownlock_final(so, SHUT_RD);
6989 (void) soshutdownlock_final(so, SHUT_WR);
6990 (void) sodisconnectlocked(so);
6991
6992 /*
6993 * Explicitly handle connectionless-protocol disconnection
6994 * and release any remaining data in the socket buffers.
6995 */
6996 if (!(so->so_state & SS_ISDISCONNECTED))
6997 (void) soisdisconnected(so);
6998
6999 if (so->so_error == 0)
7000 so->so_error = EBADF;
7001
7002 if (rcv->sb_cc != 0) {
7003 rcv->sb_flags &= ~SB_SEL;
7004 selthreadclear(&rcv->sb_sel);
7005 sbrelease(rcv);
7006 }
7007 if (snd->sb_cc != 0) {
7008 snd->sb_flags &= ~SB_SEL;
7009 selthreadclear(&snd->sb_sel);
7010 sbrelease(snd);
7011 }
7012 so->so_state |= SS_DEFUNCT;
7013 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7014
7015 done:
7016 return (0);
7017 }
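
/*
 * The defunct path is two-phased: sosetdefunct() checks eligibility and
 * marks SOF_DEFUNCT, then sodefunct() tears down the buffers and the
 * connection.  so_stop_extended_bk_idle() below uses exactly this
 * pattern; condensed:
 *
 *	sosetdefunct(current_proc(), so, level, FALSE);	// mark, forced
 *	if (so->so_flags & SOF_DEFUNCT)
 *		sodefunct(current_proc(), so, level);	// flush and disconnect
 */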
7018
7019 int
7020 soresume(struct proc *p, struct socket *so, int locked)
7021 {
7022 if (locked == 0)
7023 socket_lock(so, 1);
7024
7025 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7026 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7027 "[%d,%d] resumed from bk idle\n",
7028 __func__, proc_selfpid(), proc_best_name(current_proc()),
7029 proc_pid(p), proc_best_name(p),
7030 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7031 SOCK_DOM(so), SOCK_TYPE(so));
7032
7033 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7034 so->so_extended_bk_start = 0;
7035 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7036
7037 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7038 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7039 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7040 }
7041 if (locked == 0)
7042 socket_unlock(so, 1);
7043
7044 return (0);
7045 }
7046
7047 /*
7048 * Does not attempt to account for sockets that are delegated from
7049 * the current process
7050 */
7051 int
7052 so_set_extended_bk_idle(struct socket *so, int optval)
7053 {
7054 int error = 0;
7055
7056 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7057 SOCK_PROTO(so) != IPPROTO_TCP) {
7058 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7059 error = EOPNOTSUPP;
7060 } else if (optval == 0) {
7061 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7062
7063 soresume(current_proc(), so, 1);
7064 } else {
7065 struct proc *p = current_proc();
7066 int i;
7067 struct filedesc *fdp;
7068 int count = 0;
7069
7070 /*
7071 		 * Unlock the socket to avoid a lock ordering issue with
7072 * the proc fd table lock
7073 */
7074 socket_unlock(so, 0);
7075
7076 proc_fdlock(p);
7077
7078 fdp = p->p_fd;
7079 for (i = 0; i < fdp->fd_nfiles; i++) {
7080 struct fileproc *fp = fdp->fd_ofiles[i];
7081 struct socket *so2;
7082
7083 if (fp == NULL ||
7084 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7085 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7086 continue;
7087
7088 so2 = (struct socket *)fp->f_fglob->fg_data;
7089 if (so != so2 &&
7090 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7091 count++;
7092 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7093 break;
7094 }
7095 proc_fdunlock(p);
7096
7097 socket_lock(so, 0);
7098
7099 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7100 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7101 error = EBUSY;
7102 } else if (so->so_flags & SOF_DELEGATED) {
7103 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7104 error = EBUSY;
7105 } else {
7106 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7107 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7108 }
7109 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7110 "%s marked for extended bk idle\n",
7111 __func__, proc_selfpid(), proc_best_name(current_proc()),
7112 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7113 SOCK_DOM(so), SOCK_TYPE(so),
7114 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7115 "is" : "not");
7116 }
7117
7118 return (error);
7119 }
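
/*
 * This routine backs a SOL_SOCKET option handled by sosetopt(); the
 * option name SO_EXTENDED_BK_IDLE used below is an assumption (a
 * private option), inferred from the soextbkidlestat counters here.
 * ("sock" is an illustrative descriptor.)
 *
 *	int on = 1;
 *	setsockopt(sock, SOL_SOCKET, SO_EXTENDED_BK_IDLE, &on, sizeof (on));
 */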
7120
7121 static void
7122 so_stop_extended_bk_idle(struct socket *so)
7123 {
7124 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7125 so->so_extended_bk_start = 0;
7126
7127 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7128 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7129 /*
7130 * Force defunct
7131 */
7132 sosetdefunct(current_proc(), so,
7133 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7134 if (so->so_flags & SOF_DEFUNCT) {
7135 sodefunct(current_proc(), so,
7136 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7137 }
7138 }
7139
7140 void
7141 so_drain_extended_bk_idle(struct socket *so)
7142 {
7143 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7144 /*
7145 * Only penalize sockets that have outstanding data
7146 */
7147 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7148 so_stop_extended_bk_idle(so);
7149
7150 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7151 }
7152 }
7153 }
7154
7155 /*
7156  * The return value tells whether the socket is still in extended background idle
7157 */
7158 int
7159 so_check_extended_bk_idle_time(struct socket *so)
7160 {
7161 int ret = 1;
7162
7163 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7164 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7165 __func__, proc_selfpid(), proc_best_name(current_proc()),
7166 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7167 SOCK_DOM(so), SOCK_TYPE(so));
7168 if (net_uptime() - so->so_extended_bk_start >
7169 soextbkidlestat.so_xbkidle_time) {
7170 so_stop_extended_bk_idle(so);
7171
7172 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7173
7174 ret = 0;
7175 } else {
7176 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7177
7178 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7179 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7180 }
7181 }
7182
7183 return (ret);
7184 }
7185
7186 void
7187 resume_proc_sockets(proc_t p)
7188 {
7189 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7190 struct filedesc *fdp;
7191 int i;
7192
7193 proc_fdlock(p);
7194 fdp = p->p_fd;
7195 for (i = 0; i < fdp->fd_nfiles; i++) {
7196 struct fileproc *fp;
7197 struct socket *so;
7198
7199 fp = fdp->fd_ofiles[i];
7200 if (fp == NULL ||
7201 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7202 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7203 continue;
7204
7205 so = (struct socket *)fp->f_fglob->fg_data;
7206 (void) soresume(p, so, 0);
7207 }
7208 proc_fdunlock(p);
7209
7210 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7211 }
7212 }
7213
7214 __private_extern__ int
7215 so_set_recv_anyif(struct socket *so, int optval)
7216 {
7217 int ret = 0;
7218
7219 #if INET6
7220 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7221 #else
7222 if (SOCK_DOM(so) == PF_INET) {
7223 #endif /* !INET6 */
7224 if (optval)
7225 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7226 else
7227 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7228 }
7229
7230
7231 return (ret);
7232 }
7233
7234 __private_extern__ int
7235 so_get_recv_anyif(struct socket *so)
7236 {
7237 int ret = 0;
7238
7239 #if INET6
7240 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7241 #else
7242 if (SOCK_DOM(so) == PF_INET) {
7243 #endif /* !INET6 */
7244 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7245 }
7246
7247 return (ret);
7248 }
7249
7250 int
7251 so_set_restrictions(struct socket *so, uint32_t vals)
7252 {
7253 int nocell_old, nocell_new;
7254 int noexpensive_old, noexpensive_new;
7255
7256 /*
7257 * Deny-type restrictions are trapdoors; once set they cannot be
7258 * unset for the lifetime of the socket. This allows them to be
7259 * issued by a framework on behalf of the application without
7260 * having to worry that they can be undone.
7261 *
7262 	 * Note here that socket-level restrictions override any protocol-
7263 	 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
7264 	 * restriction issued on the socket has a higher precedence
7265 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7266 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7267 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7268 */
7269 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7270 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7271 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7272 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7273 SO_RESTRICT_DENY_EXPENSIVE));
7274 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7275 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7276
7277 /* we can only set, not clear restrictions */
7278 if ((nocell_new - nocell_old) == 0 &&
7279 (noexpensive_new - noexpensive_old) == 0)
7280 return (0);
7281 #if INET6
7282 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7283 #else
7284 if (SOCK_DOM(so) == PF_INET) {
7285 #endif /* !INET6 */
7286 if (nocell_new - nocell_old != 0) {
7287 /*
7288 * if deny cellular is now set, do what's needed
7289 * for INPCB
7290 */
7291 inp_set_nocellular(sotoinpcb(so));
7292 }
7293 if (noexpensive_new - noexpensive_old != 0) {
7294 inp_set_noexpensive(sotoinpcb(so));
7295 }
7296 }
7297
7298 if (SOCK_DOM(so) == PF_MULTIPATH)
7299 mptcp_set_restrictions(so);
7300
7301 return (0);
7302 }
7303
7304 uint32_t
7305 so_get_restrictions(struct socket *so)
7306 {
7307 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7308 SO_RESTRICT_DENY_OUT |
7309 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7310 }
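
/*
 * The restriction bits are normally installed through a SOL_SOCKET
 * option routed to so_set_restrictions(); the option name
 * SO_RESTRICTIONS below is an assumption (a private option), while the
 * SO_RESTRICT_* flag values come from this file.  ("sock" is
 * illustrative.)
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;
 *	setsockopt(sock, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof (deny));
 */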
7311
7312 int
7313 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7314 {
7315 struct proc *ep = PROC_NULL;
7316 int error = 0;
7317
7318 /* pid 0 is reserved for kernel */
7319 if (epid == 0) {
7320 error = EINVAL;
7321 goto done;
7322 }
7323
7324 /*
7325 * If this is an in-kernel socket, prevent its delegate
7326 * association from changing unless the socket option is
7327 * coming from within the kernel itself.
7328 */
7329 if (so->last_pid == 0 && p != kernproc) {
7330 error = EACCES;
7331 goto done;
7332 }
7333
7334 /*
7335 * If this is issued by a process that's recorded as the
7336 * real owner of the socket, or if the pid is the same as
7337 * the process's own pid, then proceed. Otherwise ensure
7338 * that the issuing process has the necessary privileges.
7339 */
7340 if (epid != so->last_pid || epid != proc_pid(p)) {
7341 if ((error = priv_check_cred(kauth_cred_get(),
7342 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7343 error = EACCES;
7344 goto done;
7345 }
7346 }
7347
7348 /* Find the process that corresponds to the effective pid */
7349 if ((ep = proc_find(epid)) == PROC_NULL) {
7350 error = ESRCH;
7351 goto done;
7352 }
7353
7354 /*
7355 * If a process tries to delegate the socket to itself, then
7356 * there's really nothing to do; treat it as a way for the
7357 * delegate association to be cleared. Note that we check
7358 * the passed-in proc rather than calling proc_selfpid(),
7359 * as we need to check the process issuing the socket option
7360 * which could be kernproc. Given that we don't allow 0 for
7361 * effective pid, it means that a delegated in-kernel socket
7362 * stays delegated during its lifetime (which is probably OK.)
7363 */
7364 if (epid == proc_pid(p)) {
7365 so->so_flags &= ~SOF_DELEGATED;
7366 so->e_upid = 0;
7367 so->e_pid = 0;
7368 uuid_clear(so->e_uuid);
7369 } else {
7370 so->so_flags |= SOF_DELEGATED;
7371 so->e_upid = proc_uniqueid(ep);
7372 so->e_pid = proc_pid(ep);
7373 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7374 }
7375 done:
7376 if (error == 0 && net_io_policy_log) {
7377 uuid_string_t buf;
7378
7379 uuid_unparse(so->e_uuid, buf);
7380 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7381 "euuid %s%s\n", __func__, proc_name_address(p),
7382 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7383 SOCK_DOM(so), SOCK_TYPE(so),
7384 so->e_pid, proc_name_address(ep), buf,
7385 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7386 } else if (error != 0 && net_io_policy_log) {
7387 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7388 "ERROR (%d)\n", __func__, proc_name_address(p),
7389 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7390 SOCK_DOM(so), SOCK_TYPE(so),
7391 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7392 proc_name_address(ep), error);
7393 }
7394
7395 /* Update this socket's policy upon success */
7396 if (error == 0) {
7397 so->so_policy_gencnt *= -1;
7398 so_update_policy(so);
7399 #if NECP
7400 so_update_necp_policy(so, NULL, NULL);
7401 #endif /* NECP */
7402 }
7403
7404 if (ep != PROC_NULL)
7405 proc_rele(ep);
7406
7407 return (error);
7408 }
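
/*
 * Delegation is requested through SOL_SOCKET options that land in
 * so_set_effective_pid() / so_set_effective_uuid(); the option names
 * SO_DELEGATED and SO_DELEGATED_UUID are assumptions (private options).
 * Unless a process delegates to itself, it needs the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege checked above.
 * ("sock" and target_pid are illustrative.)
 *
 *	pid_t epid = target_pid;			// hypothetical target
 *	setsockopt(sock, SOL_SOCKET, SO_DELEGATED, &epid, sizeof (epid));
 */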
7409
7410 int
7411 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7412 {
7413 uuid_string_t buf;
7414 uuid_t uuid;
7415 int error = 0;
7416
7417 /* UUID must not be all-zeroes (reserved for kernel) */
7418 if (uuid_is_null(euuid)) {
7419 error = EINVAL;
7420 goto done;
7421 }
7422
7423 /*
7424 * If this is an in-kernel socket, prevent its delegate
7425 * association from changing unless the socket option is
7426 * coming from within the kernel itself.
7427 */
7428 if (so->last_pid == 0 && p != kernproc) {
7429 error = EACCES;
7430 goto done;
7431 }
7432
7433 /* Get the UUID of the issuing process */
7434 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7435
7436 /*
7437 * If this is issued by a process that's recorded as the
7438 * real owner of the socket, or if the uuid is the same as
7439 * the process's own uuid, then proceed. Otherwise ensure
7440 * that the issuing process has the necessary privileges.
7441 */
7442 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7443 uuid_compare(euuid, uuid) != 0) {
7444 if ((error = priv_check_cred(kauth_cred_get(),
7445 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7446 error = EACCES;
7447 goto done;
7448 }
7449 }
7450
7451 /*
7452 * If a process tries to delegate the socket to itself, then
7453 * there's really nothing to do; treat it as a way for the
7454 * delegate association to be cleared. Note that we check
7455 * the uuid of the passed-in proc rather than that of the
7456 * current process, as we need to check the process issuing
7457 * the socket option which could be kernproc itself. Given
7458 * that we don't allow 0 for effective uuid, it means that
7459 * a delegated in-kernel socket stays delegated during its
7460 * lifetime (which is okay.)
7461 */
7462 if (uuid_compare(euuid, uuid) == 0) {
7463 so->so_flags &= ~SOF_DELEGATED;
7464 so->e_upid = 0;
7465 so->e_pid = 0;
7466 uuid_clear(so->e_uuid);
7467 } else {
7468 so->so_flags |= SOF_DELEGATED;
7469 /*
7470 * Unlike so_set_effective_pid(), we only have the UUID
7471 * here and the process ID is not known. Inherit the
7472 * real {pid,upid} of the socket.
7473 */
7474 so->e_upid = so->last_upid;
7475 so->e_pid = so->last_pid;
7476 uuid_copy(so->e_uuid, euuid);
7477 }
7478
7479 done:
7480 if (error == 0 && net_io_policy_log) {
7481 uuid_unparse(so->e_uuid, buf);
7482 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7483 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7484 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7485 SOCK_TYPE(so), so->e_pid, buf,
7486 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7487 } else if (error != 0 && net_io_policy_log) {
7488 uuid_unparse(euuid, buf);
7489 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7490 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7491 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7492 SOCK_TYPE(so), buf, error);
7493 }
7494
7495 /* Update this socket's policy upon success */
7496 if (error == 0) {
7497 so->so_policy_gencnt *= -1;
7498 so_update_policy(so);
7499 #if NECP
7500 so_update_necp_policy(so, NULL, NULL);
7501 #endif /* NECP */
7502 }
7503
7504 return (error);
7505 }
7506
7507 void
7508 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7509 uint32_t ev_datalen)
7510 {
7511 struct kev_msg ev_msg;
7512
7513 /*
7514 * A netpolicy event always starts with a netpolicy_event_data
7515 * structure, but the caller can provide for a longer event
7516 * structure to post, depending on the event code.
7517 */
7518 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7519
7520 bzero(&ev_msg, sizeof (ev_msg));
7521 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7522 ev_msg.kev_class = KEV_NETWORK_CLASS;
7523 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7524 ev_msg.event_code = ev_code;
7525
7526 ev_msg.dv[0].data_ptr = ev_data;
7527 ev_msg.dv[0].data_length = ev_datalen;
7528
7529 kev_post_msg(&ev_msg);
7530 }
7531
7532 void
7533 socket_post_kev_msg(uint32_t ev_code,
7534 struct kev_socket_event_data *ev_data,
7535 uint32_t ev_datalen)
7536 {
7537 struct kev_msg ev_msg;
7538
7539 bzero(&ev_msg, sizeof(ev_msg));
7540 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7541 ev_msg.kev_class = KEV_NETWORK_CLASS;
7542 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7543 ev_msg.event_code = ev_code;
7544
7545 ev_msg.dv[0].data_ptr = ev_data;
7546 	ev_msg.dv[0].data_length = ev_datalen;
7547
7548 kev_post_msg(&ev_msg);
7549 }
7550
7551 void
7552 socket_post_kev_msg_closed(struct socket *so)
7553 {
7554 struct kev_socket_closed ev;
7555 struct sockaddr *socksa = NULL, *peersa = NULL;
7556 int err;
7557 bzero(&ev, sizeof(ev));
7558 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7559 if (err == 0) {
7560 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7561 &peersa);
7562 if (err == 0) {
7563 memcpy(&ev.ev_data.kev_sockname, socksa,
7564 min(socksa->sa_len,
7565 sizeof (ev.ev_data.kev_sockname)));
7566 memcpy(&ev.ev_data.kev_peername, peersa,
7567 min(peersa->sa_len,
7568 sizeof (ev.ev_data.kev_peername)));
7569 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7570 &ev.ev_data, sizeof (ev));
7571 }
7572 }
7573 if (socksa != NULL)
7574 FREE(socksa, M_SONAME);
7575 if (peersa != NULL)
7576 FREE(peersa, M_SONAME);
7577 }
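
/*
 * A hedged sketch of consuming the KEV_SOCKET_CLOSED event posted
 * above, assuming the kernel event socket interface declared in
 * <sys/kern_event.h> (PF_SYSTEM/SYSPROTO_EVENT plus a SIOCSKEVFILT
 * subscription):
 *
 *	int evfd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	ioctl(evfd, SIOCSKEVFILT, &req);
 *	// subsequent read(2)s return struct kern_event_msg records
 */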