apple/xnu (xnu-4570.1.46) - bsd/kern/uipc_socket.c
1 /*
2 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/tcp_var.h>
108 #include <netinet/ip6.h>
109 #include <netinet6/ip6_var.h>
110 #include <netinet/flow_divert.h>
111 #include <kern/zalloc.h>
112 #include <kern/locks.h>
113 #include <machine/limits.h>
114 #include <libkern/OSAtomic.h>
115 #include <pexpert/pexpert.h>
116 #include <kern/assert.h>
117 #include <kern/task.h>
118 #include <kern/policy_internal.h>
119
120 #include <sys/kpi_mbuf.h>
121 #include <sys/mcache.h>
122 #include <sys/unpcb.h>
123 #include <libkern/section_keywords.h>
124
125 #if CONFIG_MACF
126 #include <security/mac_framework.h>
127 #endif /* MAC */
128
129 #if MULTIPATH
130 #include <netinet/mp_pcb.h>
131 #include <netinet/mptcp_var.h>
132 #endif /* MULTIPATH */
133
134 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
135
136 #if DEBUG || DEVELOPMENT
137 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
138 #else
139 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
140 #endif
141
142 /* TODO: this should be in a header file somewhere */
143 extern char *proc_name_address(void *p);
144 extern char *proc_best_name(proc_t);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in a header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof (socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
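/*
 * Allocate a socket from the socket-layer cache (used by soalloc() for
 * PF_INET/SOCK_STREAM sockets).  A cached entry is reused when one is
 * available; otherwise a fresh block is carved out of so_cache_zone,
 * large enough to also hold the saved inpcb and tcpcb at longword-aligned
 * offsets (so_saved_pcb, inp_saved_ppcb).  The socket is then marked
 * SOF1_CACHED_IN_SOCK_LAYER so cached_sock_free() can recycle it.
 */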
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof (struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477
478 lck_mtx_unlock(so_cache_mtx);
479
480 if (waitok)
481 *so = (struct socket *)zalloc(so_cache_zone);
482 else
483 *so = (struct socket *)zalloc_noblock(so_cache_zone);
484
485 if (*so == NULL)
486 return;
487
488 bzero((caddr_t)*so, sizeof (struct socket));
489
490 /*
491 * Define offsets for extra structures into our
492 * single block of memory. Align extra structures
493 * on longword boundaries.
494 */
495
496 offset = (uintptr_t)*so;
497 offset += sizeof (struct socket);
498
499 offset = ALIGN(offset);
500
501 (*so)->so_saved_pcb = (caddr_t)offset;
502 offset += get_inpcb_str_size();
503
504 offset = ALIGN(offset);
505
506 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
507 (caddr_t)offset;
508 }
509
510 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
511 }
512
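/*
 * Return a socket allocated by cached_sock_alloc() to the cache, or free
 * it back to so_cache_zone if the cache already holds max_cached_sock_count
 * entries.  Cached entries are timestamped so that so_cache_timer() can
 * reap the ones that have sat unused for too long.
 */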
513 static void
514 cached_sock_free(struct socket *so)
515 {
516
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count)
526 so_cache_hw = cached_sock_count;
527
528 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
529
530 so->cache_timestamp = so_cache_time;
531 lck_mtx_unlock(so_cache_mtx);
532 }
533 }
534
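/*
 * Record the process that most recently used the socket (pid, unique pid
 * and executable UUID).  Sockets created through sock_socket() keep
 * last_pid == 0 and are intentionally left untouched here.
 */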
535 void
536 so_update_last_owner_locked(struct socket *so, proc_t self)
537 {
538 if (so->last_pid != 0) {
539 /*
540 * last_pid and last_upid should remain zero for sockets
541 * created using sock_socket. The check above achieves that
542 */
543 if (self == PROC_NULL)
544 self = current_proc();
545
546 if (so->last_upid != proc_uniqueid(self) ||
547 so->last_pid != proc_pid(self)) {
548 so->last_upid = proc_uniqueid(self);
549 so->last_pid = proc_pid(self);
550 proc_getexecutableuuid(self, so->last_uuid,
551 sizeof (so->last_uuid));
552 }
553 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
554 }
555 }
556
557 void
558 so_update_policy(struct socket *so)
559 {
560 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
561 (void) inp_update_policy(sotoinpcb(so));
562 }
563
564 #if NECP
565 static void
566 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
567 struct sockaddr *override_remote_addr)
568 {
569 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
570 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
571 override_remote_addr, 0);
572 }
573 #endif /* NECP */
574
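/*
 * Periodic reaper for the socket cache: frees cached sockets that have
 * been idle longer than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation.  Returns TRUE when entries
 * remain so the caller can reschedule the timer.
 */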
575 boolean_t
576 so_cache_timer(void)
577 {
578 struct socket *p;
579 int n_freed = 0;
580 boolean_t rc = FALSE;
581
582 lck_mtx_lock(so_cache_mtx);
583 so_cache_timeouts++;
584 so_cache_time = net_uptime();
585
586 while (!STAILQ_EMPTY(&so_cache_head)) {
587 VERIFY(cached_sock_count > 0);
588 p = STAILQ_FIRST(&so_cache_head);
589 if ((so_cache_time - p->cache_timestamp) <
590 SO_CACHE_TIME_LIMIT)
591 break;
592
593 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
594 --cached_sock_count;
595
596 zfree(so_cache_zone, p);
597
598 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
599 so_cache_max_freed++;
600 break;
601 }
602 }
603
604 /* Schedule again if there is more to cleanup */
605 if (!STAILQ_EMPTY(&so_cache_head))
606 rc = TRUE;
607
608 lck_mtx_unlock(so_cache_mtx);
609 return (rc);
610 }
611
612 /*
613 * Get a socket structure from our zone, and initialize it.
614 * We don't implement `waitok' yet (see comments in uipc_domain.c).
615 * Note that it would probably be better to allocate socket
616 * and PCB at the same time, but I'm not convinced that all
617 * the protocols can be easily modified to do this.
618 */
619 struct socket *
620 soalloc(int waitok, int dom, int type)
621 {
622 struct socket *so;
623
624 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
625 cached_sock_alloc(&so, waitok);
626 } else {
627 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
628 M_WAITOK);
629 if (so != NULL)
630 bzero(so, sizeof (*so));
631 }
632 if (so != NULL) {
633 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
634 so->so_zone = socket_zone;
635
636 /*
637 * Increment the socket allocation statistics
638 */
639 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
640
641 #if CONFIG_MACF_SOCKET
642 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
643 if (mac_socket_label_init(so, !waitok) != 0) {
644 sodealloc(so);
645 return (NULL);
646 }
647 #endif /* MAC_SOCKET */
648 }
649
650 return (so);
651 }
652
653 int
654 socreate_internal(int dom, struct socket **aso, int type, int proto,
655 struct proc *p, uint32_t flags, struct proc *ep)
656 {
657 struct protosw *prp;
658 struct socket *so;
659 int error = 0;
660
661 #if TCPDEBUG
662 extern int tcpconsdebug;
663 #endif
664
665 VERIFY(aso != NULL);
666 *aso = NULL;
667
668 if (proto != 0)
669 prp = pffindproto(dom, proto, type);
670 else
671 prp = pffindtype(dom, type);
672
673 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
674 if (pffinddomain(dom) == NULL)
675 return (EAFNOSUPPORT);
676 if (proto != 0) {
677 if (pffindprotonotype(dom, proto) != NULL)
678 return (EPROTOTYPE);
679 }
680 return (EPROTONOSUPPORT);
681 }
682 if (prp->pr_type != type)
683 return (EPROTOTYPE);
684 so = soalloc(1, dom, type);
685 if (so == NULL)
686 return (ENOBUFS);
687
688 switch (dom) {
689 case PF_LOCAL:
690 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
691 break;
692 case PF_INET:
693 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
694 if (type == SOCK_STREAM) {
695 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
696 } else {
697 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
698 }
699 break;
700 case PF_ROUTE:
701 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
702 break;
703 case PF_NDRV:
704 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
705 break;
706 case PF_KEY:
707 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
708 break;
709 case PF_INET6:
710 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
711 if (type == SOCK_STREAM) {
712 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
713 } else {
714 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
715 }
716 break;
717 case PF_SYSTEM:
718 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
719 break;
720 case PF_MULTIPATH:
721 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
722 break;
723 default:
724 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
725 break;
726 }
727
728 if (flags & SOCF_ASYNC)
729 so->so_state |= SS_NBIO;
730
731 TAILQ_INIT(&so->so_incomp);
732 TAILQ_INIT(&so->so_comp);
733 so->so_type = type;
734 so->last_upid = proc_uniqueid(p);
735 so->last_pid = proc_pid(p);
736 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
737 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
738
739 if (ep != PROC_NULL && ep != p) {
740 so->e_upid = proc_uniqueid(ep);
741 so->e_pid = proc_pid(ep);
742 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
743 so->so_flags |= SOF_DELEGATED;
744 }
745
746 so->so_cred = kauth_cred_proc_ref(p);
747 if (!suser(kauth_cred_get(), NULL))
748 so->so_state |= SS_PRIV;
749
750 so->so_proto = prp;
751 so->so_rcv.sb_flags |= SB_RECV;
752 so->so_rcv.sb_so = so->so_snd.sb_so = so;
753 so->next_lock_lr = 0;
754 so->next_unlock_lr = 0;
755
756 #if CONFIG_MACF_SOCKET
757 mac_socket_label_associate(kauth_cred_get(), so);
758 #endif /* MAC_SOCKET */
759
760 /*
761 * Attachment will create the per-pcb lock if necessary and
762 * increase the refcount for creation; make sure this is done before
763 * the socket is inserted in any lists.
764 */
765 so->so_usecount++;
766
767 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
768 if (error != 0) {
769 /*
770 * Warning:
771 * If so_pcb is not zero, the socket will be leaked,
772 * so the protocol attachment handler must be coded carefully.
773 */
774 so->so_state |= SS_NOFDREF;
775 VERIFY(so->so_usecount > 0);
776 so->so_usecount--;
777 sofreelastref(so, 1); /* will deallocate the socket */
778 return (error);
779 }
780
781 atomic_add_32(&prp->pr_domain->dom_refs, 1);
782 TAILQ_INIT(&so->so_evlist);
783
784 /* Attach socket filters for this protocol */
785 sflt_initsock(so);
786 #if TCPDEBUG
787 if (tcpconsdebug == 2)
788 so->so_options |= SO_DEBUG;
789 #endif
790 so_set_default_traffic_class(so);
791
792 /*
793 * If this thread or task is marked to create backgrounded sockets,
794 * mark the socket as background.
795 */
796 if (proc_get_effective_thread_policy(current_thread(),
797 TASK_POLICY_NEW_SOCKETS_BG)) {
798 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
799 so->so_background_thread = current_thread();
800 }
801
802 switch (dom) {
803 /*
804 * Don't mark Unix domain, system or multipath sockets as
805 * eligible for defunct by default.
806 */
807 case PF_LOCAL:
808 case PF_SYSTEM:
809 case PF_MULTIPATH:
810 so->so_flags |= SOF_NODEFUNCT;
811 break;
812 default:
813 break;
814 }
815
816 /*
817 * Entitlements can't be checked at socket creation time except if the
818 * application requested a feature guarded by a privilege (cf. socket
819 * delegation).
820 * The priv(9) and the Sandboxing APIs are designed with the idea that
821 * a privilege check should only be triggered by a userland request.
822 * A privilege check at socket creation time is time-consuming and
823 * could trigger many authorisation error messages from the security
824 * APIs.
825 */
826
827 *aso = so;
828
829 return (0);
830 }
831
832 /*
833 * Returns: 0 Success
834 * EAFNOSUPPORT
835 * EPROTOTYPE
836 * EPROTONOSUPPORT
837 * ENOBUFS
838 * <pru_attach>:ENOBUFS[AF_UNIX]
839 * <pru_attach>:ENOBUFS[TCP]
840 * <pru_attach>:ENOMEM[TCP]
841 * <pru_attach>:??? [other protocol families, IPSEC]
842 */
843 int
844 socreate(int dom, struct socket **aso, int type, int proto)
845 {
846 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
847 PROC_NULL));
848 }
849
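/*
 * Variant of socreate() used when the socket is created on behalf of
 * another process (epid); the delegate process is looked up and passed
 * to socreate_internal() so the socket records its identity in the
 * e_pid/e_upid/e_uuid fields and is marked SOF_DELEGATED.
 */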
850 int
851 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
852 {
853 int error = 0;
854 struct proc *ep = PROC_NULL;
855
856 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
857 error = ESRCH;
858 goto done;
859 }
860
861 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
862
863 /*
864 * It might not be wise to hold the proc reference when calling
865 * socreate_internal since it calls soalloc with M_WAITOK
866 */
867 done:
868 if (ep != PROC_NULL)
869 proc_rele(ep);
870
871 return (error);
872 }
873
874 /*
875 * Returns: 0 Success
876 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
877 * <pru_bind>:EAFNOSUPPORT Address family not supported
878 * <pru_bind>:EADDRNOTAVAIL Address not available.
879 * <pru_bind>:EINVAL Invalid argument
880 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
881 * <pru_bind>:EACCES Permission denied
882 * <pru_bind>:EADDRINUSE Address in use
883 * <pru_bind>:EAGAIN Resource unavailable, try again
884 * <pru_bind>:EPERM Operation not permitted
885 * <pru_bind>:???
886 * <sf_bind>:???
887 *
888 * Notes: It's not possible to fully enumerate the return codes above,
889 * since socket filter authors and protocol family authors may
890 * not choose to limit their error returns to those listed, even
891 * though this may result in some software operating incorrectly.
892 *
893 * The error codes which are enumerated above are those known to
894 * be returned by the tcp_usr_bind function supplied.
895 */
896 int
897 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
898 {
899 struct proc *p = current_proc();
900 int error = 0;
901
902 if (dolock)
903 socket_lock(so, 1);
904
905 so_update_last_owner_locked(so, p);
906 so_update_policy(so);
907
908 #if NECP
909 so_update_necp_policy(so, nam, NULL);
910 #endif /* NECP */
911
912 /*
913 * If this is a bind request on a socket that has been marked
914 * as inactive, reject it now before we go any further.
915 */
916 if (so->so_flags & SOF_DEFUNCT) {
917 error = EINVAL;
918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
919 __func__, proc_pid(p), proc_best_name(p),
920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
921 SOCK_DOM(so), SOCK_TYPE(so), error);
922 goto out;
923 }
924
925 /* Socket filter */
926 error = sflt_bind(so, nam);
927
928 if (error == 0)
929 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
930 out:
931 if (dolock)
932 socket_unlock(so, 1);
933
934 if (error == EJUSTRETURN)
935 error = 0;
936
937 return (error);
938 }
939
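/*
 * Final teardown of a socket structure: release the credential reference,
 * detach any socket filters and content filters, free per-socket message
 * queue state, then hand the memory back to the socket cache or to the
 * socket zone it came from.
 */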
940 void
941 sodealloc(struct socket *so)
942 {
943 kauth_cred_unref(&so->so_cred);
944
945 /* Remove any filters */
946 sflt_termsock(so);
947
948 #if CONTENT_FILTER
949 cfil_sock_detach(so);
950 #endif /* CONTENT_FILTER */
951
952 /* Delete the state allocated for msg queues on a socket */
953 if (so->so_flags & SOF_ENABLE_MSGS) {
954 FREE(so->so_msg_state, M_TEMP);
955 so->so_msg_state = NULL;
956 }
957 VERIFY(so->so_msg_state == NULL);
958
959 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
960
961 #if CONFIG_MACF_SOCKET
962 mac_socket_label_destroy(so);
963 #endif /* MAC_SOCKET */
964
965 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
966 cached_sock_free(so);
967 } else {
968 FREE_ZONE(so, sizeof (*so), so->so_zone);
969 }
970 }
971
972 /*
973 * Returns: 0 Success
974 * EINVAL
975 * EOPNOTSUPP
976 * <pru_listen>:EINVAL[AF_UNIX]
977 * <pru_listen>:EINVAL[TCP]
978 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
979 * <pru_listen>:EINVAL[TCP] Invalid argument
980 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
981 * <pru_listen>:EACCES[TCP] Permission denied
982 * <pru_listen>:EADDRINUSE[TCP] Address in use
983 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
984 * <pru_listen>:EPERM[TCP] Operation not permitted
985 * <sf_listen>:???
986 *
987 * Notes: Other <pru_listen> returns depend on the protocol family; all
988 * <sf_listen> returns depend on what the filter author causes
989 * their filter to return.
990 */
991 int
992 solisten(struct socket *so, int backlog)
993 {
994 struct proc *p = current_proc();
995 int error = 0;
996
997 socket_lock(so, 1);
998
999 so_update_last_owner_locked(so, p);
1000 so_update_policy(so);
1001
1002 #if NECP
1003 so_update_necp_policy(so, NULL, NULL);
1004 #endif /* NECP */
1005
1006 if (so->so_proto == NULL) {
1007 error = EINVAL;
1008 goto out;
1009 }
1010 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1011 error = EOPNOTSUPP;
1012 goto out;
1013 }
1014
1015 /*
1016 * If the listen request is made on a socket that is not fully
1017 * disconnected, or on a socket that has been marked as inactive,
1018 * reject the request now.
1019 */
1020 if ((so->so_state &
1021 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1022 (so->so_flags & SOF_DEFUNCT)) {
1023 error = EINVAL;
1024 if (so->so_flags & SOF_DEFUNCT) {
1025 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1026 "(%d)\n", __func__, proc_pid(p),
1027 proc_best_name(p),
1028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1029 SOCK_DOM(so), SOCK_TYPE(so), error);
1030 }
1031 goto out;
1032 }
1033
1034 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1035 error = EPERM;
1036 goto out;
1037 }
1038
1039 error = sflt_listen(so);
1040 if (error == 0)
1041 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1042
1043 if (error) {
1044 if (error == EJUSTRETURN)
1045 error = 0;
1046 goto out;
1047 }
1048
1049 if (TAILQ_EMPTY(&so->so_comp))
1050 so->so_options |= SO_ACCEPTCONN;
1051 /*
1052 * POSIX: The implementation may have an upper limit on the length of
1053 * the listen queue, either global or per accepting socket. If backlog
1054 * exceeds this limit, the length of the listen queue is set to the
1055 * limit.
1056 *
1057 * If listen() is called with a backlog argument value that is less
1058 * than 0, the function behaves as if it had been called with a backlog
1059 * argument value of 0.
1060 *
1061 * A backlog argument of 0 may allow the socket to accept connections,
1062 * in which case the length of the listen queue may be set to an
1063 * implementation-defined minimum value.
1064 */
1065 if (backlog <= 0 || backlog > somaxconn)
1066 backlog = somaxconn;
1067
1068 so->so_qlimit = backlog;
1069 out:
1070 socket_unlock(so, 1);
1071 return (error);
1072 }
1073
1074 /*
1075 * The "accept list lock" protects the fields related to the listener queues
1076 * because we can unlock a socket to respect the lock ordering between
1077 * the listener socket and its client sockets. The lock ordering is first to
1078 * acquire the client socket before the listener socket.
1079 *
1080 * The accept list lock serializes access to the following fields:
1081 * - of the listener socket:
1082 * - so_comp
1083 * - so_incomp
1084 * - so_qlen
1085 * - so_inqlen
1086 * - of client sockets that are in so_comp or so_incomp:
1087 * - so_head
1088 * - so_list
1089 *
1090 * As one can see, the accept list lock protects the consistency of the
1091 * linkage of the client sockets.
1092 *
1093 * Note that those fields may be read without holding the accept list lock
1094 * for a preflight provided the accept list lock is taken when committing
1095 * to take an action based on the result of the preflight. The preflight
1096 * saves the cost of doing the unlock/lock dance.
1097 */
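/*
 * Illustrative sketch of the expected usage pattern (see e.g.
 * sofreelastref() below for a real call site):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);
 *	... walk or modify head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */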
1098 void
1099 so_acquire_accept_list(struct socket *head, struct socket *so)
1100 {
1101 lck_mtx_t *mutex_held;
1102
1103 if (head->so_proto->pr_getlock == NULL) {
1104 return;
1105 }
1106 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1107 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1108
1109 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1110 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1111 return;
1112 }
1113 if (so != NULL) {
1114 socket_unlock(so, 0);
1115 }
1116 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1117 so_accept_list_waits += 1;
1118 msleep((caddr_t)&head->so_incomp, mutex_held,
1119 PSOCK | PCATCH, __func__, NULL);
1120 }
1121 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1122 if (so != NULL) {
1123 socket_unlock(head, 0);
1124 socket_lock(so, 0);
1125 socket_lock(head, 0);
1126 }
1127 }
1128
1129 void
1130 so_release_accept_list(struct socket *head)
1131 {
1132 if (head->so_proto->pr_getlock != NULL) {
1133 lck_mtx_t *mutex_held;
1134
1135 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1136 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1137
1138 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1139 wakeup((caddr_t)&head->so_incomp);
1140 }
1141 }
1142
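/*
 * Drop the last reference on a socket.  If the protocol has not finished
 * clearing its pcb, or a file descriptor still references the socket,
 * only the select threads and upcall flags are cleared.  Otherwise the
 * socket is removed from its listener's incomplete queue if it is on one,
 * its buffers are flushed, and it is deallocated when 'dealloc' is set.
 * Sockets on the completed accept queue are deliberately left queued so
 * that accept(2) does not hang.
 */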
1143 void
1144 sofreelastref(struct socket *so, int dealloc)
1145 {
1146 struct socket *head = so->so_head;
1147
1148 /* Assume socket is locked */
1149
1150 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1151 selthreadclear(&so->so_snd.sb_sel);
1152 selthreadclear(&so->so_rcv.sb_sel);
1153 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1154 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1155 so->so_event = sonullevent;
1156 return;
1157 }
1158 if (head != NULL) {
1159 /*
1160 * Need to lock the listener when the protocol has
1161 * per socket locks
1162 */
1163 if (head->so_proto->pr_getlock != NULL) {
1164 socket_lock(head, 1);
1165 so_acquire_accept_list(head, so);
1166 }
1167 if (so->so_state & SS_INCOMP) {
1168 so->so_state &= ~SS_INCOMP;
1169 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1170 head->so_incqlen--;
1171 head->so_qlen--;
1172 so->so_head = NULL;
1173
1174 if (head->so_proto->pr_getlock != NULL) {
1175 so_release_accept_list(head);
1176 socket_unlock(head, 1);
1177 }
1178 } else if (so->so_state & SS_COMP) {
1179 if (head->so_proto->pr_getlock != NULL) {
1180 so_release_accept_list(head);
1181 socket_unlock(head, 1);
1182 }
1183 /*
1184 * We must not decommission a socket that's
1185 * on the accept(2) queue. If we do, then
1186 * accept(2) may hang after select(2) indicated
1187 * that the listening socket was ready.
1188 */
1189 selthreadclear(&so->so_snd.sb_sel);
1190 selthreadclear(&so->so_rcv.sb_sel);
1191 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1192 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1193 so->so_event = sonullevent;
1194 return;
1195 } else {
1196 if (head->so_proto->pr_getlock != NULL) {
1197 so_release_accept_list(head);
1198 socket_unlock(head, 1);
1199 }
1200 printf("sofree: not queued\n");
1201 }
1202 }
1203 sowflush(so);
1204 sorflush(so);
1205
1206 #if FLOW_DIVERT
1207 if (so->so_flags & SOF_FLOW_DIVERT) {
1208 flow_divert_detach(so);
1209 }
1210 #endif /* FLOW_DIVERT */
1211
1212 /* 3932268: disable upcall */
1213 so->so_rcv.sb_flags &= ~SB_UPCALL;
1214 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1215 so->so_event = sonullevent;
1216
1217 if (dealloc)
1218 sodealloc(so);
1219 }
1220
1221 void
1222 soclose_wait_locked(struct socket *so)
1223 {
1224 lck_mtx_t *mutex_held;
1225
1226 if (so->so_proto->pr_getlock != NULL)
1227 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1228 else
1229 mutex_held = so->so_proto->pr_domain->dom_mtx;
1230 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1231
1232 /*
1233 * Double check here and return if there's no outstanding upcall;
1234 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1235 */
1236 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1237 return;
1238 so->so_rcv.sb_flags &= ~SB_UPCALL;
1239 so->so_snd.sb_flags &= ~SB_UPCALL;
1240 so->so_flags |= SOF_CLOSEWAIT;
1241
1242 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1243 "soclose_wait_locked", NULL);
1244 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1245 so->so_flags &= ~SOF_CLOSEWAIT;
1246 }
1247
1248 /*
1249 * Close a socket on last file table reference removal.
1250 * Initiate disconnect if connected.
1251 * Free socket when disconnect complete.
1252 */
1253 int
1254 soclose_locked(struct socket *so)
1255 {
1256 int error = 0;
1257 struct timespec ts;
1258
1259 if (so->so_usecount == 0) {
1260 panic("soclose: so=%p refcount=0\n", so);
1261 /* NOTREACHED */
1262 }
1263
1264 sflt_notify(so, sock_evt_closing, NULL);
1265
1266 if (so->so_upcallusecount)
1267 soclose_wait_locked(so);
1268
1269 #if CONTENT_FILTER
1270 /*
1271 * We have to wait until the content filters are done
1272 */
1273 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1274 cfil_sock_close_wait(so);
1275 cfil_sock_is_closed(so);
1276 cfil_sock_detach(so);
1277 }
1278 #endif /* CONTENT_FILTER */
1279
1280 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1281 soresume(current_proc(), so, 1);
1282 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1283 }
1284
1285 if ((so->so_options & SO_ACCEPTCONN)) {
1286 struct socket *sp, *sonext;
1287 int persocklock = 0;
1288 int incomp_overflow_only;
1289
1290 /*
1291 * We do not want new connections to be added
1292 * to the connection queues
1293 */
1294 so->so_options &= ~SO_ACCEPTCONN;
1295
1296 /*
1297 * We can drop the lock on the listener once
1298 * we've acquired the incoming list
1299 */
1300 if (so->so_proto->pr_getlock != NULL) {
1301 persocklock = 1;
1302 so_acquire_accept_list(so, NULL);
1303 socket_unlock(so, 0);
1304 }
1305 again:
1306 incomp_overflow_only = 1;
1307
1308 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1309 /*
1310 * Radar 5350314
1311 * skip sockets thrown away by tcp_dropdropablreq
1312 * they will get cleaned up by the garbage collection.
1313 * otherwise, remove the incomp socket from the queue
1314 * and let soabort trigger the appropriate cleanup.
1315 */
1316 if (sp->so_flags & SOF_OVERFLOW)
1317 continue;
1318
1319 if (persocklock != 0)
1320 socket_lock(sp, 1);
1321
1322 /*
1323 * Radar 27945981
1324 * The extra reference for the list ensures the
1325 * validity of the socket pointer when we perform the
1326 * unlock of the head above
1327 */
1328 if (sp->so_state & SS_INCOMP) {
1329 sp->so_state &= ~SS_INCOMP;
1330 sp->so_head = NULL;
1331 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1332 so->so_incqlen--;
1333 so->so_qlen--;
1334
1335 (void) soabort(sp);
1336 } else {
1337 panic("%s sp %p in so_incomp but !SS_INCOMP",
1338 __func__, sp);
1339 }
1340
1341 if (persocklock != 0)
1342 socket_unlock(sp, 1);
1343 }
1344
1345 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1346 /* Dequeue from so_comp since sofree() won't do it */
1347 if (persocklock != 0)
1348 socket_lock(sp, 1);
1349
1350 if (sp->so_state & SS_COMP) {
1351 sp->so_state &= ~SS_COMP;
1352 sp->so_head = NULL;
1353 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1354 so->so_qlen--;
1355
1356 (void) soabort(sp);
1357 } else {
1358 panic("%s sp %p in so_comp but !SS_COMP",
1359 __func__, sp);
1360 }
1361
1362 if (persocklock)
1363 socket_unlock(sp, 1);
1364 }
1365
1366 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1367 #if (DEBUG|DEVELOPMENT)
1368 panic("%s head %p so_comp not empty\n", __func__, so);
1369 #endif /* (DEVELOPMENT || DEBUG) */
1370
1371 goto again;
1372 }
1373
1374 if (!TAILQ_EMPTY(&so->so_comp)) {
1375 #if (DEBUG|DEVELOPMENT)
1376 panic("%s head %p so_comp not empty\n", __func__, so);
1377 #endif /* (DEVELOPMENT || DEBUG) */
1378
1379 goto again;
1380 }
1381
1382 if (persocklock) {
1383 socket_lock(so, 0);
1384 so_release_accept_list(so);
1385 }
1386 }
1387 if (so->so_pcb == NULL) {
1388 /* 3915887: mark the socket as ready for dealloc */
1389 so->so_flags |= SOF_PCBCLEARING;
1390 goto discard;
1391 }
1392 if (so->so_state & SS_ISCONNECTED) {
1393 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1394 error = sodisconnectlocked(so);
1395 if (error)
1396 goto drop;
1397 }
1398 if (so->so_options & SO_LINGER) {
1399 lck_mtx_t *mutex_held;
1400
1401 if ((so->so_state & SS_ISDISCONNECTING) &&
1402 (so->so_state & SS_NBIO))
1403 goto drop;
1404 if (so->so_proto->pr_getlock != NULL)
1405 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1406 else
1407 mutex_held = so->so_proto->pr_domain->dom_mtx;
1408 while (so->so_state & SS_ISCONNECTED) {
1409 ts.tv_sec = (so->so_linger/100);
1410 ts.tv_nsec = (so->so_linger % 100) *
1411 NSEC_PER_USEC * 1000 * 10;
1412 error = msleep((caddr_t)&so->so_timeo,
1413 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1414 if (error) {
1415 /*
1416 * It's OK when the timer fires,
1417 * don't report an error
1418 */
1419 if (error == EWOULDBLOCK)
1420 error = 0;
1421 break;
1422 }
1423 }
1424 }
1425 }
1426 drop:
1427 if (so->so_usecount == 0) {
1428 panic("soclose: usecount is zero so=%p\n", so);
1429 /* NOTREACHED */
1430 }
1431 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1432 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1433 if (error == 0)
1434 error = error2;
1435 }
1436 if (so->so_usecount <= 0) {
1437 panic("soclose: usecount is zero so=%p\n", so);
1438 /* NOTREACHED */
1439 }
1440 discard:
1441 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1442 (so->so_state & SS_NOFDREF)) {
1443 panic("soclose: NOFDREF");
1444 /* NOTREACHED */
1445 }
1446 so->so_state |= SS_NOFDREF;
1447
1448 if ((so->so_flags & SOF_KNOTE) != 0)
1449 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1450
1451 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1452 evsofree(so);
1453
1454 VERIFY(so->so_usecount > 0);
1455 so->so_usecount--;
1456 sofree(so);
1457 return (error);
1458 }
1459
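/*
 * Locking wrapper around soclose_locked().  When the socket is being
 * retained in the kernel (so_retaincnt != 0), only the file descriptor's
 * use count is dropped and the socket itself stays around.
 */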
1460 int
1461 soclose(struct socket *so)
1462 {
1463 int error = 0;
1464 socket_lock(so, 1);
1465
1466 if (so->so_retaincnt == 0) {
1467 error = soclose_locked(so);
1468 } else {
1469 /*
1470 * if the FD is going away but the socket is
1471 * retained in the kernel, remove its reference
1472 */
1473 so->so_usecount--;
1474 if (so->so_usecount < 2)
1475 panic("soclose: retaincnt non null and so=%p "
1476 "usecount=%d\n", so, so->so_usecount);
1477 }
1478 socket_unlock(so, 1);
1479 return (error);
1480 }
1481
1482 /*
1483 * Must be called at splnet...
1484 */
1485 /* Should already be locked */
1486 int
1487 soabort(struct socket *so)
1488 {
1489 int error;
1490
1491 #ifdef MORE_LOCKING_DEBUG
1492 lck_mtx_t *mutex_held;
1493
1494 if (so->so_proto->pr_getlock != NULL)
1495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1496 else
1497 mutex_held = so->so_proto->pr_domain->dom_mtx;
1498 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1499 #endif
1500
1501 if ((so->so_flags & SOF_ABORTED) == 0) {
1502 so->so_flags |= SOF_ABORTED;
1503 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1504 if (error) {
1505 sofree(so);
1506 return (error);
1507 }
1508 }
1509 return (0);
1510 }
1511
1512 int
1513 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1514 {
1515 int error;
1516
1517 if (dolock)
1518 socket_lock(so, 1);
1519
1520 so_update_last_owner_locked(so, PROC_NULL);
1521 so_update_policy(so);
1522 #if NECP
1523 so_update_necp_policy(so, NULL, NULL);
1524 #endif /* NECP */
1525
1526 if ((so->so_state & SS_NOFDREF) == 0)
1527 panic("soaccept: !NOFDREF");
1528 so->so_state &= ~SS_NOFDREF;
1529 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1530
1531 if (dolock)
1532 socket_unlock(so, 1);
1533 return (error);
1534 }
1535
1536 int
1537 soaccept(struct socket *so, struct sockaddr **nam)
1538 {
1539 return (soacceptlock(so, nam, 1));
1540 }
1541
1542 int
1543 soacceptfilter(struct socket *so, struct socket *head)
1544 {
1545 struct sockaddr *local = NULL, *remote = NULL;
1546 int error = 0;
1547
1548 /*
1549 * Hold the lock even if this socket has not been made visible
1550 * to the filter(s). For sockets with global locks, this protects
1551 * against the head or peer going away
1552 */
1553 socket_lock(so, 1);
1554 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1555 sogetaddr_locked(so, &local, 0) != 0) {
1556 so->so_state &= ~SS_NOFDREF;
1557 socket_unlock(so, 1);
1558 soclose(so);
1559 /* Out of resources; try it again next time */
1560 error = ECONNABORTED;
1561 goto done;
1562 }
1563
1564 error = sflt_accept(head, so, local, remote);
1565
1566 /*
1567 * If we get EJUSTRETURN from one of the filters, mark this socket
1568 * as inactive and return it anyway. This newly accepted socket
1569 * will be disconnected later before we hand it off to the caller.
1570 */
1571 if (error == EJUSTRETURN) {
1572 error = 0;
1573 (void) sosetdefunct(current_proc(), so,
1574 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1575 }
1576
1577 if (error != 0) {
1578 /*
1579 * This may seem like a duplication of the above error
1580 * handling part when we return ECONNABORTED, except
1581 * the following is done while holding the lock since
1582 * the socket has been exposed to the filter(s) earlier.
1583 */
1584 so->so_state &= ~SS_NOFDREF;
1585 socket_unlock(so, 1);
1586 soclose(so);
1587 /* Propagate socket filter's error code to the caller */
1588 } else {
1589 socket_unlock(so, 1);
1590 }
1591 done:
1592 /* Callee checks for NULL pointer */
1593 sock_freeaddr(remote);
1594 sock_freeaddr(local);
1595 return (error);
1596 }
1597
1598 /*
1599 * Returns: 0 Success
1600 * EOPNOTSUPP Operation not supported on socket
1601 * EISCONN Socket is connected
1602 * <pru_connect>:EADDRNOTAVAIL Address not available.
1603 * <pru_connect>:EINVAL Invalid argument
1604 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1605 * <pru_connect>:EACCES Permission denied
1606 * <pru_connect>:EADDRINUSE Address in use
1607 * <pru_connect>:EAGAIN Resource unavailable, try again
1608 * <pru_connect>:EPERM Operation not permitted
1609 * <sf_connect_out>:??? [anything a filter writer might set]
1610 */
1611 int
1612 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1613 {
1614 int error;
1615 struct proc *p = current_proc();
1616
1617 if (dolock)
1618 socket_lock(so, 1);
1619
1620 so_update_last_owner_locked(so, p);
1621 so_update_policy(so);
1622
1623 #if NECP
1624 so_update_necp_policy(so, NULL, nam);
1625 #endif /* NECP */
1626
1627 /*
1628 * If this is a listening socket or if this is a previously-accepted
1629 * socket that has been marked as inactive, reject the connect request.
1630 */
1631 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1632 error = EOPNOTSUPP;
1633 if (so->so_flags & SOF_DEFUNCT) {
1634 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1635 "(%d)\n", __func__, proc_pid(p),
1636 proc_best_name(p),
1637 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1638 SOCK_DOM(so), SOCK_TYPE(so), error);
1639 }
1640 if (dolock)
1641 socket_unlock(so, 1);
1642 return (error);
1643 }
1644
1645 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1646 if (dolock)
1647 socket_unlock(so, 1);
1648 return (EPERM);
1649 }
1650
1651 /*
1652 * If protocol is connection-based, can only connect once.
1653 * Otherwise, if connected, try to disconnect first.
1654 * This allows user to disconnect by connecting to, e.g.,
1655 * a null address.
1656 */
1657 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1658 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1659 (error = sodisconnectlocked(so)))) {
1660 error = EISCONN;
1661 } else {
1662 /*
1663 * Run connect filter before calling protocol:
1664 * - non-blocking connect returns before completion;
1665 */
1666 error = sflt_connectout(so, nam);
1667 if (error != 0) {
1668 if (error == EJUSTRETURN)
1669 error = 0;
1670 } else {
1671 error = (*so->so_proto->pr_usrreqs->pru_connect)
1672 (so, nam, p);
1673 }
1674 }
1675 if (dolock)
1676 socket_unlock(so, 1);
1677 return (error);
1678 }
1679
1680 int
1681 soconnect(struct socket *so, struct sockaddr *nam)
1682 {
1683 return (soconnectlock(so, nam, 1));
1684 }
1685
1686 /*
1687 * Returns: 0 Success
1688 * <pru_connect2>:EINVAL[AF_UNIX]
1689 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1690 * <pru_connect2>:??? [other protocol families]
1691 *
1692 * Notes: <pru_connect2> is not supported by [TCP].
1693 */
1694 int
1695 soconnect2(struct socket *so1, struct socket *so2)
1696 {
1697 int error;
1698
1699 socket_lock(so1, 1);
1700 if (so2->so_proto->pr_lock)
1701 socket_lock(so2, 1);
1702
1703 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1704
1705 socket_unlock(so1, 1);
1706 if (so2->so_proto->pr_lock)
1707 socket_unlock(so2, 1);
1708 return (error);
1709 }
1710
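/*
 * Extended connect used by the connectx(2) path: runs the connect socket
 * filters and then hands src/dst (and any preconnect data in 'auio') to
 * the protocol's pru_connectx handler.  The caller must already hold the
 * socket lock.
 */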
1711 int
1712 soconnectxlocked(struct socket *so, struct sockaddr *src,
1713 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1714 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1715 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1716 {
1717 int error;
1718
1719 so_update_last_owner_locked(so, p);
1720 so_update_policy(so);
1721
1722 /*
1723 * If this is a listening socket or if this is a previously-accepted
1724 * socket that has been marked as inactive, reject the connect request.
1725 */
1726 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1727 error = EOPNOTSUPP;
1728 if (so->so_flags & SOF_DEFUNCT) {
1729 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1730 "(%d)\n", __func__, proc_pid(p),
1731 proc_best_name(p),
1732 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1733 SOCK_DOM(so), SOCK_TYPE(so), error);
1734 }
1735 return (error);
1736 }
1737
1738 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1739 return (EPERM);
1740
1741 /*
1742 * If protocol is connection-based, can only connect once
1743 * unless PR_MULTICONN is set. Otherwise, if connected,
1744 * try to disconnect first. This allows user to disconnect
1745 * by connecting to, e.g., a null address.
1746 */
1747 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1748 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1749 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1750 (error = sodisconnectlocked(so)) != 0)) {
1751 error = EISCONN;
1752 } else {
1753 /*
1754 * Run connect filter before calling protocol:
1755 * - non-blocking connect returns before completion;
1756 */
1757 error = sflt_connectout(so, dst);
1758 if (error != 0) {
1759 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1760 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1761 if (error == EJUSTRETURN)
1762 error = 0;
1763 } else {
1764 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1765 (so, src, dst, p, ifscope, aid, pcid,
1766 flags, arg, arglen, auio, bytes_written);
1767 }
1768 }
1769
1770 return (error);
1771 }
1772
1773 int
1774 sodisconnectlocked(struct socket *so)
1775 {
1776 int error;
1777
1778 if ((so->so_state & SS_ISCONNECTED) == 0) {
1779 error = ENOTCONN;
1780 goto bad;
1781 }
1782 if (so->so_state & SS_ISDISCONNECTING) {
1783 error = EALREADY;
1784 goto bad;
1785 }
1786
1787 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1788 if (error == 0)
1789 sflt_notify(so, sock_evt_disconnected, NULL);
1790
1791 bad:
1792 return (error);
1793 }
1794
1795 /* Locking version */
1796 int
1797 sodisconnect(struct socket *so)
1798 {
1799 int error;
1800
1801 socket_lock(so, 1);
1802 error = sodisconnectlocked(so);
1803 socket_unlock(so, 1);
1804 return (error);
1805 }
1806
1807 int
1808 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1809 {
1810 int error;
1811
1812 /*
1813 * Call the protocol disconnectx handler; let it handle all
1814 * matters related to the connection state of this session.
1815 */
1816 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1817 if (error == 0) {
1818 /*
1819 * The event applies only for the session, not for
1820 * the disconnection of individual subflows.
1821 */
1822 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1823 sflt_notify(so, sock_evt_disconnected, NULL);
1824 }
1825 return (error);
1826 }
1827
1828 int
1829 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1830 {
1831 int error;
1832
1833 socket_lock(so, 1);
1834 error = sodisconnectxlocked(so, aid, cid);
1835 socket_unlock(so, 1);
1836 return (error);
1837 }
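/*
 * Naming note: the "*locked" variants above expect the caller to already
 * hold the per-socket lock; sodisconnect() and sodisconnectx() are thin
 * wrappers that take the lock with socket_lock(so, 1), call the locked
 * variant, and release it with socket_unlock(so, 1).
 */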
1838
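/*
 * SBLOCKWAIT() maps the caller's MSG_DONTWAIT intent onto the sblock()
 * flags: non-blocking requests pass 0 so that a busy send/receive buffer
 * lock fails immediately (cf. the sblock:EWOULDBLOCK returns documented
 * below), while blocking requests pass SBL_WAIT and sleep for the lock.
 */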
1839 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1840
1841 /*
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
1844 *
1845 * Returns: 0 Success
1846 * EPIPE
1847 * sblock:EWOULDBLOCK
1848 * sblock:EINTR
1849 * sbwait:EBADF
1850 * sbwait:EINTR
1851 * [so_error]:???
1852 */
1853 int
1854 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1855 int32_t clen, int32_t atomic, int flags, int *sblocked,
1856 struct mbuf *control)
1857 {
1858 int error = 0;
1859 int32_t space;
1860 int assumelock = 0;
1861
1862 restart:
1863 if (*sblocked == 0) {
1864 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1865 so->so_send_filt_thread != 0 &&
1866 so->so_send_filt_thread == current_thread()) {
1867 /*
1868 * We're being called recursively from a filter;
1869 * allow this to continue. Radar 4150520.
1870 * Don't set sblocked because we don't want
1871 * to perform an unlock later.
1872 */
1873 assumelock = 1;
1874 } else {
1875 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1876 if (error) {
1877 if (so->so_flags & SOF_DEFUNCT)
1878 goto defunct;
1879 return (error);
1880 }
1881 *sblocked = 1;
1882 }
1883 }
1884
1885 /*
1886 * If a send attempt is made on a socket that has been marked
1887 * as inactive (disconnected), reject the request.
1888 */
1889 if (so->so_flags & SOF_DEFUNCT) {
1890 defunct:
1891 error = EPIPE;
1892 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1893 __func__, proc_selfpid(), proc_best_name(current_proc()),
1894 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1895 SOCK_DOM(so), SOCK_TYPE(so), error);
1896 return (error);
1897 }
1898
1899 if (so->so_state & SS_CANTSENDMORE) {
1900 #if CONTENT_FILTER
1901 /*
1902 * Can re-inject data of half-closed connections
1903 */
1904 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1905 so->so_snd.sb_cfil_thread == current_thread() &&
1906 cfil_sock_data_pending(&so->so_snd) != 0)
1907 CFIL_LOG(LOG_INFO,
1908 "so %llx ignore SS_CANTSENDMORE",
1909 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1910 else
1911 #endif /* CONTENT_FILTER */
1912 return (EPIPE);
1913 }
1914 if (so->so_error) {
1915 error = so->so_error;
1916 so->so_error = 0;
1917 return (error);
1918 }
1919
1920 if ((so->so_state & SS_ISCONNECTED) == 0) {
1921 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1922 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1923 (resid != 0 || clen == 0) &&
1924 !(so->so_flags1 & SOF1_PRECONNECT_DATA))
1925 return (ENOTCONN);
1926
1927 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1928 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1929 ENOTCONN : EDESTADDRREQ);
1930 }
1931 }
1932
1933 if (so->so_flags & SOF_ENABLE_MSGS)
1934 space = msgq_sbspace(so, control);
1935 else
1936 space = sbspace(&so->so_snd);
1937
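/* Allow out-of-band sends to overflow the buffer by a small fixed slop (1 KB). */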
1938 if (flags & MSG_OOB)
1939 space += 1024;
1940 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1941 clen > so->so_snd.sb_hiwat)
1942 return (EMSGSIZE);
1943
1944 if ((space < resid + clen &&
1945 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1946 space < clen)) ||
1947 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1948 /*
1949 * don't block the connectx call when there's more data
1950 * than can be copied.
1951 */
1952 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1953 if (space == 0) {
1954 return (EWOULDBLOCK);
1955 }
1956 if (space < (int32_t)so->so_snd.sb_lowat) {
1957 return (0);
1958 }
1959 }
1960 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1961 assumelock) {
1962 return (EWOULDBLOCK);
1963 }
1964 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1965 *sblocked = 0;
1966 error = sbwait(&so->so_snd);
1967 if (error) {
1968 if (so->so_flags & SOF_DEFUNCT)
1969 goto defunct;
1970 return (error);
1971 }
1972 goto restart;
1973 }
1974 return (0);
1975 }
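/*
 * Illustrative caller pattern (a minimal sketch, not part of the build):
 * sosend() below invokes sosendcheck() at the top of its send loop, and
 * sosend_list() calls it once before building packets, both with the
 * socket lock held.  "sblocked" records whether sosendcheck() acquired
 * SB_LOCK so the exit path knows which unlock to perform:
 *
 *	int sblocked = 0;
 *	socket_lock(so, 1);
 *	do {
 *		error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *		    &sblocked, control);
 *		if (error)
 *			break;
 *		... build the mbuf chain and call pru_send ...
 *	} while (resid);
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	(FALSE: also unlocks the socket)
 *	else
 *		socket_unlock(so, 1);
 */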
1976
1977 /*
1978 * Send on a socket.
1979 * If send must go all at once and message is larger than
1980 * send buffering, then hard error.
1981 * Lock against other senders.
1982 * If must go all at once and not enough room now, then
1983 * inform user that this would block and do nothing.
1984 * Otherwise, if nonblocking, send as much as possible.
1985 * The data to be sent is described by "uio" if nonzero,
1986 * otherwise by the mbuf chain "top" (which must be null
1987 * if uio is not). Data provided in mbuf chain must be small
1988 * enough to send all at once.
1989 *
1990 * Returns nonzero on error, timeout or signal; callers
1991 * must check for short counts if EINTR/ERESTART are returned.
1992 * Data and control buffers are freed on return.
1993 * Experiment:
1994 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1995 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1996 * point at the mbuf chain being constructed and go from there.
1997 *
1998 * Returns: 0 Success
1999 * EOPNOTSUPP
2000 * EINVAL
2001 * ENOBUFS
2002 * uiomove:EFAULT
2003 * sosendcheck:EPIPE
2004 * sosendcheck:EWOULDBLOCK
2005 * sosendcheck:EINTR
2006 * sosendcheck:EBADF
2007 * sosendcheck:EINTR
2008 * sosendcheck:??? [value from so_error]
2009 * <pru_send>:ECONNRESET[TCP]
2010 * <pru_send>:EINVAL[TCP]
2011 * <pru_send>:ENOBUFS[TCP]
2012 * <pru_send>:EADDRINUSE[TCP]
2013 * <pru_send>:EADDRNOTAVAIL[TCP]
2014 * <pru_send>:EAFNOSUPPORT[TCP]
2015 * <pru_send>:EACCES[TCP]
2016 * <pru_send>:EAGAIN[TCP]
2017 * <pru_send>:EPERM[TCP]
2018 * <pru_send>:EMSGSIZE[TCP]
2019 * <pru_send>:EHOSTUNREACH[TCP]
2020 * <pru_send>:ENETUNREACH[TCP]
2021 * <pru_send>:ENETDOWN[TCP]
2022 * <pru_send>:ENOMEM[TCP]
2023 * <pru_send>:ENOBUFS[TCP]
2024 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2025 * <pru_send>:EINVAL[AF_UNIX]
2026 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2027 * <pru_send>:EPIPE[AF_UNIX]
2028 * <pru_send>:ENOTCONN[AF_UNIX]
2029 * <pru_send>:EISCONN[AF_UNIX]
2030 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2031 * <sf_data_out>:??? [whatever a filter author chooses]
2032 *
2033 * Notes: Other <pru_send> returns depend on the protocol family; all
2034 * <sf_data_out> returns depend on what the filter author causes
2035 * their filter to return.
2036 */
2037 int
2038 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2039 struct mbuf *top, struct mbuf *control, int flags)
2040 {
2041 struct mbuf **mp;
2042 struct mbuf *m, *freelist = NULL;
2043 user_ssize_t space, len, resid, orig_resid;
2044 int clen = 0, error, dontroute, mlen, sendflags;
2045 int atomic = sosendallatonce(so) || top;
2046 int sblocked = 0;
2047 struct proc *p = current_proc();
2048 struct mbuf *control_copy = NULL;
2049 uint16_t headroom = 0;
2050 boolean_t en_tracing = FALSE;
2051
2052 if (uio != NULL)
2053 resid = uio_resid(uio);
2054 else
2055 resid = top->m_pkthdr.len;
2056
2057 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2058 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2059
2060 socket_lock(so, 1);
2061
2062 /*
2063 * Trace only if tracing is enabled, this is a network
2064 * (vs. unix) socket, and the route is non-loopback.
2065 */
2066 if (ENTR_SHOULDTRACE &&
2067 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2068 struct inpcb *inp = sotoinpcb(so);
2069 if (inp->inp_last_outifp != NULL &&
2070 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2071 en_tracing = TRUE;
2072 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2073 VM_KERNEL_ADDRPERM(so),
2074 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2075 (int64_t)resid);
2076 orig_resid = resid;
2077 }
2078 }
2079
2080 /*
2081 * Re-injection should not affect process accounting
2082 */
2083 if ((flags & MSG_SKIPCFIL) == 0) {
2084 so_update_last_owner_locked(so, p);
2085 so_update_policy(so);
2086
2087 #if NECP
2088 so_update_necp_policy(so, NULL, addr);
2089 #endif /* NECP */
2090 }
2091
2092 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2093 error = EOPNOTSUPP;
2094 goto out_locked;
2095 }
2096
2097 /*
2098 * In theory resid should be unsigned.
2099 * However, space must be signed, as it might be less than 0
2100 * if we over-committed, and we must use a signed comparison
2101 * of space and resid. On the other hand, a negative resid
2102 * causes us to loop sending 0-length segments to the protocol.
2103 *
2104 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2105 * But it will be used by sockets doing message delivery.
2106 *
2107 * Note: We limit resid to be a positive int value as we use
2108 * imin() to set bytes_to_copy -- radr://14558484
2109 */
2110 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2111 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2112 error = EINVAL;
2113 goto out_locked;
2114 }
2115
2116 dontroute = (flags & MSG_DONTROUTE) &&
2117 (so->so_options & SO_DONTROUTE) == 0 &&
2118 (so->so_proto->pr_flags & PR_ATOMIC);
2119 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2120
2121 if (control != NULL)
2122 clen = control->m_len;
2123
2124 if (soreserveheadroom != 0)
2125 headroom = so->so_pktheadroom;
2126
2127 do {
2128 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2129 &sblocked, control);
2130 if (error)
2131 goto out_locked;
2132
2133 mp = &top;
2134 if (so->so_flags & SOF_ENABLE_MSGS)
2135 space = msgq_sbspace(so, control);
2136 else
2137 space = sbspace(&so->so_snd) - clen;
2138 space += ((flags & MSG_OOB) ? 1024 : 0);
2139
2140 do {
2141 if (uio == NULL) {
2142 /*
2143 * Data is prepackaged in "top".
2144 */
2145 resid = 0;
2146 if (flags & MSG_EOR)
2147 top->m_flags |= M_EOR;
2148 } else {
2149 int chainlength;
2150 int bytes_to_copy;
2151 boolean_t jumbocl;
2152 boolean_t bigcl;
2153 int bytes_to_alloc;
2154
2155 bytes_to_copy = imin(resid, space);
2156
2157 bytes_to_alloc = bytes_to_copy;
2158 if (top == NULL)
2159 bytes_to_alloc += headroom;
2160
2161 if (sosendminchain > 0)
2162 chainlength = 0;
2163 else
2164 chainlength = sosendmaxchain;
2165
2166 /*
2167 * Use big 4 KB clusters when the outgoing interface
2168 * does not prefer 2 KB clusters
2169 */
2170 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2171 sosendbigcl_ignore_capab;
2172
2173 /*
2174 * Attempt to use larger than system page-size
2175 * clusters for large writes only if there is
2176 * a jumbo cluster pool and if the socket is
2177 * marked accordingly.
2178 */
2179 jumbocl = sosendjcl && njcl > 0 &&
2180 ((so->so_flags & SOF_MULTIPAGES) ||
2181 sosendjcl_ignore_capab) &&
2182 bigcl;
2183
2184 socket_unlock(so, 0);
2185
2186 do {
2187 int num_needed;
2188 int hdrs_needed = (top == NULL) ? 1 : 0;
2189
2190 /*
2191 * Try to maintain a local cache of mbuf
2192 * clusters needed to complete this
2193 * write.  The list is further limited to
2194 * the number that are currently needed
2195 * to fill the socket.  This mechanism
2196 * allows a large number of mbufs/
2197 * clusters to be grabbed under a single
2198 * mbuf lock... if we can't get any
2199 * clusters, then fall back to trying
2200 * for mbufs.  If we fail early (or
2201 * miscalculate the number needed), make
2202 * sure to release any clusters we
2203 * haven't yet consumed.
2204 */
2205 if (freelist == NULL &&
2206 bytes_to_alloc > MBIGCLBYTES &&
2207 jumbocl) {
2208 num_needed =
2209 bytes_to_alloc / M16KCLBYTES;
2210
2211 if ((bytes_to_alloc -
2212 (num_needed * M16KCLBYTES))
2213 >= MINCLSIZE)
2214 num_needed++;
2215
2216 freelist =
2217 m_getpackets_internal(
2218 (unsigned int *)&num_needed,
2219 hdrs_needed, M_WAIT, 0,
2220 M16KCLBYTES);
2221 /*
2222 * Fall back to 4K cluster size
2223 * if allocation failed
2224 */
2225 }
2226
2227 if (freelist == NULL &&
2228 bytes_to_alloc > MCLBYTES &&
2229 bigcl) {
2230 num_needed =
2231 bytes_to_alloc / MBIGCLBYTES;
2232
2233 if ((bytes_to_alloc -
2234 (num_needed * MBIGCLBYTES)) >=
2235 MINCLSIZE)
2236 num_needed++;
2237
2238 freelist =
2239 m_getpackets_internal(
2240 (unsigned int *)&num_needed,
2241 hdrs_needed, M_WAIT, 0,
2242 MBIGCLBYTES);
2243 /*
2244 * Fall back to cluster size
2245 * if allocation failed
2246 */
2247 }
2248
2249 /*
2250 * Allocate a cluster as we want to
2251 * avoid splitting the data into more
2252 * than one segment; using MINCLSIZE
2253 * would lead us to allocate two mbufs
2254 */
2255 if (soreserveheadroom != 0 &&
2256 freelist == NULL &&
2257 ((top == NULL &&
2258 bytes_to_alloc > _MHLEN) ||
2259 bytes_to_alloc > _MLEN)) {
2260 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2261 MCLBYTES;
2262 freelist =
2263 m_getpackets_internal(
2264 (unsigned int *)&num_needed,
2265 hdrs_needed, M_WAIT, 0,
2266 MCLBYTES);
2267 /*
2268 * Fall back to a single mbuf
2269 * if allocation failed
2270 */
2271 } else if (freelist == NULL &&
2272 bytes_to_alloc > MINCLSIZE) {
2273 num_needed =
2274 bytes_to_alloc / MCLBYTES;
2275
2276 if ((bytes_to_alloc -
2277 (num_needed * MCLBYTES)) >=
2278 MINCLSIZE)
2279 num_needed++;
2280
2281 freelist =
2282 m_getpackets_internal(
2283 (unsigned int *)&num_needed,
2284 hdrs_needed, M_WAIT, 0,
2285 MCLBYTES);
2286 /*
2287 * Fall back to a single mbuf
2288 * if allocation failed
2289 */
2290 }
2291 /*
2292 * For datagram protocols, leave
2293 * headroom for protocol headers
2294 * in the first cluster of the chain
2295 */
2296 if (freelist != NULL && atomic &&
2297 top == NULL && headroom > 0) {
2298 freelist->m_data += headroom;
2299 }
2300
2301 /*
2302 * Fall back to regular mbufs without
2303 * reserving the socket headroom
2304 */
2305 if (freelist == NULL) {
2306 if (top == NULL)
2307 MGETHDR(freelist,
2308 M_WAIT, MT_DATA);
2309 else
2310 MGET(freelist,
2311 M_WAIT, MT_DATA);
2312
2313 if (freelist == NULL) {
2314 error = ENOBUFS;
2315 socket_lock(so, 0);
2316 goto out_locked;
2317 }
2318 /*
2319 * For datagram protocols,
2320 * leave room for protocol
2321 * headers in first mbuf.
2322 */
2323 if (atomic && top == NULL &&
2324 bytes_to_copy < MHLEN) {
2325 MH_ALIGN(freelist,
2326 bytes_to_copy);
2327 }
2328 }
2329 m = freelist;
2330 freelist = m->m_next;
2331 m->m_next = NULL;
2332
2333 if ((m->m_flags & M_EXT))
2334 mlen = m->m_ext.ext_size -
2335 m_leadingspace(m);
2336 else if ((m->m_flags & M_PKTHDR))
2337 mlen =
2338 MHLEN - m_leadingspace(m);
2339 else
2340 mlen = MLEN - m_leadingspace(m);
2341 len = imin(mlen, bytes_to_copy);
2342
2343 chainlength += len;
2344
2345 space -= len;
2346
2347 error = uiomove(mtod(m, caddr_t),
2348 len, uio);
2349
2350 resid = uio_resid(uio);
2351
2352 m->m_len = len;
2353 *mp = m;
2354 top->m_pkthdr.len += len;
2355 if (error)
2356 break;
2357 mp = &m->m_next;
2358 if (resid <= 0) {
2359 if (flags & MSG_EOR)
2360 top->m_flags |= M_EOR;
2361 break;
2362 }
2363 bytes_to_copy = min(resid, space);
2364
2365 } while (space > 0 &&
2366 (chainlength < sosendmaxchain || atomic ||
2367 resid < MINCLSIZE));
2368
2369 socket_lock(so, 0);
2370
2371 if (error)
2372 goto out_locked;
2373 }
2374
2375 if (flags & (MSG_HOLD|MSG_SEND)) {
2376 /* Enqueue for later, go away if HOLD */
2377 struct mbuf *mb1;
2378 if (so->so_temp && (flags & MSG_FLUSH)) {
2379 m_freem(so->so_temp);
2380 so->so_temp = NULL;
2381 }
2382 if (so->so_temp)
2383 so->so_tail->m_next = top;
2384 else
2385 so->so_temp = top;
2386 mb1 = top;
2387 while (mb1->m_next)
2388 mb1 = mb1->m_next;
2389 so->so_tail = mb1;
2390 if (flags & MSG_HOLD) {
2391 top = NULL;
2392 goto out_locked;
2393 }
2394 top = so->so_temp;
2395 }
2396 if (dontroute)
2397 so->so_options |= SO_DONTROUTE;
2398
2399 /*
2400 * Compute flags here, for pru_send and NKEs
2401 *
2402 * If the user set MSG_EOF, the protocol
2403 * understands this flag, and there is nothing left to
2404 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2405 */
2406 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2407 ((flags & MSG_EOF) &&
2408 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2409 (resid <= 0)) ? PRUS_EOF :
2410 /* If there is more to send, set PRUS_MORETOCOME */
2411 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2412
2413 if ((flags & MSG_SKIPCFIL) == 0) {
2414 /*
2415 * Socket filter processing
2416 */
2417 error = sflt_data_out(so, addr, &top,
2418 &control, (sendflags & MSG_OOB) ?
2419 sock_data_filt_flag_oob : 0);
2420 if (error) {
2421 if (error == EJUSTRETURN) {
2422 error = 0;
2423 clen = 0;
2424 control = NULL;
2425 top = NULL;
2426 }
2427 goto out_locked;
2428 }
2429 #if CONTENT_FILTER
2430 /*
2431 * Content filter processing
2432 */
2433 error = cfil_sock_data_out(so, addr, top,
2434 control, (sendflags & MSG_OOB) ?
2435 sock_data_filt_flag_oob : 0);
2436 if (error) {
2437 if (error == EJUSTRETURN) {
2438 error = 0;
2439 clen = 0;
2440 control = NULL;
2441 top = NULL;
2442 }
2443 goto out_locked;
2444 }
2445 #endif /* CONTENT_FILTER */
2446 }
2447 if (so->so_flags & SOF_ENABLE_MSGS) {
2448 /*
2449 * Make a copy of control mbuf,
2450 * so that msg priority can be
2451 * passed to subsequent mbufs.
2452 */
2453 control_copy = m_dup(control, M_NOWAIT);
2454 }
2455 error = (*so->so_proto->pr_usrreqs->pru_send)
2456 (so, sendflags, top, addr, control, p);
2457
2458 if (flags & MSG_SEND)
2459 so->so_temp = NULL;
2460
2461 if (dontroute)
2462 so->so_options &= ~SO_DONTROUTE;
2463
2464 clen = 0;
2465 control = control_copy;
2466 control_copy = NULL;
2467 top = NULL;
2468 mp = &top;
2469 if (error)
2470 goto out_locked;
2471 } while (resid && space > 0);
2472 } while (resid);
2473
2474 out_locked:
2475 if (sblocked)
2476 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2477 else
2478 socket_unlock(so, 1);
2479 if (top != NULL)
2480 m_freem(top);
2481 if (control != NULL)
2482 m_freem(control);
2483 if (freelist != NULL)
2484 m_freem_list(freelist);
2485 if (control_copy != NULL)
2486 m_freem(control_copy);
2487
2488 soclearfastopen(so);
2489
2490 if (en_tracing) {
2491 /* resid passed here is the bytes left in uio */
2492 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2493 VM_KERNEL_ADDRPERM(so),
2494 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2495 (int64_t)(orig_resid - resid));
2496 }
2497 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2498 so->so_snd.sb_cc, space, error);
2499
2500 return (error);
2501 }
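/*
 * Example (an illustrative sketch, not part of the original source): a
 * kernel caller that already owns a fully-built packet chain "m" (with a
 * valid pkthdr) and has no destination address or control data can hand
 * it off with:
 *
 *	error = sosend(so, NULL, NULL, m, NULL, 0);
 *
 * With uio == NULL the chain becomes "top" above and is consumed by
 * sosend() whether or not an error is returned, per the "Data and control
 * buffers are freed on return" rule in the block comment preceding the
 * function.
 */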
2502
2503 /*
2504 * Supported only for connected sockets (no address) without ancillary
2505 * data (control mbuf), and only for atomic protocols
2506 */
2507 int
2508 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2509 {
2510 struct mbuf *m, *freelist = NULL;
2511 user_ssize_t len, resid;
2512 int error, dontroute, mlen;
2513 int atomic = sosendallatonce(so);
2514 int sblocked = 0;
2515 struct proc *p = current_proc();
2516 u_int uiofirst = 0;
2517 u_int uiolast = 0;
2518 struct mbuf *top = NULL;
2519 uint16_t headroom = 0;
2520 boolean_t bigcl;
2521
2522 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2523 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2524
2525 if (so->so_type != SOCK_DGRAM) {
2526 error = EINVAL;
2527 goto out;
2528 }
2529 if (atomic == 0) {
2530 error = EINVAL;
2531 goto out;
2532 }
2533 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2534 error = EPROTONOSUPPORT;
2535 goto out;
2536 }
2537 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2538 error = EINVAL;
2539 goto out;
2540 }
2541 resid = uio_array_resid(uioarray, uiocnt);
2542
2543 /*
2544 * In theory resid should be unsigned.
2545 * However, space must be signed, as it might be less than 0
2546 * if we over-committed, and we must use a signed comparison
2547 * of space and resid. On the other hand, a negative resid
2548 * causes us to loop sending 0-length segments to the protocol.
2549 *
2550 * Note: We limit resid to be a positive int value as we use
2551 * imin() to set bytes_to_copy -- radr://14558484
2552 */
2553 if (resid < 0 || resid > INT_MAX) {
2554 error = EINVAL;
2555 goto out;
2556 }
2557
2558 socket_lock(so, 1);
2559 so_update_last_owner_locked(so, p);
2560 so_update_policy(so);
2561
2562 #if NECP
2563 so_update_necp_policy(so, NULL, NULL);
2564 #endif /* NECP */
2565
2566 dontroute = (flags & MSG_DONTROUTE) &&
2567 (so->so_options & SO_DONTROUTE) == 0 &&
2568 (so->so_proto->pr_flags & PR_ATOMIC);
2569 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2570
2571 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2572 &sblocked, NULL);
2573 if (error)
2574 goto release;
2575
2576 /*
2577 * Use big 4 KB clusters when the outgoing interface does not prefer
2578 * 2 KB clusters
2579 */
2580 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2581
2582 if (soreserveheadroom != 0)
2583 headroom = so->so_pktheadroom;
2584
2585 do {
2586 int i;
2587 int num_needed = 0;
2588 int chainlength;
2589 size_t maxpktlen = 0;
2590 int bytes_to_alloc;
2591
2592 if (sosendminchain > 0)
2593 chainlength = 0;
2594 else
2595 chainlength = sosendmaxchain;
2596
2597 socket_unlock(so, 0);
2598
2599 /*
2600 * Find a set of uios that fits in a reasonable number
2601 * of mbuf packets
2602 */
2603 for (i = uiofirst; i < uiocnt; i++) {
2604 struct uio *auio = uioarray[i];
2605
2606 len = uio_resid(auio);
2607
2608 /* Do nothing for empty messages */
2609 if (len == 0)
2610 continue;
2611
2612 num_needed += 1;
2613 uiolast += 1;
2614
2615 if (len > maxpktlen)
2616 maxpktlen = len;
2617
2618 chainlength += len;
2619 if (chainlength > sosendmaxchain)
2620 break;
2621 }
2622 /*
2623 * Nothing left to send
2624 */
2625 if (num_needed == 0) {
2626 socket_lock(so, 0);
2627 break;
2628 }
2629 /*
2630 * Allocate a buffer large enough to include headroom space for
2631 * the network and link headers
2632 *
2633 */
2634 bytes_to_alloc = maxpktlen + headroom;
2635
2636 /*
2637 * Allocate a single contiguous buffer of the smallest available
2638 * size when possible
2639 */
2640 if (bytes_to_alloc > MCLBYTES &&
2641 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2642 freelist = m_getpackets_internal(
2643 (unsigned int *)&num_needed,
2644 num_needed, M_WAIT, 1,
2645 MBIGCLBYTES);
2646 } else if (bytes_to_alloc > _MHLEN &&
2647 bytes_to_alloc <= MCLBYTES) {
2648 freelist = m_getpackets_internal(
2649 (unsigned int *)&num_needed,
2650 num_needed, M_WAIT, 1,
2651 MCLBYTES);
2652 } else {
2653 freelist = m_allocpacket_internal(
2654 (unsigned int *)&num_needed,
2655 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2656 }
2657
2658 if (freelist == NULL) {
2659 socket_lock(so, 0);
2660 error = ENOMEM;
2661 goto release;
2662 }
2663 /*
2664 * Copy each uio of the set into its own mbuf packet
2665 */
2666 for (i = uiofirst, m = freelist;
2667 i < uiolast && m != NULL;
2668 i++) {
2669 int bytes_to_copy;
2670 struct mbuf *n;
2671 struct uio *auio = uioarray[i];
2672
2673 bytes_to_copy = uio_resid(auio);
2674
2675 /* Do nothing for empty messages */
2676 if (bytes_to_copy == 0)
2677 continue;
2678 /*
2679 * Leave headroom for protocol headers
2680 * in the first mbuf of the chain
2681 */
2682 m->m_data += headroom;
2683
2684 for (n = m; n != NULL; n = n->m_next) {
2685 if ((m->m_flags & M_EXT))
2686 mlen = m->m_ext.ext_size -
2687 m_leadingspace(m);
2688 else if ((m->m_flags & M_PKTHDR))
2689 mlen =
2690 MHLEN - m_leadingspace(m);
2691 else
2692 mlen = MLEN - m_leadingspace(m);
2693 len = imin(mlen, bytes_to_copy);
2694
2695 /*
2696 * Note: uiomove() decrements the iovec
2697 * length
2698 */
2699 error = uiomove(mtod(n, caddr_t),
2700 len, auio);
2701 if (error != 0)
2702 break;
2703 n->m_len = len;
2704 m->m_pkthdr.len += len;
2705
2706 VERIFY(m->m_pkthdr.len <= maxpktlen);
2707
2708 bytes_to_copy -= len;
2709 resid -= len;
2710 }
2711 if (m->m_pkthdr.len == 0) {
2712 printf(
2713 "%s:%d so %llx pkt %llx type %u len null\n",
2714 __func__, __LINE__,
2715 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2716 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2717 m->m_type);
2718 }
2719 if (error != 0)
2720 break;
2721 m = m->m_nextpkt;
2722 }
2723
2724 socket_lock(so, 0);
2725
2726 if (error)
2727 goto release;
2728 top = freelist;
2729 freelist = NULL;
2730
2731 if (dontroute)
2732 so->so_options |= SO_DONTROUTE;
2733
2734 if ((flags & MSG_SKIPCFIL) == 0) {
2735 struct mbuf **prevnextp = NULL;
2736
2737 for (i = uiofirst, m = top;
2738 i < uiolast && m != NULL;
2739 i++) {
2740 struct mbuf *nextpkt = m->m_nextpkt;
2741
2742 /*
2743 * Socket filter processing
2744 */
2745 error = sflt_data_out(so, NULL, &m,
2746 NULL, 0);
2747 if (error != 0 && error != EJUSTRETURN)
2748 goto release;
2749
2750 #if CONTENT_FILTER
2751 if (error == 0) {
2752 /*
2753 * Content filter processing
2754 */
2755 error = cfil_sock_data_out(so, NULL, m,
2756 NULL, 0);
2757 if (error != 0 && error != EJUSTRETURN)
2758 goto release;
2759 }
2760 #endif /* CONTENT_FILTER */
2761 /*
2762 * Remove packet from the list when
2763 * swallowed by a filter
2764 */
2765 if (error == EJUSTRETURN) {
2766 error = 0;
2767 if (prevnextp != NULL)
2768 *prevnextp = nextpkt;
2769 else
2770 top = nextpkt;
2771 }
2772
2773 m = nextpkt;
2774 if (m != NULL)
2775 prevnextp = &m->m_nextpkt;
2776 }
2777 }
2778 if (top != NULL)
2779 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2780 (so, 0, top, NULL, NULL, p);
2781
2782 if (dontroute)
2783 so->so_options &= ~SO_DONTROUTE;
2784
2785 top = NULL;
2786 uiofirst = uiolast;
2787 } while (resid > 0 && error == 0);
2788 release:
2789 if (sblocked)
2790 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2791 else
2792 socket_unlock(so, 1);
2793 out:
2794 if (top != NULL)
2795 m_freem(top);
2796 if (freelist != NULL)
2797 m_freem_list(freelist);
2798
2799 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2800 so->so_snd.sb_cc, 0, error);
2801
2802 return (error);
2803 }
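/*
 * Note: sosend_list() is deliberately narrow: SOCK_DGRAM only, atomic
 * protocols only, a pru_send_list handler must exist, and only
 * MSG_DONTWAIT/MSG_NBIO are accepted in "flags".  A batching syscall such
 * as sendmsg_x() (assumed caller, not shown in this excerpt) must
 * therefore restrict its flags to those two bits before calling in.
 */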
2804
2805 /*
2806 * May return ERESTART when packet is dropped by MAC policy check
2807 */
2808 static int
2809 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2810 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2811 {
2812 int error = 0;
2813 struct mbuf *m = *mp;
2814 struct mbuf *nextrecord = *nextrecordp;
2815
2816 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2817 #if CONFIG_MACF_SOCKET_SUBSET
2818 /*
2819 * Call the MAC framework for policy checking if we're in
2820 * the user process context and the socket isn't connected.
2821 */
2822 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2823 struct mbuf *m0 = m;
2824 /*
2825 * Dequeue this record (temporarily) from the receive
2826 * list since we're about to drop the socket's lock
2827 * where a new record may arrive and be appended to
2828 * the list. Upon MAC policy failure, the record
2829 * will be freed. Otherwise, we'll add it back to
2830 * the head of the list. We cannot rely on SB_LOCK
2831 * because append operation uses the socket's lock.
2832 */
2833 do {
2834 m->m_nextpkt = NULL;
2835 sbfree(&so->so_rcv, m);
2836 m = m->m_next;
2837 } while (m != NULL);
2838 m = m0;
2839 so->so_rcv.sb_mb = nextrecord;
2840 SB_EMPTY_FIXUP(&so->so_rcv);
2841 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2842 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2843 socket_unlock(so, 0);
2844
2845 if (mac_socket_check_received(proc_ucred(p), so,
2846 mtod(m, struct sockaddr *)) != 0) {
2847 /*
2848 * MAC policy failure; free this record and
2849 * process the next record (or block until
2850 * one is available). We have adjusted sb_cc
2851 * and sb_mbcnt above so there is no need to
2852 * call sbfree() again.
2853 */
2854 m_freem(m);
2855 /*
2856 * Clear SB_LOCK but don't unlock the socket.
2857 * Process the next record or wait for one.
2858 */
2859 socket_lock(so, 0);
2860 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2861 error = ERESTART;
2862 goto done;
2863 }
2864 socket_lock(so, 0);
2865 /*
2866 * If the socket has been defunct'd, drop it.
2867 */
2868 if (so->so_flags & SOF_DEFUNCT) {
2869 m_freem(m);
2870 error = ENOTCONN;
2871 goto done;
2872 }
2873 /*
2874 * Re-adjust the socket receive list and re-enqueue
2875 * the record in front of any packets which may have
2876 * been appended while we dropped the lock.
2877 */
2878 for (m = m0; m->m_next != NULL; m = m->m_next)
2879 sballoc(&so->so_rcv, m);
2880 sballoc(&so->so_rcv, m);
2881 if (so->so_rcv.sb_mb == NULL) {
2882 so->so_rcv.sb_lastrecord = m0;
2883 so->so_rcv.sb_mbtail = m;
2884 }
2885 m = m0;
2886 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2887 so->so_rcv.sb_mb = m;
2888 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2889 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2890 }
2891 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2892 if (psa != NULL) {
2893 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2894 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2895 error = EWOULDBLOCK;
2896 goto done;
2897 }
2898 }
2899 if (flags & MSG_PEEK) {
2900 m = m->m_next;
2901 } else {
2902 sbfree(&so->so_rcv, m);
2903 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2904 panic("%s: about to create invalid socketbuf",
2905 __func__);
2906 /* NOTREACHED */
2907 }
2908 MFREE(m, so->so_rcv.sb_mb);
2909 m = so->so_rcv.sb_mb;
2910 if (m != NULL) {
2911 m->m_nextpkt = nextrecord;
2912 } else {
2913 so->so_rcv.sb_mb = nextrecord;
2914 SB_EMPTY_FIXUP(&so->so_rcv);
2915 }
2916 }
2917 done:
2918 *mp = m;
2919 *nextrecordp = nextrecord;
2920
2921 return (error);
2922 }
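/*
 * Note: ERESTART is used here as an internal "record dropped by MAC
 * policy, try the next one" signal; soreceive() below turns it into a
 * jump back to its restart label instead of returning it to the caller.
 */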
2923
2924 /*
2925 * Process one or more MT_CONTROL mbufs present before any data mbufs
2926 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2927 * just copy the data; if !MSG_PEEK, we call into the protocol to
2928 * perform externalization.
2929 */
2930 static int
2931 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2932 struct mbuf **mp, struct mbuf **nextrecordp)
2933 {
2934 int error = 0;
2935 struct mbuf *cm = NULL, *cmn;
2936 struct mbuf **cme = &cm;
2937 struct sockbuf *sb_rcv = &so->so_rcv;
2938 struct mbuf **msgpcm = NULL;
2939 struct mbuf *m = *mp;
2940 struct mbuf *nextrecord = *nextrecordp;
2941 struct protosw *pr = so->so_proto;
2942
2943 /*
2944 * Externalizing the control messages would require us to
2945 * drop the socket's lock below. Once we re-acquire the
2946 * lock, the mbuf chain might change. In order to preserve
2947 * consistency, we unlink all control messages from the
2948 * first mbuf chain in one shot and link them separately
2949 * onto a different chain.
2950 */
2951 do {
2952 if (flags & MSG_PEEK) {
2953 if (controlp != NULL) {
2954 if (*controlp == NULL) {
2955 msgpcm = controlp;
2956 }
2957 *controlp = m_copy(m, 0, m->m_len);
2958
2959 /*
2960 * If we failed to allocate an mbuf,
2961 * release any previously allocated
2962 * mbufs for control data. Return
2963 * an error. Keep the mbufs in the
2964 * socket as this is using
2965 * MSG_PEEK flag.
2966 */
2967 if (*controlp == NULL) {
2968 m_freem(*msgpcm);
2969 error = ENOBUFS;
2970 goto done;
2971 }
2972 controlp = &(*controlp)->m_next;
2973 }
2974 m = m->m_next;
2975 } else {
2976 m->m_nextpkt = NULL;
2977 sbfree(sb_rcv, m);
2978 sb_rcv->sb_mb = m->m_next;
2979 m->m_next = NULL;
2980 *cme = m;
2981 cme = &(*cme)->m_next;
2982 m = sb_rcv->sb_mb;
2983 }
2984 } while (m != NULL && m->m_type == MT_CONTROL);
2985
2986 if (!(flags & MSG_PEEK)) {
2987 if (sb_rcv->sb_mb != NULL) {
2988 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2989 } else {
2990 sb_rcv->sb_mb = nextrecord;
2991 SB_EMPTY_FIXUP(sb_rcv);
2992 }
2993 if (nextrecord == NULL)
2994 sb_rcv->sb_lastrecord = m;
2995 }
2996
2997 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2998 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2999
3000 while (cm != NULL) {
3001 int cmsg_type;
3002
3003 cmn = cm->m_next;
3004 cm->m_next = NULL;
3005 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3006
3007 /*
3008 * Call the protocol to externalize SCM_RIGHTS message
3009 * and return the modified message to the caller upon
3010 * success. Otherwise, all other control messages are
3011 * returned unmodified to the caller. Note that we
3012 * only get into this loop if MSG_PEEK is not set.
3013 */
3014 if (pr->pr_domain->dom_externalize != NULL &&
3015 cmsg_type == SCM_RIGHTS) {
3016 /*
3017 * Release socket lock: see 3903171. This
3018 * would also allow more records to be appended
3019 * to the socket buffer. We still have SB_LOCK
3020 * set on it, so we can be sure that the head
3021 * of the mbuf chain won't change.
3022 */
3023 socket_unlock(so, 0);
3024 error = (*pr->pr_domain->dom_externalize)(cm);
3025 socket_lock(so, 0);
3026 } else {
3027 error = 0;
3028 }
3029
3030 if (controlp != NULL && error == 0) {
3031 *controlp = cm;
3032 controlp = &(*controlp)->m_next;
3033 } else {
3034 (void) m_free(cm);
3035 }
3036 cm = cmn;
3037 }
3038 /*
3039 * Update the value of nextrecord in case we received new
3040 * records when the socket was unlocked above for
3041 * externalizing SCM_RIGHTS.
3042 */
3043 if (m != NULL)
3044 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3045 else
3046 nextrecord = sb_rcv->sb_mb;
3047
3048 done:
3049 *mp = m;
3050 *nextrecordp = nextrecord;
3051
3052 return (error);
3053 }
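/*
 * soreceive() calls this helper after any leading MT_SONAME mbuf has been
 * handled, i.e. when the head of the current record is of type MT_CONTROL
 * (see the m->m_type == MT_CONTROL check further below).
 */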
3054
3055 /*
3056 * Implement receive operations on a socket.
3057 * We depend on the way that records are added to the sockbuf
3058 * by sbappend*. In particular, each record (mbufs linked through m_next)
3059 * must begin with an address if the protocol so specifies,
3060 * followed by an optional mbuf or mbufs containing ancillary data,
3061 * and then zero or more mbufs of data.
3062 * In order to avoid blocking network interrupts for the entire time here,
3063 * we splx() while doing the actual copy to user space.
3064 * Although the sockbuf is locked, new data may still be appended,
3065 * and thus we must maintain consistency of the sockbuf during that time.
3066 *
3067 * The caller may receive the data as a single mbuf chain by supplying
3068 * an mbuf **mp0 for use in returning the chain. The uio is then used
3069 * only for the count in uio_resid.
3070 *
3071 * Returns: 0 Success
3072 * ENOBUFS
3073 * ENOTCONN
3074 * EWOULDBLOCK
3075 * uiomove:EFAULT
3076 * sblock:EWOULDBLOCK
3077 * sblock:EINTR
3078 * sbwait:EBADF
3079 * sbwait:EINTR
3080 * sodelayed_copy:EFAULT
3081 * <pru_rcvoob>:EINVAL[TCP]
3082 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3083 * <pru_rcvoob>:???
3084 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3085 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3086 * <pr_domain->dom_externalize>:???
3087 *
3088 * Notes: Additional return values from calls through <pru_rcvoob> and
3089 * <pr_domain->dom_externalize> depend on protocols other than
3090 * TCP or AF_UNIX, which are documented above.
3091 */
3092 int
3093 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3094 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3095 {
3096 struct mbuf *m, **mp, *ml = NULL;
3097 struct mbuf *nextrecord, *free_list;
3098 int flags, error, offset;
3099 user_ssize_t len;
3100 struct protosw *pr = so->so_proto;
3101 int moff, type = 0;
3102 user_ssize_t orig_resid = uio_resid(uio);
3103 user_ssize_t delayed_copy_len;
3104 int can_delay;
3105 int need_event;
3106 struct proc *p = current_proc();
3107 boolean_t en_tracing = FALSE;
3108
3109 /*
3110 * Sanity check on the length passed by caller as we are making 'int'
3111 * comparisons
3112 */
3113 if (orig_resid < 0 || orig_resid > INT_MAX)
3114 return (EINVAL);
3115
3116 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3117 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3118 so->so_rcv.sb_hiwat);
3119
3120 socket_lock(so, 1);
3121 so_update_last_owner_locked(so, p);
3122 so_update_policy(so);
3123
3124 #ifdef MORE_LOCKING_DEBUG
3125 if (so->so_usecount == 1) {
3126 panic("%s: so=%x no other reference on socket\n", __func__, so);
3127 /* NOTREACHED */
3128 }
3129 #endif
3130 mp = mp0;
3131 if (psa != NULL)
3132 *psa = NULL;
3133 if (controlp != NULL)
3134 *controlp = NULL;
3135 if (flagsp != NULL)
3136 flags = *flagsp &~ MSG_EOR;
3137 else
3138 flags = 0;
3139
3140 /*
3141 * If a recv attempt is made on a previously-accepted socket
3142 * that has been marked as inactive (disconnected), reject
3143 * the request.
3144 */
3145 if (so->so_flags & SOF_DEFUNCT) {
3146 struct sockbuf *sb = &so->so_rcv;
3147
3148 error = ENOTCONN;
3149 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3150 __func__, proc_pid(p), proc_best_name(p),
3151 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3152 SOCK_DOM(so), SOCK_TYPE(so), error);
3153 /*
3154 * This socket should have been disconnected and flushed
3155 * prior to being returned from sodefunct(); there should
3156 * be no data on its receive list, so panic otherwise.
3157 */
3158 if (so->so_state & SS_DEFUNCT)
3159 sb_empty_assert(sb, __func__);
3160 socket_unlock(so, 1);
3161 return (error);
3162 }
3163
3164 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3165 pr->pr_usrreqs->pru_preconnect) {
3166 /*
3167 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3168 * call write() right after this. *If* the app calls a read
3169 * we do not want to block this read indefinitely. Thus,
3170 * we trigger a connect so that the session gets initiated.
3171 */
3172 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3173
3174 if (error) {
3175 socket_unlock(so, 1);
3176 return (error);
3177 }
3178 }
3179
3180 if (ENTR_SHOULDTRACE &&
3181 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3182 /*
3183 * enable energy tracing for inet sockets that go over
3184 * non-loopback interfaces only.
3185 */
3186 struct inpcb *inp = sotoinpcb(so);
3187 if (inp->inp_last_outifp != NULL &&
3188 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3189 en_tracing = TRUE;
3190 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3191 VM_KERNEL_ADDRPERM(so),
3192 ((so->so_state & SS_NBIO) ?
3193 kEnTrFlagNonBlocking : 0),
3194 (int64_t)orig_resid);
3195 }
3196 }
3197
3198 /*
3199 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3200 * regardless of the flags argument. Here is the case where
3201 * out-of-band data is not inline.
3202 */
3203 if ((flags & MSG_OOB) ||
3204 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3205 (so->so_options & SO_OOBINLINE) == 0 &&
3206 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3207 m = m_get(M_WAIT, MT_DATA);
3208 if (m == NULL) {
3209 socket_unlock(so, 1);
3210 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3211 ENOBUFS, 0, 0, 0, 0);
3212 return (ENOBUFS);
3213 }
3214 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3215 if (error)
3216 goto bad;
3217 socket_unlock(so, 0);
3218 do {
3219 error = uiomove(mtod(m, caddr_t),
3220 imin(uio_resid(uio), m->m_len), uio);
3221 m = m_free(m);
3222 } while (uio_resid(uio) && error == 0 && m != NULL);
3223 socket_lock(so, 0);
3224 bad:
3225 if (m != NULL)
3226 m_freem(m);
3227
3228 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3229 if (error == EWOULDBLOCK || error == EINVAL) {
3230 /*
3231 * Let's try to get normal data:
3232 * EWOULDBLOCK: out-of-band data not
3233 * received yet. EINVAL: out-of-band data
3234 * already read.
3235 */
3236 error = 0;
3237 goto nooob;
3238 } else if (error == 0 && flagsp != NULL) {
3239 *flagsp |= MSG_OOB;
3240 }
3241 }
3242 socket_unlock(so, 1);
3243 if (en_tracing) {
3244 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3245 VM_KERNEL_ADDRPERM(so), 0,
3246 (int64_t)(orig_resid - uio_resid(uio)));
3247 }
3248 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3249 0, 0, 0, 0);
3250
3251 return (error);
3252 }
3253 nooob:
3254 if (mp != NULL)
3255 *mp = NULL;
3256
3257 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3258 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3259 }
3260
3261 free_list = NULL;
3262 delayed_copy_len = 0;
3263 restart:
3264 #ifdef MORE_LOCKING_DEBUG
3265 if (so->so_usecount <= 1)
3266 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3267 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3268 #endif
3269 /*
3270 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3271 * and if so just return to the caller. This could happen when
3272 * soreceive() is called by a socket upcall function during the
3273 * time the socket is freed. The socket buffer would have been
3274 * locked across the upcall, therefore we cannot put this thread
3275 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3276 * we may livelock), because the lock on the socket buffer will
3277 * only be released when the upcall routine returns to its caller.
3278 * Because the socket has been officially closed, there can be
3279 * no further read on it.
3280 *
3281 * A multipath subflow socket would have its SS_NOFDREF set by
3282 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3283 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3284 */
3285 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3286 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3287 socket_unlock(so, 1);
3288 return (0);
3289 }
3290
3291 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3292 if (error) {
3293 socket_unlock(so, 1);
3294 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3295 0, 0, 0, 0);
3296 if (en_tracing) {
3297 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3298 VM_KERNEL_ADDRPERM(so), 0,
3299 (int64_t)(orig_resid - uio_resid(uio)));
3300 }
3301 return (error);
3302 }
3303
3304 m = so->so_rcv.sb_mb;
3305 /*
3306 * If we have less data than requested, block awaiting more
3307 * (subject to any timeout) if:
3308 * 1. the current count is less than the low water mark, or
3309 * 2. MSG_WAITALL is set, and it is possible to do the entire
3310 * receive operation at once if we block (resid <= hiwat), and
3311 * 3. MSG_DONTWAIT is not set.
3312 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3313 * we have to do the receive in sections, and thus risk returning
3314 * a short count if a timeout or signal occurs after we start.
3315 */
3316 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3317 so->so_rcv.sb_cc < uio_resid(uio)) &&
3318 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3319 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3320 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3321 /*
3322 * Panic if we notice inconsistencies in the socket's
3323 * receive list; both sb_mb and sb_cc should correctly
3324 * reflect the contents of the list, otherwise we may
3325 * end up with false positives during select() or poll()
3326 * which could put the application in a bad state.
3327 */
3328 SB_MB_CHECK(&so->so_rcv);
3329
3330 if (so->so_error) {
3331 if (m != NULL)
3332 goto dontblock;
3333 error = so->so_error;
3334 if ((flags & MSG_PEEK) == 0)
3335 so->so_error = 0;
3336 goto release;
3337 }
3338 if (so->so_state & SS_CANTRCVMORE) {
3339 #if CONTENT_FILTER
3340 /*
3341 * Deal with half-closed connections
3342 */
3343 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3344 cfil_sock_data_pending(&so->so_rcv) != 0)
3345 CFIL_LOG(LOG_INFO,
3346 "so %llx ignore SS_CANTRCVMORE",
3347 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3348 else
3349 #endif /* CONTENT_FILTER */
3350 if (m != NULL)
3351 goto dontblock;
3352 else
3353 goto release;
3354 }
3355 for (; m != NULL; m = m->m_next)
3356 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3357 m = so->so_rcv.sb_mb;
3358 goto dontblock;
3359 }
3360 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3361 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3362 error = ENOTCONN;
3363 goto release;
3364 }
3365 if (uio_resid(uio) == 0)
3366 goto release;
3367
3368 if ((so->so_state & SS_NBIO) ||
3369 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3370 error = EWOULDBLOCK;
3371 goto release;
3372 }
3373 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3374 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3375 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3376 #if EVEN_MORE_LOCKING_DEBUG
3377 if (socket_debug)
3378 printf("Waiting for socket data\n");
3379 #endif
3380
3381 error = sbwait(&so->so_rcv);
3382 #if EVEN_MORE_LOCKING_DEBUG
3383 if (socket_debug)
3384 printf("SORECEIVE - sbwait returned %d\n", error);
3385 #endif
3386 if (so->so_usecount < 1) {
3387 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3388 __func__, so, so->so_usecount);
3389 /* NOTREACHED */
3390 }
3391 if (error) {
3392 socket_unlock(so, 1);
3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3394 0, 0, 0, 0);
3395 if (en_tracing) {
3396 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3397 VM_KERNEL_ADDRPERM(so), 0,
3398 (int64_t)(orig_resid - uio_resid(uio)));
3399 }
3400 return (error);
3401 }
3402 goto restart;
3403 }
3404 dontblock:
3405 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3406 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3407 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3408 nextrecord = m->m_nextpkt;
3409
3410 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3411 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3412 mp0 == NULL);
3413 if (error == ERESTART)
3414 goto restart;
3415 else if (error != 0)
3416 goto release;
3417 orig_resid = 0;
3418 }
3419
3420 /*
3421 * Process one or more MT_CONTROL mbufs present before any data mbufs
3422 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3423 * just copy the data; if !MSG_PEEK, we call into the protocol to
3424 * perform externalization.
3425 */
3426 if (m != NULL && m->m_type == MT_CONTROL) {
3427 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3428 if (error != 0)
3429 goto release;
3430 orig_resid = 0;
3431 }
3432
3433 /*
3434 * If the socket is a TCP socket with message delivery
3435 * enabled, then create a control msg to deliver the
3436 * relative TCP sequence number for this data. Waiting
3437 * until this point will protect against failures to
3438 * allocate an mbuf for control msgs.
3439 */
3440 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3441 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3442 struct mbuf *seq_cm;
3443
3444 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3445 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3446 if (seq_cm == NULL) {
3447 /* unable to allocate a control mbuf */
3448 error = ENOBUFS;
3449 goto release;
3450 }
3451 *controlp = seq_cm;
3452 controlp = &seq_cm->m_next;
3453 }
3454
3455 if (m != NULL) {
3456 if (!(flags & MSG_PEEK)) {
3457 /*
3458 * We get here because m points to an mbuf following
3459 * any MT_SONAME or MT_CONTROL mbufs which have been
3460 * processed above. In any case, m should be pointing
3461 * to the head of the mbuf chain, and the nextrecord
3462 * should be either NULL or equal to m->m_nextpkt.
3463 * See comments above about SB_LOCK.
3464 */
3465 if (m != so->so_rcv.sb_mb ||
3466 m->m_nextpkt != nextrecord) {
3467 panic("%s: post-control !sync so=%p m=%p "
3468 "nextrecord=%p\n", __func__, so, m,
3469 nextrecord);
3470 /* NOTREACHED */
3471 }
3472 if (nextrecord == NULL)
3473 so->so_rcv.sb_lastrecord = m;
3474 }
3475 type = m->m_type;
3476 if (type == MT_OOBDATA)
3477 flags |= MSG_OOB;
3478 } else {
3479 if (!(flags & MSG_PEEK)) {
3480 SB_EMPTY_FIXUP(&so->so_rcv);
3481 }
3482 }
3483 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3484 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3485
3486 moff = 0;
3487 offset = 0;
3488
3489 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3490 can_delay = 1;
3491 else
3492 can_delay = 0;
3493
3494 need_event = 0;
3495
3496 while (m != NULL &&
3497 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3498 if (m->m_type == MT_OOBDATA) {
3499 if (type != MT_OOBDATA)
3500 break;
3501 } else if (type == MT_OOBDATA) {
3502 break;
3503 }
3504 /*
3505 * Make sure to always set the MSG_OOB event when getting
3506 * out-of-band data inline.
3507 */
3508 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3509 (so->so_options & SO_OOBINLINE) != 0 &&
3510 (so->so_state & SS_RCVATMARK) != 0) {
3511 flags |= MSG_OOB;
3512 }
3513 so->so_state &= ~SS_RCVATMARK;
3514 len = uio_resid(uio) - delayed_copy_len;
3515 if (so->so_oobmark && len > so->so_oobmark - offset)
3516 len = so->so_oobmark - offset;
3517 if (len > m->m_len - moff)
3518 len = m->m_len - moff;
3519 /*
3520 * If mp is set, just pass back the mbufs.
3521 * Otherwise copy them out via the uio, then free.
3522 * Sockbuf must be consistent here (sb_mb points to the current
3523 * mbuf, m_nextpkt points to the next record) when we drop priority;
3524 * we must note any additions to the sockbuf when we
3525 * block interrupts again.
3526 */
3527 if (mp == NULL) {
3528 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3529 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3530 if (can_delay && len == m->m_len) {
3531 /*
3532 * Only delay the copy if we're consuming the
3533 * mbuf and we're NOT in MSG_PEEK mode
3534 * and we have enough data to make it worthwhile
3535 * to drop and retake the lock... can_delay
3536 * reflects the state of the 2 latter
3537 * constraints; moff should always be zero
3538 * in these cases.
3539 */
3540 delayed_copy_len += len;
3541 } else {
3542 if (delayed_copy_len) {
3543 error = sodelayed_copy(so, uio,
3544 &free_list, &delayed_copy_len);
3545
3546 if (error) {
3547 goto release;
3548 }
3549 /*
3550 * We can only get here if MSG_PEEK is not
3551 * set; therefore, m should point at the
3552 * head of the rcv queue.  If it doesn't,
3553 * it means something drastically
3554 * changed while we were out from behind
3555 * the lock in sodelayed_copy, perhaps
3556 * a RST on the stream.  In any event,
3557 * the stream has been interrupted.  It's
3558 * probably best just to return whatever
3559 * data we've moved and let the caller
3560 * sort it out...
3561 */
3562 if (m != so->so_rcv.sb_mb) {
3563 break;
3564 }
3565 }
3566 socket_unlock(so, 0);
3567 error = uiomove(mtod(m, caddr_t) + moff,
3568 (int)len, uio);
3569 socket_lock(so, 0);
3570
3571 if (error)
3572 goto release;
3573 }
3574 } else {
3575 uio_setresid(uio, (uio_resid(uio) - len));
3576 }
3577 if (len == m->m_len - moff) {
3578 if (m->m_flags & M_EOR)
3579 flags |= MSG_EOR;
3580 if (flags & MSG_PEEK) {
3581 m = m->m_next;
3582 moff = 0;
3583 } else {
3584 nextrecord = m->m_nextpkt;
3585 sbfree(&so->so_rcv, m);
3586 m->m_nextpkt = NULL;
3587
3588 /*
3589 * If this packet is an unordered packet
3590 * (indicated by M_UNORDERED_DATA flag), remove
3591 * the additional bytes added to the
3592 * receive socket buffer size.
3593 */
3594 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3595 m->m_len &&
3596 (m->m_flags & M_UNORDERED_DATA) &&
3597 sbreserve(&so->so_rcv,
3598 so->so_rcv.sb_hiwat - m->m_len)) {
3599 if (so->so_msg_state->msg_uno_bytes >
3600 m->m_len) {
3601 so->so_msg_state->
3602 msg_uno_bytes -= m->m_len;
3603 } else {
3604 so->so_msg_state->
3605 msg_uno_bytes = 0;
3606 }
3607 m->m_flags &= ~M_UNORDERED_DATA;
3608 }
3609
3610 if (mp != NULL) {
3611 *mp = m;
3612 mp = &m->m_next;
3613 so->so_rcv.sb_mb = m = m->m_next;
3614 *mp = NULL;
3615 } else {
3616 if (free_list == NULL)
3617 free_list = m;
3618 else
3619 ml->m_next = m;
3620 ml = m;
3621 so->so_rcv.sb_mb = m = m->m_next;
3622 ml->m_next = NULL;
3623 }
3624 if (m != NULL) {
3625 m->m_nextpkt = nextrecord;
3626 if (nextrecord == NULL)
3627 so->so_rcv.sb_lastrecord = m;
3628 } else {
3629 so->so_rcv.sb_mb = nextrecord;
3630 SB_EMPTY_FIXUP(&so->so_rcv);
3631 }
3632 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3633 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3634 }
3635 } else {
3636 if (flags & MSG_PEEK) {
3637 moff += len;
3638 } else {
3639 if (mp != NULL) {
3640 int copy_flag;
3641
3642 if (flags & MSG_DONTWAIT)
3643 copy_flag = M_DONTWAIT;
3644 else
3645 copy_flag = M_WAIT;
3646 *mp = m_copym(m, 0, len, copy_flag);
3647 /*
3648 * Failed to allocate an mbuf?
3649 * Adjust uio_resid back, it was
3650 * adjusted down by len bytes which
3651 * we didn't copy over.
3652 */
3653 if (*mp == NULL) {
3654 uio_setresid(uio,
3655 (uio_resid(uio) + len));
3656 break;
3657 }
3658 }
3659 m->m_data += len;
3660 m->m_len -= len;
3661 so->so_rcv.sb_cc -= len;
3662 }
3663 }
3664 if (so->so_oobmark) {
3665 if ((flags & MSG_PEEK) == 0) {
3666 so->so_oobmark -= len;
3667 if (so->so_oobmark == 0) {
3668 so->so_state |= SS_RCVATMARK;
3669 /*
3670 * delay posting the actual event until
3671 * after any delayed copy processing
3672 * has finished
3673 */
3674 need_event = 1;
3675 break;
3676 }
3677 } else {
3678 offset += len;
3679 if (offset == so->so_oobmark)
3680 break;
3681 }
3682 }
3683 if (flags & MSG_EOR)
3684 break;
3685 /*
3686 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3687 * (for non-atomic socket), we must not quit until
3688 * "uio->uio_resid == 0" or an error termination.
3689 * If a signal/timeout occurs, return with a short
3690 * count but without error. Keep sockbuf locked
3691 * against other readers.
3692 */
3693 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3694 (uio_resid(uio) - delayed_copy_len) > 0 &&
3695 !sosendallatonce(so) && !nextrecord) {
3696 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3697 #if CONTENT_FILTER
3698 && cfil_sock_data_pending(&so->so_rcv) == 0
3699 #endif /* CONTENT_FILTER */
3700 ))
3701 goto release;
3702
3703 /*
3704 * Depending on the protocol (e.g. TCP), the following
3705 * might cause the socket lock to be dropped and later
3706 * be reacquired, and more data could have arrived and
3707 * have been appended to the receive socket buffer by
3708 * the time it returns.  Therefore, we sleep in
3709 * sbwait() below if and only if the socket buffer is
3710 * empty, in order to avoid a false sleep.
3711 */
3712 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3713 (((struct inpcb *)so->so_pcb)->inp_state !=
3714 INPCB_STATE_DEAD))
3715 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3716
3717 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3718 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3719
3720 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3721 error = 0;
3722 goto release;
3723 }
3724 /*
3725 * We have to wait until after we get back from the sbwait
3726 * to do the copy because we will drop the lock if we
3727 * have enough data that has been delayed... by dropping
3728 * the lock we open up a window allowing the netisr
3729 * thread to process the incoming packets and to change
3730 * the state of this socket... we're issuing the sbwait
3731 * because the socket is empty and we're expecting the
3732 * netisr thread to wake us up when more packets arrive;
3733 * if we allow that processing to happen and then sbwait
3734 * we could stall forever with packets sitting in the
3735 * socket if no further packets arrive from the remote
3736 * side.
3737 *
3738 * We want to copy before we've collected all the data
3739 * to satisfy this request, to allow the copy to overlap
3740 * the incoming packet processing on an MP system.
3741 */
3742 if (delayed_copy_len > sorecvmincopy &&
3743 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3744 error = sodelayed_copy(so, uio,
3745 &free_list, &delayed_copy_len);
3746
3747 if (error)
3748 goto release;
3749 }
3750 m = so->so_rcv.sb_mb;
3751 if (m != NULL) {
3752 nextrecord = m->m_nextpkt;
3753 }
3754 SB_MB_CHECK(&so->so_rcv);
3755 }
3756 }
3757 #ifdef MORE_LOCKING_DEBUG
3758 if (so->so_usecount <= 1) {
3759 panic("%s: after big while so=%p ref=%d on socket\n",
3760 __func__, so, so->so_usecount);
3761 /* NOTREACHED */
3762 }
3763 #endif
3764
3765 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3766 if (so->so_options & SO_DONTTRUNC) {
3767 flags |= MSG_RCVMORE;
3768 } else {
3769 flags |= MSG_TRUNC;
3770 if ((flags & MSG_PEEK) == 0)
3771 (void) sbdroprecord(&so->so_rcv);
3772 }
3773 }
3774
3775 /*
3776 * pru_rcvd below (for TCP) may cause more data to be received
3777 * if the socket lock is dropped prior to sending the ACK; some
3778 * legacy OpenTransport applications don't handle this well
3779 * (if it receives less data than requested while MSG_HAVEMORE
3780 * is set), and so we set the flag now based on what we know
3781 * prior to calling pru_rcvd.
3782 */
3783 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3784 flags |= MSG_HAVEMORE;
3785
3786 if ((flags & MSG_PEEK) == 0) {
3787 if (m == NULL) {
3788 so->so_rcv.sb_mb = nextrecord;
3789 /*
3790 * First part is an inline SB_EMPTY_FIXUP(). Second
3791 * part makes sure sb_lastrecord is up-to-date if
3792 * there is still data in the socket buffer.
3793 */
3794 if (so->so_rcv.sb_mb == NULL) {
3795 so->so_rcv.sb_mbtail = NULL;
3796 so->so_rcv.sb_lastrecord = NULL;
3797 } else if (nextrecord->m_nextpkt == NULL) {
3798 so->so_rcv.sb_lastrecord = nextrecord;
3799 }
3800 SB_MB_CHECK(&so->so_rcv);
3801 }
3802 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3803 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3804 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3805 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3806 }
3807
3808 if (delayed_copy_len) {
3809 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3810 if (error)
3811 goto release;
3812 }
3813 if (free_list != NULL) {
3814 m_freem_list(free_list);
3815 free_list = NULL;
3816 }
3817 if (need_event)
3818 postevent(so, 0, EV_OOB);
3819
3820 if (orig_resid == uio_resid(uio) && orig_resid &&
3821 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3822 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3823 goto restart;
3824 }
3825
3826 if (flagsp != NULL)
3827 *flagsp |= flags;
3828 release:
3829 #ifdef MORE_LOCKING_DEBUG
3830 if (so->so_usecount <= 1) {
3831 panic("%s: release so=%p ref=%d on socket\n", __func__,
3832 so, so->so_usecount);
3833 /* NOTREACHED */
3834 }
3835 #endif
3836 if (delayed_copy_len)
3837 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3838
3839 if (free_list != NULL)
3840 m_freem_list(free_list);
3841
3842 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3843
3844 if (en_tracing) {
3845 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3846 VM_KERNEL_ADDRPERM(so),
3847 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3848 (int64_t)(orig_resid - uio_resid(uio)));
3849 }
3850 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3851 so->so_rcv.sb_cc, 0, error);
3852
3853 return (error);
3854 }
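/*
 * Editor's illustrative sketch (not part of the kernel source): the
 * MSG_TRUNC handling in soreceive() above is what a userspace datagram
 * caller observes in msg_flags when its buffer is too small.  The
 * function name, buffer size and error handling below are assumptions
 * made for the example only; the API used is standard recvmsg(2).
 */
#include <sys/socket.h>
#include <stdio.h>
#include <string.h>

static void
example_recv_datagram(int s)
{
	char buf[512];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof (buf) };
	struct msghdr msg;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	ssize_t n = recvmsg(s, &msg, 0);
	if (n >= 0 && (msg.msg_flags & MSG_TRUNC))
		printf("datagram truncated to %zd bytes\n", n);
}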
3855
3856 /*
3857 * Returns: 0 Success
3858 * uiomove:EFAULT
3859 */
3860 static int
3861 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3862 user_ssize_t *resid)
3863 {
3864 int error = 0;
3865 struct mbuf *m;
3866
3867 m = *free_list;
3868
3869 socket_unlock(so, 0);
3870
3871 while (m != NULL && error == 0) {
3872 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3873 m = m->m_next;
3874 }
3875 m_freem_list(*free_list);
3876
3877 *free_list = NULL;
3878 *resid = 0;
3879
3880 socket_lock(so, 0);
3881
3882 return (error);
3883 }
3884
3885 static int
3886 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3887 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3888 {
3889 #pragma unused(so)
3890 int error = 0;
3891 struct mbuf *ml, *m;
3892 int i = 0;
3893 struct uio *auio;
3894
3895 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3896 ml = ml->m_nextpkt, i++) {
3897 auio = msgarray[i].uio;
3898 for (m = ml; m != NULL; m = m->m_next) {
3899 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3900 if (error != 0)
3901 goto out;
3902 }
3903 }
3904 out:
3905 m_freem_list(*free_list);
3906
3907 *free_list = NULL;
3908 *resid = 0;
3909
3910 return (error);
3911 }
3912
3913 int
3914 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3915 int *flagsp)
3916 {
3917 struct mbuf *m;
3918 struct mbuf *nextrecord;
3919 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3920 int error;
3921 user_ssize_t len, pktlen, delayed_copy_len = 0;
3922 struct protosw *pr = so->so_proto;
3923 user_ssize_t resid;
3924 struct proc *p = current_proc();
3925 struct uio *auio = NULL;
3926 int npkts = 0;
3927 int sblocked = 0;
3928 struct sockaddr **psa = NULL;
3929 struct mbuf **controlp = NULL;
3930 int can_delay;
3931 int flags;
3932 struct mbuf *free_others = NULL;
3933
3934 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3935 so, uiocnt,
3936 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3937
3938 /*
3939 * Sanity checks:
3940 * - Only the don't-wait style flags are supported
3941 * - Only datagram sockets are supported (could be extended to raw)
3942 * - Must be atomic
3943 * - Protocol must support packet chains
3944 * - The uio array must not be NULL (should we panic?)
3945 */
3946 if (flagsp != NULL)
3947 flags = *flagsp;
3948 else
3949 flags = 0;
3950 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3951 MSG_NBIO)) {
3952 printf("%s invalid flags 0x%x\n", __func__, flags);
3953 error = EINVAL;
3954 goto out;
3955 }
3956 if (so->so_type != SOCK_DGRAM) {
3957 error = EINVAL;
3958 goto out;
3959 }
3960 if (sosendallatonce(so) == 0) {
3961 error = EINVAL;
3962 goto out;
3963 }
3964 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3965 error = EPROTONOSUPPORT;
3966 goto out;
3967 }
3968 if (msgarray == NULL) {
3969 printf("%s uioarray is NULL\n", __func__);
3970 error = EINVAL;
3971 goto out;
3972 }
3973 if (uiocnt == 0) {
3974 printf("%s uiocnt is 0\n", __func__);
3975 error = EINVAL;
3976 goto out;
3977 }
3978 /*
3979 * Sanity check on the length passed by caller as we are making 'int'
3980 * comparisons
3981 */
3982 resid = recv_msg_array_resid(msgarray, uiocnt);
3983 if (resid < 0 || resid > INT_MAX) {
3984 error = EINVAL;
3985 goto out;
3986 }
3987
3988 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3989 can_delay = 1;
3990 else
3991 can_delay = 0;
3992
3993 socket_lock(so, 1);
3994 so_update_last_owner_locked(so, p);
3995 so_update_policy(so);
3996
3997 #if NECP
3998 so_update_necp_policy(so, NULL, NULL);
3999 #endif /* NECP */
4000
4001 /*
4002 * If a recv attempt is made on a previously-accepted socket
4003 * that has been marked as inactive (disconnected), reject
4004 * the request.
4005 */
4006 if (so->so_flags & SOF_DEFUNCT) {
4007 struct sockbuf *sb = &so->so_rcv;
4008
4009 error = ENOTCONN;
4010 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4011 __func__, proc_pid(p), proc_best_name(p),
4012 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4013 SOCK_DOM(so), SOCK_TYPE(so), error);
4014 /*
4015 * This socket should have been disconnected and flushed
4016 * prior to being returned from sodefunct(); there should
4017 * be no data on its receive list, so panic otherwise.
4018 */
4019 if (so->so_state & SS_DEFUNCT)
4020 sb_empty_assert(sb, __func__);
4021 goto release;
4022 }
4023
4024 next:
4025 /*
4026 * The uio may be empty
4027 */
4028 if (npkts >= uiocnt) {
4029 error = 0;
4030 goto release;
4031 }
4032 restart:
4033 /*
4034 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4035 * and if so just return to the caller. This could happen when
4036 * soreceive() is called by a socket upcall function during the
4037 * time the socket is freed. The socket buffer would have been
4038 * locked across the upcall, therefore we cannot put this thread
4039 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4040 * we may livelock), because the lock on the socket buffer will
4041 * only be released when the upcall routine returns to its caller.
4042 * Because the socket has been officially closed, there can be
4043 * no further read on it.
4044 */
4045 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4046 (SS_NOFDREF | SS_CANTRCVMORE)) {
4047 error = 0;
4048 goto release;
4049 }
4050
4051 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4052 if (error) {
4053 goto release;
4054 }
4055 sblocked = 1;
4056
4057 m = so->so_rcv.sb_mb;
4058 /*
4059 * Block awaiting more datagrams if needed
4060 */
4061 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4062 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4063 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4064 /*
4065 * Panic if we notice inconsistencies in the socket's
4066 * receive list; both sb_mb and sb_cc should correctly
4067 * reflect the contents of the list, otherwise we may
4068 * end up with false positives during select() or poll()
4069 * which could put the application in a bad state.
4070 */
4071 SB_MB_CHECK(&so->so_rcv);
4072
4073 if (so->so_error) {
4074 error = so->so_error;
4075 if ((flags & MSG_PEEK) == 0)
4076 so->so_error = 0;
4077 goto release;
4078 }
4079 if (so->so_state & SS_CANTRCVMORE) {
4080 goto release;
4081 }
4082 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4083 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4084 error = ENOTCONN;
4085 goto release;
4086 }
4087 if ((so->so_state & SS_NBIO) ||
4088 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4089 error = EWOULDBLOCK;
4090 goto release;
4091 }
4092 /*
4093 * Do not block if we got some data
4094 */
4095 if (free_list != NULL) {
4096 error = 0;
4097 goto release;
4098 }
4099
4100 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4101 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4102
4103 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4104 sblocked = 0;
4105
4106 error = sbwait(&so->so_rcv);
4107 if (error) {
4108 goto release;
4109 }
4110 goto restart;
4111 }
4112
4113 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4114 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4115 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4116
4117 /*
4118 * Consume the current uio index as we have a datagram
4119 */
4120 auio = msgarray[npkts].uio;
4121 resid = uio_resid(auio);
4122 msgarray[npkts].which |= SOCK_MSG_DATA;
4123 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4124 &msgarray[npkts].psa : NULL;
4125 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4126 &msgarray[npkts].controlp : NULL;
4127 npkts += 1;
4128 nextrecord = m->m_nextpkt;
4129
4130 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4131 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4132 if (error == ERESTART)
4133 goto restart;
4134 else if (error != 0)
4135 goto release;
4136 }
4137
4138 if (m != NULL && m->m_type == MT_CONTROL) {
4139 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4140 if (error != 0)
4141 goto release;
4142 }
4143
4144 if (m->m_pkthdr.len == 0) {
4145 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4146 __func__, __LINE__,
4147 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4148 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4149 m->m_type);
4150 }
4151
4152 /*
4153 * Loop to copy the mbufs of the current record
4154 * Support zero length packets
4155 */
4156 ml = NULL;
4157 pktlen = 0;
4158 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4159 if (m->m_len == 0)
4160 panic("%p m_len zero", m);
4161 if (m->m_type == 0)
4162 panic("%p m_type zero", m);
4163 /*
4164 * Clip to the residual length
4165 */
4166 if (len > m->m_len)
4167 len = m->m_len;
4168 pktlen += len;
4169 /*
4170 * Copy the mbufs via the uio or delay the copy
4171 * The sockbuf must be consistent here (sb_mb points to the current
4172 * mbuf, nextrecord points to the next record) when we drop priority;
4173 * we must note any additions to the sockbuf when we
4174 * block interrupts again.
4175 */
4176 if (len > 0 && can_delay == 0) {
4177 socket_unlock(so, 0);
4178 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4179 socket_lock(so, 0);
4180 if (error)
4181 goto release;
4182 } else {
4183 delayed_copy_len += len;
4184 }
4185
4186 if (len == m->m_len) {
4187 /*
4188 * m was entirely copied
4189 */
4190 sbfree(&so->so_rcv, m);
4191 nextrecord = m->m_nextpkt;
4192 m->m_nextpkt = NULL;
4193
4194 /*
4195 * Set the first packet to the head of the free list
4196 */
4197 if (free_list == NULL)
4198 free_list = m;
4199 /*
4200 * Link current packet to tail of free list
4201 */
4202 if (ml == NULL) {
4203 if (free_tail != NULL)
4204 free_tail->m_nextpkt = m;
4205 free_tail = m;
4206 }
4207 /*
4208 * Link current mbuf to last mbuf of current packet
4209 */
4210 if (ml != NULL)
4211 ml->m_next = m;
4212 ml = m;
4213
4214 /*
4215 * Move next buf to head of socket buffer
4216 */
4217 so->so_rcv.sb_mb = m = ml->m_next;
4218 ml->m_next = NULL;
4219
4220 if (m != NULL) {
4221 m->m_nextpkt = nextrecord;
4222 if (nextrecord == NULL)
4223 so->so_rcv.sb_lastrecord = m;
4224 } else {
4225 so->so_rcv.sb_mb = nextrecord;
4226 SB_EMPTY_FIXUP(&so->so_rcv);
4227 }
4228 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4229 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4230 } else {
4231 /*
4232 * Stop the loop on partial copy
4233 */
4234 break;
4235 }
4236 }
4237 #ifdef MORE_LOCKING_DEBUG
4238 if (so->so_usecount <= 1) {
4239 panic("%s: after big while so=%llx ref=%d on socket\n",
4240 __func__,
4241 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4242 /* NOTREACHED */
4243 }
4244 #endif
4245 /*
4246 * Tell the caller we made a partial copy
4247 */
4248 if (m != NULL) {
4249 if (so->so_options & SO_DONTTRUNC) {
4250 /*
4251 * Copy out the free list first, then the partial mbuf
4252 */
4253 socket_unlock(so, 0);
4254 if (delayed_copy_len)
4255 error = sodelayed_copy_list(so, msgarray,
4256 uiocnt, &free_list, &delayed_copy_len);
4257
4258 if (error == 0) {
4259 error = uiomove(mtod(m, caddr_t), (int)len,
4260 auio);
4261 }
4262 socket_lock(so, 0);
4263 if (error)
4264 goto release;
4265
4266 m->m_data += len;
4267 m->m_len -= len;
4268 so->so_rcv.sb_cc -= len;
4269 flags |= MSG_RCVMORE;
4270 } else {
4271 (void) sbdroprecord(&so->so_rcv);
4272 nextrecord = so->so_rcv.sb_mb;
4273 m = NULL;
4274 flags |= MSG_TRUNC;
4275 }
4276 }
4277
4278 if (m == NULL) {
4279 so->so_rcv.sb_mb = nextrecord;
4280 /*
4281 * First part is an inline SB_EMPTY_FIXUP(). Second
4282 * part makes sure sb_lastrecord is up-to-date if
4283 * there is still data in the socket buffer.
4284 */
4285 if (so->so_rcv.sb_mb == NULL) {
4286 so->so_rcv.sb_mbtail = NULL;
4287 so->so_rcv.sb_lastrecord = NULL;
4288 } else if (nextrecord->m_nextpkt == NULL) {
4289 so->so_rcv.sb_lastrecord = nextrecord;
4290 }
4291 SB_MB_CHECK(&so->so_rcv);
4292 }
4293 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4294 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4295
4296 /*
4297 * We can continue to the next packet as long as:
4298 * - We haven't exhausted the uio array
4299 * - There was no error
4300 * - A packet was not truncated
4301 * - We can still receive more data
4302 */
4303 if (npkts < uiocnt && error == 0 &&
4304 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4305 (so->so_state & SS_CANTRCVMORE) == 0) {
4306 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4307 sblocked = 0;
4308
4309 goto next;
4310 }
4311 if (flagsp != NULL)
4312 *flagsp |= flags;
4313
4314 release:
4315 /*
4316 * pru_rcvd may cause more data to be received if the socket lock
4317 * is dropped so we set MSG_HAVEMORE now based on what we know.
4318 * That way the caller won't be surprised if it receives less data
4319 * than requested.
4320 */
4321 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4322 flags |= MSG_HAVEMORE;
4323
4324 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4325 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4326
4327 if (sblocked)
4328 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4329 else
4330 socket_unlock(so, 1);
4331
4332 if (delayed_copy_len)
4333 error = sodelayed_copy_list(so, msgarray, uiocnt,
4334 &free_list, &delayed_copy_len);
4335 out:
4336 /*
4337 * Amortize the cost of freeing the mbufs
4338 */
4339 if (free_list != NULL)
4340 m_freem_list(free_list);
4341 if (free_others != NULL)
4342 m_freem_list(free_others);
4343
4344 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4345 0, 0, 0, 0);
4346 return (error);
4347 }
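/*
 * Editor's illustrative sketch (an assumption, not the interface that
 * actually drives soreceive_list()): the batched receive above fills
 * one uio per datagram; the portable per-packet analogue in userspace
 * is a loop of recvmsg(2) calls.  Names and buffer sizes below are
 * hypothetical.
 */
#include <sys/socket.h>
#include <string.h>

static int
example_recv_burst(int s, char bufs[][2048], int cnt)
{
	int i;

	for (i = 0; i < cnt; i++) {
		struct iovec iov = { .iov_base = bufs[i], .iov_len = 2048 };
		struct msghdr msg;

		memset(&msg, 0, sizeof (msg));
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		/* MSG_DONTWAIT: stop as soon as the receive buffer drains */
		if (recvmsg(s, &msg, MSG_DONTWAIT) < 0)
			break;
	}
	return (i);	/* number of datagrams actually read */
}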
4348
4349 /*
4350 * Returns: 0 Success
4351 * EINVAL
4352 * ENOTCONN
4353 * <pru_shutdown>:EINVAL
4354 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4355 * <pru_shutdown>:ENOBUFS[TCP]
4356 * <pru_shutdown>:EMSGSIZE[TCP]
4357 * <pru_shutdown>:EHOSTUNREACH[TCP]
4358 * <pru_shutdown>:ENETUNREACH[TCP]
4359 * <pru_shutdown>:ENETDOWN[TCP]
4360 * <pru_shutdown>:ENOMEM[TCP]
4361 * <pru_shutdown>:EACCES[TCP]
4362 * <pru_shutdown>:EMSGSIZE[TCP]
4363 * <pru_shutdown>:ENOBUFS[TCP]
4364 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4365 * <pru_shutdown>:??? [other protocol families]
4366 */
4367 int
4368 soshutdown(struct socket *so, int how)
4369 {
4370 int error;
4371
4372 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4373
4374 switch (how) {
4375 case SHUT_RD:
4376 case SHUT_WR:
4377 case SHUT_RDWR:
4378 socket_lock(so, 1);
4379 if ((so->so_state &
4380 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4381 error = ENOTCONN;
4382 } else {
4383 error = soshutdownlock(so, how);
4384 }
4385 socket_unlock(so, 1);
4386 break;
4387 default:
4388 error = EINVAL;
4389 break;
4390 }
4391
4392 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4393
4394 return (error);
4395 }
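/*
 * Editor's illustrative sketch: soshutdown() above returns ENOTCONN
 * unless the socket is connected, connecting or disconnecting, so a
 * shutdown(2) on an unconnected socket fails as shown.  The function
 * name and error handling are assumptions for the example only.
 */
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>

static void
example_shutdown_write(int s)
{
	/* Half-close the send side; the receive side stays open. */
	if (shutdown(s, SHUT_WR) == -1 && errno == ENOTCONN)
		printf("socket %d is not connected\n", s);
}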
4396
4397 int
4398 soshutdownlock_final(struct socket *so, int how)
4399 {
4400 struct protosw *pr = so->so_proto;
4401 int error = 0;
4402
4403 sflt_notify(so, sock_evt_shutdown, &how);
4404
4405 if (how != SHUT_WR) {
4406 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4407 /* read already shut down */
4408 error = ENOTCONN;
4409 goto done;
4410 }
4411 sorflush(so);
4412 postevent(so, 0, EV_RCLOSED);
4413 }
4414 if (how != SHUT_RD) {
4415 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4416 /* write already shut down */
4417 error = ENOTCONN;
4418 goto done;
4419 }
4420 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4421 postevent(so, 0, EV_WCLOSED);
4422 }
4423 done:
4424 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4425 return (error);
4426 }
4427
4428 int
4429 soshutdownlock(struct socket *so, int how)
4430 {
4431 int error = 0;
4432
4433 #if CONTENT_FILTER
4434 /*
4435 * A content filter may delay the actual shutdown until it
4436 * has processed the pending data
4437 */
4438 if (so->so_flags & SOF_CONTENT_FILTER) {
4439 error = cfil_sock_shutdown(so, &how);
4440 if (error == EJUSTRETURN) {
4441 error = 0;
4442 goto done;
4443 } else if (error != 0) {
4444 goto done;
4445 }
4446 }
4447 #endif /* CONTENT_FILTER */
4448
4449 error = soshutdownlock_final(so, how);
4450
4451 done:
4452 return (error);
4453 }
4454
4455 void
4456 sowflush(struct socket *so)
4457 {
4458 struct sockbuf *sb = &so->so_snd;
4459
4460 /*
4461 * Obtain lock on the socket buffer (SB_LOCK). This is required
4462 * to prevent the socket buffer from being unexpectedly altered
4463 * while it is used by another thread in socket send/receive.
4464 *
4465 * sblock() must not fail here, hence the assertion.
4466 */
4467 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4468 VERIFY(sb->sb_flags & SB_LOCK);
4469
4470 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4471 sb->sb_flags |= SB_DROP;
4472 sb->sb_upcall = NULL;
4473 sb->sb_upcallarg = NULL;
4474
4475 sbunlock(sb, TRUE); /* keep socket locked */
4476
4477 selthreadclear(&sb->sb_sel);
4478 sbrelease(sb);
4479 }
4480
4481 void
4482 sorflush(struct socket *so)
4483 {
4484 struct sockbuf *sb = &so->so_rcv;
4485 struct protosw *pr = so->so_proto;
4486 struct sockbuf asb;
4487 #ifdef notyet
4488 lck_mtx_t *mutex_held;
4489 /*
4490 * XXX: This code is currently commented out, because we may get here
4491 * as part of sofreelastref(), and at that time, pr_getlock() may no
4492 * longer be able to return us the lock; this will be fixed in the future.
4493 */
4494 if (so->so_proto->pr_getlock != NULL)
4495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4496 else
4497 mutex_held = so->so_proto->pr_domain->dom_mtx;
4498
4499 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4500 #endif /* notyet */
4501
4502 sflt_notify(so, sock_evt_flush_read, NULL);
4503
4504 socantrcvmore(so);
4505
4506 /*
4507 * Obtain lock on the socket buffer (SB_LOCK). This is required
4508 * to prevent the socket buffer from being unexpectedly altered
4509 * while it is used by another thread in socket send/receive.
4510 *
4511 * sblock() must not fail here, hence the assertion.
4512 */
4513 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4514 VERIFY(sb->sb_flags & SB_LOCK);
4515
4516 /*
4517 * Copy only the relevant fields from "sb" to "asb" which we
4518 * need for sbrelease() to function. In particular, skip
4519 * sb_sel as it contains the wait queue linkage, which would
4520 * wreak havoc if we were to issue selthreadclear() on "asb".
4521 * Make sure to not carry over SB_LOCK in "asb", as we need
4522 * to acquire it later as part of sbrelease().
4523 */
4524 bzero(&asb, sizeof (asb));
4525 asb.sb_cc = sb->sb_cc;
4526 asb.sb_hiwat = sb->sb_hiwat;
4527 asb.sb_mbcnt = sb->sb_mbcnt;
4528 asb.sb_mbmax = sb->sb_mbmax;
4529 asb.sb_ctl = sb->sb_ctl;
4530 asb.sb_lowat = sb->sb_lowat;
4531 asb.sb_mb = sb->sb_mb;
4532 asb.sb_mbtail = sb->sb_mbtail;
4533 asb.sb_lastrecord = sb->sb_lastrecord;
4534 asb.sb_so = sb->sb_so;
4535 asb.sb_flags = sb->sb_flags;
4536 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4537 asb.sb_flags |= SB_DROP;
4538
4539 /*
4540 * Ideally we'd bzero() these and preserve the ones we need;
4541 * but to do that we'd need to shuffle things around in the
4542 * sockbuf, and we can't do it now because there are KEXTS
4543 * that are directly referring to the socket structure.
4544 *
4545 * Setting SB_DROP acts as a barrier to prevent further appends.
4546 * Clearing SB_SEL is done for selthreadclear() below.
4547 */
4548 sb->sb_cc = 0;
4549 sb->sb_hiwat = 0;
4550 sb->sb_mbcnt = 0;
4551 sb->sb_mbmax = 0;
4552 sb->sb_ctl = 0;
4553 sb->sb_lowat = 0;
4554 sb->sb_mb = NULL;
4555 sb->sb_mbtail = NULL;
4556 sb->sb_lastrecord = NULL;
4557 sb->sb_timeo.tv_sec = 0;
4558 sb->sb_timeo.tv_usec = 0;
4559 sb->sb_upcall = NULL;
4560 sb->sb_upcallarg = NULL;
4561 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4562 sb->sb_flags |= SB_DROP;
4563
4564 sbunlock(sb, TRUE); /* keep socket locked */
4565
4566 /*
4567 * Note that selthreadclear() is called on the original "sb" and
4568 * not the local "asb" because of the way wait queue linkage is
4569 * implemented. Given that selwakeup() may be triggered, SB_SEL
4570 * should no longer be set (cleared above.)
4571 */
4572 selthreadclear(&sb->sb_sel);
4573
4574 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4575 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4576
4577 sbrelease(&asb);
4578 }
4579
4580 /*
4581 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4582 * an additional variant to handle the case where the option value needs
4583 * to be some kind of integer, but not a specific size.
4584 * In addition to their use here, these functions are also called by the
4585 * protocol-level pr_ctloutput() routines.
4586 *
4587 * Returns: 0 Success
4588 * EINVAL
4589 * copyin:EFAULT
4590 */
4591 int
4592 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4593 {
4594 size_t valsize;
4595
4596 /*
4597 * If the user gives us more than we wanted, we ignore it,
4598 * but if we don't get the minimum length the caller
4599 * wants, we return EINVAL. On success, sopt->sopt_valsize
4600 * is set to however much we actually retrieved.
4601 */
4602 if ((valsize = sopt->sopt_valsize) < minlen)
4603 return (EINVAL);
4604 if (valsize > len)
4605 sopt->sopt_valsize = valsize = len;
4606
4607 if (sopt->sopt_p != kernproc)
4608 return (copyin(sopt->sopt_val, buf, valsize));
4609
4610 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4611 return (0);
4612 }
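/*
 * Editor's minimal sketch of the calling pattern used throughout the
 * rest of this file and by protocol pr_ctloutput() routines: copy in
 * an int-sized option value with sooptcopyin(), then apply it.  The
 * function name is hypothetical; SOF_NOSIGPIPE is reused here purely
 * as an example flag, and the declarations come from the headers this
 * file already includes.
 */
static int
example_set_int_option(struct socket *so, struct sockopt *sopt)
{
	int optval, error;

	error = sooptcopyin(sopt, &optval, sizeof (optval),
	    sizeof (optval));
	if (error != 0)
		return (error);
	if (optval != 0)
		so->so_flags |= SOF_NOSIGPIPE;
	else
		so->so_flags &= ~SOF_NOSIGPIPE;
	return (0);
}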
4613
4614 /*
4615 * sooptcopyin_timeval
4616 * Copy in a timeval value into tv_p, taking into account whether the
4617 * calling process is 64-bit or 32-bit. The sanity-checking code was moved
4618 * here so that we can verify the 64-bit tv_sec value before we lose the
4619 * top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
4620 */
4621 static int
4622 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4623 {
4624 int error;
4625
4626 if (proc_is64bit(sopt->sopt_p)) {
4627 struct user64_timeval tv64;
4628
4629 if (sopt->sopt_valsize < sizeof (tv64))
4630 return (EINVAL);
4631
4632 sopt->sopt_valsize = sizeof (tv64);
4633 if (sopt->sopt_p != kernproc) {
4634 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4635 if (error != 0)
4636 return (error);
4637 } else {
4638 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4639 sizeof (tv64));
4640 }
4641 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4642 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4643 return (EDOM);
4644
4645 tv_p->tv_sec = tv64.tv_sec;
4646 tv_p->tv_usec = tv64.tv_usec;
4647 } else {
4648 struct user32_timeval tv32;
4649
4650 if (sopt->sopt_valsize < sizeof (tv32))
4651 return (EINVAL);
4652
4653 sopt->sopt_valsize = sizeof (tv32);
4654 if (sopt->sopt_p != kernproc) {
4655 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4656 if (error != 0) {
4657 return (error);
4658 }
4659 } else {
4660 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4661 sizeof (tv32));
4662 }
4663 #ifndef __LP64__
4664 /*
4665 * K64todo "comparison is always false due to
4666 * limited range of data type"
4667 */
4668 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4669 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4670 return (EDOM);
4671 #endif
4672 tv_p->tv_sec = tv32.tv_sec;
4673 tv_p->tv_usec = tv32.tv_usec;
4674 }
4675 return (0);
4676 }
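/*
 * Editor's illustrative sketch: SO_RCVTIMEO/SO_SNDTIMEO values arrive
 * in sooptcopyin_timeval() above as a struct timeval, and a tv_usec
 * outside [0, 1000000) is rejected with EDOM.  The 2.5 second timeout
 * and function name below are arbitrary choices for the example.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int
example_set_recv_timeout(int s)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	return (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}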
4677
4678 int
4679 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
4680 {
4681 kauth_cred_t cred = NULL;
4682 proc_t ep = PROC_NULL;
4683 uid_t uid;
4684 int error = 0;
4685
4686 if (so->so_flags & SOF_DELEGATED) {
4687 ep = proc_find(so->e_pid);
4688 if (ep)
4689 cred = kauth_cred_proc_ref(ep);
4690 }
4691
4692 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4693
4694 /* uid is 0 for root */
4695 if (uid != 0 || !allow_root)
4696 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4697 if (cred)
4698 kauth_cred_unref(&cred);
4699 if (ep != PROC_NULL)
4700 proc_rele(ep);
4701
4702 return (error);
4703 }
4704
4705 /*
4706 * Returns: 0 Success
4707 * EINVAL
4708 * ENOPROTOOPT
4709 * ENOBUFS
4710 * EDOM
4711 * sooptcopyin:EINVAL
4712 * sooptcopyin:EFAULT
4713 * sooptcopyin_timeval:EINVAL
4714 * sooptcopyin_timeval:EFAULT
4715 * sooptcopyin_timeval:EDOM
4716 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4717 * <pr_ctloutput>:???
4718 * sflt_attach_private:??? [whatever a filter author chooses]
4719 * <sf_setoption>:??? [whatever a filter author chooses]
4720 *
4721 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4722 * <sf_setoption> returns depend on what the filter author causes
4723 * their filter to return.
4724 */
4725 int
4726 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4727 {
4728 int error, optval;
4729 struct linger l;
4730 struct timeval tv;
4731 #if CONFIG_MACF_SOCKET
4732 struct mac extmac;
4733 #endif /* MAC_SOCKET */
4734
4735 if (sopt->sopt_dir != SOPT_SET)
4736 sopt->sopt_dir = SOPT_SET;
4737
4738 if (dolock)
4739 socket_lock(so, 1);
4740
4741 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4742 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4743 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4744 /* the socket has been shut down, no more sockopts */
4745 error = EINVAL;
4746 goto out;
4747 }
4748
4749 error = sflt_setsockopt(so, sopt);
4750 if (error != 0) {
4751 if (error == EJUSTRETURN)
4752 error = 0;
4753 goto out;
4754 }
4755
4756 if (sopt->sopt_level != SOL_SOCKET) {
4757 if (so->so_proto != NULL &&
4758 so->so_proto->pr_ctloutput != NULL) {
4759 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4760 goto out;
4761 }
4762 error = ENOPROTOOPT;
4763 } else {
4764 /*
4765 * Allow socket-level (SOL_SOCKET) options to be filtered by
4766 * the protocol layer, if needed. A zero value returned from
4767 * the handler means use default socket-level processing as
4768 * done by the rest of this routine. Otherwise, any other
4769 * return value indicates that the option is unsupported.
4770 */
4771 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4772 pru_socheckopt(so, sopt)) != 0)
4773 goto out;
4774
4775 error = 0;
4776 switch (sopt->sopt_name) {
4777 case SO_LINGER:
4778 case SO_LINGER_SEC:
4779 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4780 if (error != 0)
4781 goto out;
4782
4783 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4784 l.l_linger : l.l_linger * hz;
4785 if (l.l_onoff != 0)
4786 so->so_options |= SO_LINGER;
4787 else
4788 so->so_options &= ~SO_LINGER;
4789 break;
4790
4791 case SO_DEBUG:
4792 case SO_KEEPALIVE:
4793 case SO_DONTROUTE:
4794 case SO_USELOOPBACK:
4795 case SO_BROADCAST:
4796 case SO_REUSEADDR:
4797 case SO_REUSEPORT:
4798 case SO_OOBINLINE:
4799 case SO_TIMESTAMP:
4800 case SO_TIMESTAMP_MONOTONIC:
4801 case SO_DONTTRUNC:
4802 case SO_WANTMORE:
4803 case SO_WANTOOBFLAG:
4804 case SO_NOWAKEFROMSLEEP:
4805 case SO_NOAPNFALLBK:
4806 error = sooptcopyin(sopt, &optval, sizeof (optval),
4807 sizeof (optval));
4808 if (error != 0)
4809 goto out;
4810 if (optval)
4811 so->so_options |= sopt->sopt_name;
4812 else
4813 so->so_options &= ~sopt->sopt_name;
4814 break;
4815
4816 case SO_SNDBUF:
4817 case SO_RCVBUF:
4818 case SO_SNDLOWAT:
4819 case SO_RCVLOWAT:
4820 error = sooptcopyin(sopt, &optval, sizeof (optval),
4821 sizeof (optval));
4822 if (error != 0)
4823 goto out;
4824
4825 /*
4826 * Values < 1 make no sense for any of these
4827 * options, so disallow them.
4828 */
4829 if (optval < 1) {
4830 error = EINVAL;
4831 goto out;
4832 }
4833
4834 switch (sopt->sopt_name) {
4835 case SO_SNDBUF:
4836 case SO_RCVBUF: {
4837 struct sockbuf *sb =
4838 (sopt->sopt_name == SO_SNDBUF) ?
4839 &so->so_snd : &so->so_rcv;
4840 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4841 error = ENOBUFS;
4842 goto out;
4843 }
4844 sb->sb_flags |= SB_USRSIZE;
4845 sb->sb_flags &= ~SB_AUTOSIZE;
4846 sb->sb_idealsize = (u_int32_t)optval;
4847 break;
4848 }
4849 /*
4850 * Make sure the low-water is never greater than
4851 * the high-water.
4852 */
4853 case SO_SNDLOWAT: {
4854 int space = sbspace(&so->so_snd);
4855 u_int32_t hiwat = so->so_snd.sb_hiwat;
4856
4857 if (so->so_snd.sb_flags & SB_UNIX) {
4858 struct unpcb *unp =
4859 (struct unpcb *)(so->so_pcb);
4860 if (unp != NULL &&
4861 unp->unp_conn != NULL) {
4862 hiwat += unp->unp_conn->unp_cc;
4863 }
4864 }
4865
4866 so->so_snd.sb_lowat =
4867 (optval > hiwat) ?
4868 hiwat : optval;
4869
4870 if (space >= so->so_snd.sb_lowat) {
4871 sowwakeup(so);
4872 }
4873 break;
4874 }
4875 case SO_RCVLOWAT: {
4876 int64_t data_len;
4877 so->so_rcv.sb_lowat =
4878 (optval > so->so_rcv.sb_hiwat) ?
4879 so->so_rcv.sb_hiwat : optval;
4880 data_len = so->so_rcv.sb_cc
4881 - so->so_rcv.sb_ctl;
4882 if (data_len >= so->so_rcv.sb_lowat)
4883 sorwakeup(so);
4884 break;
4885 }
4886 }
4887 break;
4888
4889 case SO_SNDTIMEO:
4890 case SO_RCVTIMEO:
4891 error = sooptcopyin_timeval(sopt, &tv);
4892 if (error != 0)
4893 goto out;
4894
4895 switch (sopt->sopt_name) {
4896 case SO_SNDTIMEO:
4897 so->so_snd.sb_timeo = tv;
4898 break;
4899 case SO_RCVTIMEO:
4900 so->so_rcv.sb_timeo = tv;
4901 break;
4902 }
4903 break;
4904
4905 case SO_NKE: {
4906 struct so_nke nke;
4907
4908 error = sooptcopyin(sopt, &nke, sizeof (nke),
4909 sizeof (nke));
4910 if (error != 0)
4911 goto out;
4912
4913 error = sflt_attach_internal(so, nke.nke_handle);
4914 break;
4915 }
4916
4917 case SO_NOSIGPIPE:
4918 error = sooptcopyin(sopt, &optval, sizeof (optval),
4919 sizeof (optval));
4920 if (error != 0)
4921 goto out;
4922 if (optval != 0)
4923 so->so_flags |= SOF_NOSIGPIPE;
4924 else
4925 so->so_flags &= ~SOF_NOSIGPIPE;
4926 break;
4927
4928 case SO_NOADDRERR:
4929 error = sooptcopyin(sopt, &optval, sizeof (optval),
4930 sizeof (optval));
4931 if (error != 0)
4932 goto out;
4933 if (optval != 0)
4934 so->so_flags |= SOF_NOADDRAVAIL;
4935 else
4936 so->so_flags &= ~SOF_NOADDRAVAIL;
4937 break;
4938
4939 case SO_REUSESHAREUID:
4940 error = sooptcopyin(sopt, &optval, sizeof (optval),
4941 sizeof (optval));
4942 if (error != 0)
4943 goto out;
4944 if (optval != 0)
4945 so->so_flags |= SOF_REUSESHAREUID;
4946 else
4947 so->so_flags &= ~SOF_REUSESHAREUID;
4948 break;
4949
4950 case SO_NOTIFYCONFLICT:
4951 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4952 error = EPERM;
4953 goto out;
4954 }
4955 error = sooptcopyin(sopt, &optval, sizeof (optval),
4956 sizeof (optval));
4957 if (error != 0)
4958 goto out;
4959 if (optval != 0)
4960 so->so_flags |= SOF_NOTIFYCONFLICT;
4961 else
4962 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4963 break;
4964
4965 case SO_RESTRICTIONS:
4966 error = sooptcopyin(sopt, &optval, sizeof (optval),
4967 sizeof (optval));
4968 if (error != 0)
4969 goto out;
4970
4971 error = so_set_restrictions(so, optval);
4972 break;
4973
4974 case SO_AWDL_UNRESTRICTED:
4975 if (SOCK_DOM(so) != PF_INET &&
4976 SOCK_DOM(so) != PF_INET6) {
4977 error = EOPNOTSUPP;
4978 goto out;
4979 }
4980 error = sooptcopyin(sopt, &optval, sizeof(optval),
4981 sizeof(optval));
4982 if (error != 0)
4983 goto out;
4984 if (optval != 0) {
4985 error = soopt_cred_check(so,
4986 PRIV_NET_RESTRICTED_AWDL, false);
4987 if (error == 0)
4988 inp_set_awdl_unrestricted(
4989 sotoinpcb(so));
4990 } else
4991 inp_clear_awdl_unrestricted(sotoinpcb(so));
4992 break;
4993 case SO_INTCOPROC_ALLOW:
4994 if (SOCK_DOM(so) != PF_INET6) {
4995 error = EOPNOTSUPP;
4996 goto out;
4997 }
4998 error = sooptcopyin(sopt, &optval, sizeof(optval),
4999 sizeof(optval));
5000 if (error != 0)
5001 goto out;
5002 if (optval != 0 &&
5003 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5004 error = soopt_cred_check(so,
5005 PRIV_NET_RESTRICTED_INTCOPROC, false);
5006 if (error == 0)
5007 inp_set_intcoproc_allowed(
5008 sotoinpcb(so));
5009 } else if (optval == 0)
5010 inp_clear_intcoproc_allowed(sotoinpcb(so));
5011 break;
5012
5013 case SO_LABEL:
5014 #if CONFIG_MACF_SOCKET
5015 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5016 sizeof (extmac))) != 0)
5017 goto out;
5018
5019 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5020 so, &extmac);
5021 #else
5022 error = EOPNOTSUPP;
5023 #endif /* MAC_SOCKET */
5024 break;
5025
5026 case SO_UPCALLCLOSEWAIT:
5027 error = sooptcopyin(sopt, &optval, sizeof (optval),
5028 sizeof (optval));
5029 if (error != 0)
5030 goto out;
5031 if (optval != 0)
5032 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5033 else
5034 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5035 break;
5036
5037 case SO_RANDOMPORT:
5038 error = sooptcopyin(sopt, &optval, sizeof (optval),
5039 sizeof (optval));
5040 if (error != 0)
5041 goto out;
5042 if (optval != 0)
5043 so->so_flags |= SOF_BINDRANDOMPORT;
5044 else
5045 so->so_flags &= ~SOF_BINDRANDOMPORT;
5046 break;
5047
5048 case SO_NP_EXTENSIONS: {
5049 struct so_np_extensions sonpx;
5050
5051 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5052 sizeof (sonpx));
5053 if (error != 0)
5054 goto out;
5055 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5056 error = EINVAL;
5057 goto out;
5058 }
5059 /*
5060 * Only one bit defined for now
5061 */
5062 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5063 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5064 so->so_flags |= SOF_NPX_SETOPTSHUT;
5065 else
5066 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5067 }
5068 break;
5069 }
5070
5071 case SO_TRAFFIC_CLASS: {
5072 error = sooptcopyin(sopt, &optval, sizeof (optval),
5073 sizeof (optval));
5074 if (error != 0)
5075 goto out;
5076 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5077 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5078 error = so_set_net_service_type(so, netsvc);
5079 goto out;
5080 }
5081 error = so_set_traffic_class(so, optval);
5082 if (error != 0)
5083 goto out;
5084 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5085 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5086 break;
5087 }
5088
5089 case SO_RECV_TRAFFIC_CLASS: {
5090 error = sooptcopyin(sopt, &optval, sizeof (optval),
5091 sizeof (optval));
5092 if (error != 0)
5093 goto out;
5094 if (optval == 0)
5095 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5096 else
5097 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5098 break;
5099 }
5100
5101 #if (DEVELOPMENT || DEBUG)
5102 case SO_TRAFFIC_CLASS_DBG: {
5103 struct so_tcdbg so_tcdbg;
5104
5105 error = sooptcopyin(sopt, &so_tcdbg,
5106 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5107 if (error != 0)
5108 goto out;
5109 error = so_set_tcdbg(so, &so_tcdbg);
5110 if (error != 0)
5111 goto out;
5112 break;
5113 }
5114 #endif /* (DEVELOPMENT || DEBUG) */
5115
5116 case SO_PRIVILEGED_TRAFFIC_CLASS:
5117 error = priv_check_cred(kauth_cred_get(),
5118 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5119 if (error != 0)
5120 goto out;
5121 error = sooptcopyin(sopt, &optval, sizeof (optval),
5122 sizeof (optval));
5123 if (error != 0)
5124 goto out;
5125 if (optval == 0)
5126 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5127 else
5128 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5129 break;
5130
5131 case SO_DEFUNCTOK:
5132 error = sooptcopyin(sopt, &optval, sizeof (optval),
5133 sizeof (optval));
5134 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5135 if (error == 0)
5136 error = EBADF;
5137 goto out;
5138 }
5139 /*
5140 * Any process can set SO_DEFUNCTOK (clear
5141 * SOF_NODEFUNCT), but only root can clear
5142 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5143 */
5144 if (optval == 0 &&
5145 kauth_cred_issuser(kauth_cred_get()) == 0) {
5146 error = EPERM;
5147 goto out;
5148 }
5149 if (optval)
5150 so->so_flags &= ~SOF_NODEFUNCT;
5151 else
5152 so->so_flags |= SOF_NODEFUNCT;
5153
5154 if (SOCK_DOM(so) == PF_INET ||
5155 SOCK_DOM(so) == PF_INET6) {
5156 char s[MAX_IPv6_STR_LEN];
5157 char d[MAX_IPv6_STR_LEN];
5158 struct inpcb *inp = sotoinpcb(so);
5159
5160 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5161 "[%s %s:%d -> %s:%d] is now marked "
5162 "as %seligible for "
5163 "defunct\n", __func__, proc_selfpid(),
5164 proc_best_name(current_proc()),
5165 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5166 (SOCK_TYPE(so) == SOCK_STREAM) ?
5167 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5168 ((SOCK_DOM(so) == PF_INET) ?
5169 (void *)&inp->inp_laddr.s_addr :
5170 (void *)&inp->in6p_laddr), s, sizeof (s)),
5171 ntohs(inp->in6p_lport),
5172 inet_ntop(SOCK_DOM(so),
5173 (SOCK_DOM(so) == PF_INET) ?
5174 (void *)&inp->inp_faddr.s_addr :
5175 (void *)&inp->in6p_faddr, d, sizeof (d)),
5176 ntohs(inp->in6p_fport),
5177 (so->so_flags & SOF_NODEFUNCT) ?
5178 "not " : "");
5179 } else {
5180 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5181 "is now marked as %seligible for "
5182 "defunct\n",
5183 __func__, proc_selfpid(),
5184 proc_best_name(current_proc()),
5185 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5186 SOCK_DOM(so), SOCK_TYPE(so),
5187 (so->so_flags & SOF_NODEFUNCT) ?
5188 "not " : "");
5189 }
5190 break;
5191
5192 case SO_ISDEFUNCT:
5193 /* This option is not settable */
5194 error = EINVAL;
5195 break;
5196
5197 case SO_OPPORTUNISTIC:
5198 error = sooptcopyin(sopt, &optval, sizeof (optval),
5199 sizeof (optval));
5200 if (error == 0)
5201 error = so_set_opportunistic(so, optval);
5202 break;
5203
5204 case SO_FLUSH:
5205 /* This option is handled by lower layer(s) */
5206 error = 0;
5207 break;
5208
5209 case SO_RECV_ANYIF:
5210 error = sooptcopyin(sopt, &optval, sizeof (optval),
5211 sizeof (optval));
5212 if (error == 0)
5213 error = so_set_recv_anyif(so, optval);
5214 break;
5215
5216 case SO_TRAFFIC_MGT_BACKGROUND: {
5217 /* This option is handled by lower layer(s) */
5218 error = 0;
5219 break;
5220 }
5221
5222 #if FLOW_DIVERT
5223 case SO_FLOW_DIVERT_TOKEN:
5224 error = flow_divert_token_set(so, sopt);
5225 break;
5226 #endif /* FLOW_DIVERT */
5227
5228
5229 case SO_DELEGATED:
5230 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5231 sizeof (optval))) != 0)
5232 break;
5233
5234 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5235 break;
5236
5237 case SO_DELEGATED_UUID: {
5238 uuid_t euuid;
5239
5240 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5241 sizeof (euuid))) != 0)
5242 break;
5243
5244 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5245 break;
5246 }
5247
5248 #if NECP
5249 case SO_NECP_ATTRIBUTES:
5250 error = necp_set_socket_attributes(so, sopt);
5251 break;
5252
5253 case SO_NECP_CLIENTUUID:
5254 if (SOCK_DOM(so) == PF_MULTIPATH) {
5255 /* Handled by MPTCP itself */
5256 break;
5257 }
5258
5259 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5260 error = EINVAL;
5261 goto out;
5262 }
5263
5264 struct inpcb *inp = sotoinpcb(so);
5265 if (!uuid_is_null(inp->necp_client_uuid)) {
5266 // Clear out the old client UUID if present
5267 necp_inpcb_remove_cb(inp);
5268 }
5269
5270 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5271 sizeof(uuid_t), sizeof(uuid_t));
5272 if (error != 0) {
5273 goto out;
5274 }
5275
5276 if (uuid_is_null(inp->necp_client_uuid)) {
5277 error = EINVAL;
5278 goto out;
5279 }
5280
5281 error = necp_client_register_socket_flow(so->last_pid,
5282 inp->necp_client_uuid, inp);
5283 if (error != 0) {
5284 uuid_clear(inp->necp_client_uuid);
5285 goto out;
5286 }
5287
5288 if (inp->inp_lport != 0) {
5289 // There is a bound local port, so this is not
5290 // a fresh socket. Assign to the client.
5291 necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5292 }
5293
5294 break;
5295 #endif /* NECP */
5296
5297 case SO_EXTENDED_BK_IDLE:
5298 error = sooptcopyin(sopt, &optval, sizeof (optval),
5299 sizeof (optval));
5300 if (error == 0)
5301 error = so_set_extended_bk_idle(so, optval);
5302 break;
5303
5304 case SO_MARK_CELLFALLBACK:
5305 error = sooptcopyin(sopt, &optval, sizeof(optval),
5306 sizeof(optval));
5307 if (error != 0)
5308 goto out;
5309 if (optval < 0) {
5310 error = EINVAL;
5311 goto out;
5312 }
5313 if (optval == 0)
5314 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5315 else
5316 so->so_flags1 |= SOF1_CELLFALLBACK;
5317 break;
5318
5319 case SO_NET_SERVICE_TYPE: {
5320 error = sooptcopyin(sopt, &optval, sizeof(optval),
5321 sizeof(optval));
5322 if (error != 0)
5323 goto out;
5324 error = so_set_net_service_type(so, optval);
5325 break;
5326 }
5327
5328 case SO_QOSMARKING_POLICY_OVERRIDE:
5329 error = priv_check_cred(kauth_cred_get(),
5330 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5331 if (error != 0)
5332 goto out;
5333 error = sooptcopyin(sopt, &optval, sizeof(optval),
5334 sizeof(optval));
5335 if (error != 0)
5336 goto out;
5337 if (optval == 0)
5338 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5339 else
5340 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5341 break;
5342
5343 default:
5344 error = ENOPROTOOPT;
5345 break;
5346 }
5347 if (error == 0 && so->so_proto != NULL &&
5348 so->so_proto->pr_ctloutput != NULL) {
5349 (void) so->so_proto->pr_ctloutput(so, sopt);
5350 }
5351 }
5352 out:
5353 if (dolock)
5354 socket_unlock(so, 1);
5355 return (error);
5356 }
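/*
 * Editor's illustrative sketch for the SO_LINGER/SO_LINGER_SEC case in
 * sosetoptlock() above: both options take a struct linger, and per the
 * conversion above SO_LINGER_SEC interprets l_linger in seconds.  The
 * 5 second value and function name are arbitrary.
 */
#include <sys/socket.h>

static int
example_enable_linger(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };

	return (setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)));
}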
5357
5358 /* Helper routines for getsockopt */
5359 int
5360 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5361 {
5362 int error;
5363 size_t valsize;
5364
5365 error = 0;
5366
5367 /*
5368 * Documented get behavior is that we always return a value,
5369 * possibly truncated to fit in the user's buffer.
5370 * Traditional behavior is that we always tell the user
5371 * precisely how much we copied, rather than something useful
5372 * like the total amount we had available for her.
5373 * Note that this interface is not idempotent; the entire answer must
5374 * be generated ahead of time.
5375 */
5376 valsize = min(len, sopt->sopt_valsize);
5377 sopt->sopt_valsize = valsize;
5378 if (sopt->sopt_val != USER_ADDR_NULL) {
5379 if (sopt->sopt_p != kernproc)
5380 error = copyout(buf, sopt->sopt_val, valsize);
5381 else
5382 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5383 }
5384 return (error);
5385 }
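/*
 * Editor's minimal sketch of the companion getsockopt pattern (see the
 * "integer:" label in sogetoptlock() below): read the current state
 * into an int and hand it back with sooptcopyout().  The function name
 * is hypothetical; SOF_NOSIGPIPE is reused purely as an example flag.
 */
static int
example_get_int_option(struct socket *so, struct sockopt *sopt)
{
	int optval = (so->so_flags & SOF_NOSIGPIPE) ? 1 : 0;

	return (sooptcopyout(sopt, &optval, sizeof (optval)));
}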
5386
5387 static int
5388 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5389 {
5390 int error;
5391 size_t len;
5392 struct user64_timeval tv64 = {};
5393 struct user32_timeval tv32 = {};
5394 const void * val;
5395 size_t valsize;
5396
5397 error = 0;
5398 if (proc_is64bit(sopt->sopt_p)) {
5399 len = sizeof (tv64);
5400 tv64.tv_sec = tv_p->tv_sec;
5401 tv64.tv_usec = tv_p->tv_usec;
5402 val = &tv64;
5403 } else {
5404 len = sizeof (tv32);
5405 tv32.tv_sec = tv_p->tv_sec;
5406 tv32.tv_usec = tv_p->tv_usec;
5407 val = &tv32;
5408 }
5409 valsize = min(len, sopt->sopt_valsize);
5410 sopt->sopt_valsize = valsize;
5411 if (sopt->sopt_val != USER_ADDR_NULL) {
5412 if (sopt->sopt_p != kernproc)
5413 error = copyout(val, sopt->sopt_val, valsize);
5414 else
5415 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5416 }
5417 return (error);
5418 }
5419
5420 /*
5421 * Return: 0 Success
5422 * ENOPROTOOPT
5423 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5424 * <pr_ctloutput>:???
5425 * <sf_getoption>:???
5426 */
5427 int
5428 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5429 {
5430 int error, optval;
5431 struct linger l;
5432 struct timeval tv;
5433 #if CONFIG_MACF_SOCKET
5434 struct mac extmac;
5435 #endif /* MAC_SOCKET */
5436
5437 if (sopt->sopt_dir != SOPT_GET)
5438 sopt->sopt_dir = SOPT_GET;
5439
5440 if (dolock)
5441 socket_lock(so, 1);
5442
5443 error = sflt_getsockopt(so, sopt);
5444 if (error != 0) {
5445 if (error == EJUSTRETURN)
5446 error = 0;
5447 goto out;
5448 }
5449
5450 if (sopt->sopt_level != SOL_SOCKET) {
5451 if (so->so_proto != NULL &&
5452 so->so_proto->pr_ctloutput != NULL) {
5453 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5454 goto out;
5455 }
5456 error = ENOPROTOOPT;
5457 } else {
5458 /*
5459 * Allow socket-level (SOL_SOCKET) options to be filtered by
5460 * the protocol layer, if needed. A zero value returned from
5461 * the handler means use default socket-level processing as
5462 * done by the rest of this routine. Otherwise, any other
5463 * return value indicates that the option is unsupported.
5464 */
5465 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5466 pru_socheckopt(so, sopt)) != 0)
5467 goto out;
5468
5469 error = 0;
5470 switch (sopt->sopt_name) {
5471 case SO_LINGER:
5472 case SO_LINGER_SEC:
5473 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5474 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5475 so->so_linger : so->so_linger / hz;
5476 error = sooptcopyout(sopt, &l, sizeof (l));
5477 break;
5478
5479 case SO_USELOOPBACK:
5480 case SO_DONTROUTE:
5481 case SO_DEBUG:
5482 case SO_KEEPALIVE:
5483 case SO_REUSEADDR:
5484 case SO_REUSEPORT:
5485 case SO_BROADCAST:
5486 case SO_OOBINLINE:
5487 case SO_TIMESTAMP:
5488 case SO_TIMESTAMP_MONOTONIC:
5489 case SO_DONTTRUNC:
5490 case SO_WANTMORE:
5491 case SO_WANTOOBFLAG:
5492 case SO_NOWAKEFROMSLEEP:
5493 case SO_NOAPNFALLBK:
5494 optval = so->so_options & sopt->sopt_name;
5495 integer:
5496 error = sooptcopyout(sopt, &optval, sizeof (optval));
5497 break;
5498
5499 case SO_TYPE:
5500 optval = so->so_type;
5501 goto integer;
5502
5503 case SO_NREAD:
5504 if (so->so_proto->pr_flags & PR_ATOMIC) {
5505 int pkt_total;
5506 struct mbuf *m1;
5507
5508 pkt_total = 0;
5509 m1 = so->so_rcv.sb_mb;
5510 while (m1 != NULL) {
5511 if (m1->m_type == MT_DATA ||
5512 m1->m_type == MT_HEADER ||
5513 m1->m_type == MT_OOBDATA)
5514 pkt_total += m1->m_len;
5515 m1 = m1->m_next;
5516 }
5517 optval = pkt_total;
5518 } else {
5519 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5520 }
5521 goto integer;
5522
5523 case SO_NUMRCVPKT:
5524 if (so->so_proto->pr_flags & PR_ATOMIC) {
5525 int cnt = 0;
5526 struct mbuf *m1;
5527
5528 m1 = so->so_rcv.sb_mb;
5529 while (m1 != NULL) {
5530 if (m1->m_type == MT_DATA ||
5531 m1->m_type == MT_HEADER ||
5532 m1->m_type == MT_OOBDATA)
5533 cnt += 1;
5534 m1 = m1->m_nextpkt;
5535 }
5536 optval = cnt;
5537 goto integer;
5538 } else {
5539 error = EINVAL;
5540 break;
5541 }
5542
5543 case SO_NWRITE:
5544 optval = so->so_snd.sb_cc;
5545 goto integer;
5546
5547 case SO_ERROR:
5548 optval = so->so_error;
5549 so->so_error = 0;
5550 goto integer;
5551
5552 case SO_SNDBUF: {
5553 u_int32_t hiwat = so->so_snd.sb_hiwat;
5554
5555 if (so->so_snd.sb_flags & SB_UNIX) {
5556 struct unpcb *unp =
5557 (struct unpcb *)(so->so_pcb);
5558 if (unp != NULL && unp->unp_conn != NULL) {
5559 hiwat += unp->unp_conn->unp_cc;
5560 }
5561 }
5562
5563 optval = hiwat;
5564 goto integer;
5565 }
5566 case SO_RCVBUF:
5567 optval = so->so_rcv.sb_hiwat;
5568 goto integer;
5569
5570 case SO_SNDLOWAT:
5571 optval = so->so_snd.sb_lowat;
5572 goto integer;
5573
5574 case SO_RCVLOWAT:
5575 optval = so->so_rcv.sb_lowat;
5576 goto integer;
5577
5578 case SO_SNDTIMEO:
5579 case SO_RCVTIMEO:
5580 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5581 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5582
5583 error = sooptcopyout_timeval(sopt, &tv);
5584 break;
5585
5586 case SO_NOSIGPIPE:
5587 optval = (so->so_flags & SOF_NOSIGPIPE);
5588 goto integer;
5589
5590 case SO_NOADDRERR:
5591 optval = (so->so_flags & SOF_NOADDRAVAIL);
5592 goto integer;
5593
5594 case SO_REUSESHAREUID:
5595 optval = (so->so_flags & SOF_REUSESHAREUID);
5596 goto integer;
5597
5598
5599 case SO_NOTIFYCONFLICT:
5600 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5601 goto integer;
5602
5603 case SO_RESTRICTIONS:
5604 optval = so_get_restrictions(so);
5605 goto integer;
5606
5607 case SO_AWDL_UNRESTRICTED:
5608 if (SOCK_DOM(so) == PF_INET ||
5609 SOCK_DOM(so) == PF_INET6) {
5610 optval = inp_get_awdl_unrestricted(
5611 sotoinpcb(so));
5612 goto integer;
5613 } else
5614 error = EOPNOTSUPP;
5615 break;
5616
5617 case SO_INTCOPROC_ALLOW:
5618 if (SOCK_DOM(so) == PF_INET6) {
5619 optval = inp_get_intcoproc_allowed(
5620 sotoinpcb(so));
5621 goto integer;
5622 } else
5623 error = EOPNOTSUPP;
5624 break;
5625
5626 case SO_LABEL:
5627 #if CONFIG_MACF_SOCKET
5628 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5629 sizeof (extmac))) != 0 ||
5630 (error = mac_socket_label_get(proc_ucred(
5631 sopt->sopt_p), so, &extmac)) != 0)
5632 break;
5633
5634 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5635 #else
5636 error = EOPNOTSUPP;
5637 #endif /* MAC_SOCKET */
5638 break;
5639
5640 case SO_PEERLABEL:
5641 #if CONFIG_MACF_SOCKET
5642 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5643 sizeof (extmac))) != 0 ||
5644 (error = mac_socketpeer_label_get(proc_ucred(
5645 sopt->sopt_p), so, &extmac)) != 0)
5646 break;
5647
5648 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5649 #else
5650 error = EOPNOTSUPP;
5651 #endif /* MAC_SOCKET */
5652 break;
5653
5654 #ifdef __APPLE_API_PRIVATE
5655 case SO_UPCALLCLOSEWAIT:
5656 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5657 goto integer;
5658 #endif
5659 case SO_RANDOMPORT:
5660 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5661 goto integer;
5662
5663 case SO_NP_EXTENSIONS: {
5664 struct so_np_extensions sonpx;
5665
5666 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5667 SONPX_SETOPTSHUT : 0;
5668 sonpx.npx_mask = SONPX_MASK_VALID;
5669
5670 error = sooptcopyout(sopt, &sonpx,
5671 sizeof (struct so_np_extensions));
5672 break;
5673 }
5674
5675 case SO_TRAFFIC_CLASS:
5676 optval = so->so_traffic_class;
5677 goto integer;
5678
5679 case SO_RECV_TRAFFIC_CLASS:
5680 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5681 goto integer;
5682
5683 case SO_TRAFFIC_CLASS_STATS:
5684 error = sooptcopyout(sopt, &so->so_tc_stats,
5685 sizeof (so->so_tc_stats));
5686 break;
5687
5688 #if (DEVELOPMENT || DEBUG)
5689 case SO_TRAFFIC_CLASS_DBG:
5690 error = sogetopt_tcdbg(so, sopt);
5691 break;
5692 #endif /* (DEVELOPMENT || DEBUG) */
5693
5694 case SO_PRIVILEGED_TRAFFIC_CLASS:
5695 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5696 goto integer;
5697
5698 case SO_DEFUNCTOK:
5699 optval = !(so->so_flags & SOF_NODEFUNCT);
5700 goto integer;
5701
5702 case SO_ISDEFUNCT:
5703 optval = (so->so_flags & SOF_DEFUNCT);
5704 goto integer;
5705
5706 case SO_OPPORTUNISTIC:
5707 optval = so_get_opportunistic(so);
5708 goto integer;
5709
5710 case SO_FLUSH:
5711 /* This option is not gettable */
5712 error = EINVAL;
5713 break;
5714
5715 case SO_RECV_ANYIF:
5716 optval = so_get_recv_anyif(so);
5717 goto integer;
5718
5719 case SO_TRAFFIC_MGT_BACKGROUND:
5720 /* This option is handled by lower layer(s) */
5721 if (so->so_proto != NULL &&
5722 so->so_proto->pr_ctloutput != NULL) {
5723 (void) so->so_proto->pr_ctloutput(so, sopt);
5724 }
5725 break;
5726
5727 #if FLOW_DIVERT
5728 case SO_FLOW_DIVERT_TOKEN:
5729 error = flow_divert_token_get(so, sopt);
5730 break;
5731 #endif /* FLOW_DIVERT */
5732
5733 #if NECP
5734 case SO_NECP_ATTRIBUTES:
5735 error = necp_get_socket_attributes(so, sopt);
5736 break;
5737
5738 case SO_NECP_CLIENTUUID:
5739 {
5740 uuid_t *ncu;
5741
5742 if (SOCK_DOM(so) == PF_MULTIPATH) {
5743 ncu = &mpsotomppcb(so)->necp_client_uuid;
5744 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5745 ncu = &sotoinpcb(so)->necp_client_uuid;
5746 } else {
5747 error = EINVAL;
5748 goto out;
5749 }
5750
5751 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
5752 break;
5753 }
5754 #endif /* NECP */
5755
5756 #if CONTENT_FILTER
5757 case SO_CFIL_SOCK_ID: {
5758 cfil_sock_id_t sock_id;
5759
5760 sock_id = cfil_sock_id_from_socket(so);
5761
5762 error = sooptcopyout(sopt, &sock_id,
5763 sizeof(cfil_sock_id_t));
5764 break;
5765 }
5766 #endif /* CONTENT_FILTER */
5767
5768 case SO_EXTENDED_BK_IDLE:
5769 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5770 goto integer;
5771 case SO_MARK_CELLFALLBACK:
5772 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5773 ? 1 : 0;
5774 goto integer;
5775 case SO_NET_SERVICE_TYPE: {
5776 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5777 optval = so->so_netsvctype;
5778 else
5779 optval = NET_SERVICE_TYPE_BE;
5780 goto integer;
5781 }
5782 case SO_NETSVC_MARKING_LEVEL:
5783 optval = so_get_netsvc_marking_level(so);
5784 goto integer;
5785
5786 default:
5787 error = ENOPROTOOPT;
5788 break;
5789 }
5790 }
5791 out:
5792 if (dolock)
5793 socket_unlock(so, 1);
5794 return (error);
5795 }
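/*
 * Editor's illustrative sketch for the SO_NREAD case above: per the
 * PR_ATOMIC branch, the value reported for a datagram socket is the
 * byte count of the first record in the receive buffer.  The function
 * name and output are assumptions for the example.
 */
#include <sys/socket.h>
#include <stdio.h>

static void
example_query_nread(int s)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
		printf("%d byte(s) readable on socket %d\n", nread, s);
}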
5796
5797 /*
5798 * The size limits on our soopt_getm are different from those on FreeBSD.
5799 * We limit the size of options to MCLBYTES. This will have to change
5800 * if we need to define options that need more space than MCLBYTES.
5801 */
5802 int
5803 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5804 {
5805 struct mbuf *m, *m_prev;
5806 int sopt_size = sopt->sopt_valsize;
5807 int how;
5808
5809 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5810 return (EMSGSIZE);
5811
5812 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5813 MGET(m, how, MT_DATA);
5814 if (m == NULL)
5815 return (ENOBUFS);
5816 if (sopt_size > MLEN) {
5817 MCLGET(m, how);
5818 if ((m->m_flags & M_EXT) == 0) {
5819 m_free(m);
5820 return (ENOBUFS);
5821 }
5822 m->m_len = min(MCLBYTES, sopt_size);
5823 } else {
5824 m->m_len = min(MLEN, sopt_size);
5825 }
5826 sopt_size -= m->m_len;
5827 *mp = m;
5828 m_prev = m;
5829
5830 while (sopt_size > 0) {
5831 MGET(m, how, MT_DATA);
5832 if (m == NULL) {
5833 m_freem(*mp);
5834 return (ENOBUFS);
5835 }
5836 if (sopt_size > MLEN) {
5837 MCLGET(m, how);
5838 if ((m->m_flags & M_EXT) == 0) {
5839 m_freem(*mp);
5840 m_freem(m);
5841 return (ENOBUFS);
5842 }
5843 m->m_len = min(MCLBYTES, sopt_size);
5844 } else {
5845 m->m_len = min(MLEN, sopt_size);
5846 }
5847 sopt_size -= m->m_len;
5848 m_prev->m_next = m;
5849 m_prev = m;
5850 }
5851 return (0);
5852 }
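/*
 * Editor's minimal sketch, assuming a pr_ctloutput-style caller: stage
 * the option bytes into an mbuf chain with soopt_getm(), then fill the
 * chain from the sockopt with soopt_mcopyin() (which frees the chain
 * itself on a copyin failure).  The wrapper function is hypothetical.
 */
static int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, *mp);
	if (error != 0)
		*mp = NULL;	/* chain already freed by soopt_mcopyin() */
	return (error);
}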
5853
5854 /* copyin sopt data into mbuf chain */
5855 int
5856 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5857 {
5858 struct mbuf *m0 = m;
5859
5860 if (sopt->sopt_val == USER_ADDR_NULL)
5861 return (0);
5862 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5863 if (sopt->sopt_p != kernproc) {
5864 int error;
5865
5866 error = copyin(sopt->sopt_val, mtod(m, char *),
5867 m->m_len);
5868 if (error != 0) {
5869 m_freem(m0);
5870 return (error);
5871 }
5872 } else {
5873 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5874 mtod(m, char *), m->m_len);
5875 }
5876 sopt->sopt_valsize -= m->m_len;
5877 sopt->sopt_val += m->m_len;
5878 m = m->m_next;
5879 }
5880 /* the chain should have been allocated with enough space by ip6_sooptmcopyin() */
5881 if (m != NULL) {
5882 panic("soopt_mcopyin");
5883 /* NOTREACHED */
5884 }
5885 return (0);
5886 }
5887
5888 /* copyout mbuf chain data into soopt */
5889 int
5890 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5891 {
5892 struct mbuf *m0 = m;
5893 size_t valsize = 0;
5894
5895 if (sopt->sopt_val == USER_ADDR_NULL)
5896 return (0);
5897 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5898 if (sopt->sopt_p != kernproc) {
5899 int error;
5900
5901 error = copyout(mtod(m, char *), sopt->sopt_val,
5902 m->m_len);
5903 if (error != 0) {
5904 m_freem(m0);
5905 return (error);
5906 }
5907 } else {
5908 bcopy(mtod(m, char *),
5909 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5910 }
5911 sopt->sopt_valsize -= m->m_len;
5912 sopt->sopt_val += m->m_len;
5913 valsize += m->m_len;
5914 m = m->m_next;
5915 }
5916 if (m != NULL) {
5917 /* a large enough soopt buffer should have been provided by user-land */
5918 m_freem(m0);
5919 return (EINVAL);
5920 }
5921 sopt->sopt_valsize = valsize;
5922 return (0);
5923 }
5924
5925 void
5926 sohasoutofband(struct socket *so)
5927 {
5928 if (so->so_pgid < 0)
5929 gsignal(-so->so_pgid, SIGURG);
5930 else if (so->so_pgid > 0)
5931 proc_signal(so->so_pgid, SIGURG);
5932 selwakeup(&so->so_rcv.sb_sel);
5933 if (so->so_rcv.sb_flags & SB_KNOTE) {
5934 KNOTE(&so->so_rcv.sb_sel.si_note,
5935 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5936 }
5937 }
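/*
 * Editor's illustrative sketch: sohasoutofband() above signals SIGURG
 * to the socket's owning process or process group, so a userspace
 * caller that wants the signal must first claim ownership with
 * F_SETOWN; the out-of-band byte itself can be read with MSG_OOB when
 * SO_OOBINLINE is not set.  The function name is hypothetical and the
 * signal handler setup is omitted for brevity.
 */
#include <sys/socket.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t
example_read_oob(int s, char *byte)
{
	(void) fcntl(s, F_SETOWN, getpid());	/* direct SIGURG to us */
	return (recv(s, byte, 1, MSG_OOB));
}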
5938
5939 int
5940 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5941 {
5942 #pragma unused(cred)
5943 struct proc *p = current_proc();
5944 int revents = 0;
5945
5946 socket_lock(so, 1);
5947 so_update_last_owner_locked(so, PROC_NULL);
5948 so_update_policy(so);
5949
5950 if (events & (POLLIN | POLLRDNORM))
5951 if (soreadable(so))
5952 revents |= events & (POLLIN | POLLRDNORM);
5953
5954 if (events & (POLLOUT | POLLWRNORM))
5955 if (sowriteable(so))
5956 revents |= events & (POLLOUT | POLLWRNORM);
5957
5958 if (events & (POLLPRI | POLLRDBAND))
5959 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5960 revents |= events & (POLLPRI | POLLRDBAND);
5961
5962 if (revents == 0) {
5963 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5964 /*
5965 * Darwin sets the flag first,
5966 * BSD calls selrecord first
5967 */
5968 so->so_rcv.sb_flags |= SB_SEL;
5969 selrecord(p, &so->so_rcv.sb_sel, wql);
5970 }
5971
5972 if (events & (POLLOUT | POLLWRNORM)) {
5973 /*
5974 * Darwin sets the flag first,
5975 * BSD calls selrecord first
5976 */
5977 so->so_snd.sb_flags |= SB_SEL;
5978 selrecord(p, &so->so_snd.sb_sel, wql);
5979 }
5980 }
5981
5982 socket_unlock(so, 1);
5983 return (revents);
5984 }
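/*
 * Illustrative user-space sketch, not part of this file: the readable,
 * writeable and out-of-band tests above are what a poll(2) caller observes
 * as POLLIN/POLLOUT/POLLPRI.  A minimal wait for normal or urgent data,
 * with a hypothetical helper name:
 */
#include <poll.h>

static int
wait_readable_or_urgent(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
	int n = poll(&pfd, 1, timeout_ms);

	if (n > 0 && (pfd.revents & POLLPRI))
		return (2);		/* at or before the out-of-band mark */
	if (n > 0 && (pfd.revents & POLLIN))
		return (1);		/* normal data readable */
	return (0);			/* timeout or error */
}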
5985
5986 int
5987 soo_kqfilter(struct fileproc *fp, struct knote *kn,
5988 struct kevent_internal_s *kev, vfs_context_t ctx)
5989 {
5990 #pragma unused(fp)
5991 #if !CONFIG_MACF_SOCKET
5992 #pragma unused(ctx)
5993 #endif /* MAC_SOCKET */
5994 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5995 int result;
5996
5997 socket_lock(so, 1);
5998 so_update_last_owner_locked(so, PROC_NULL);
5999 so_update_policy(so);
6000
6001 #if CONFIG_MACF_SOCKET
6002 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6003 kn, so) != 0) {
6004 socket_unlock(so, 1);
6005 kn->kn_flags = EV_ERROR;
6006 kn->kn_data = EPERM;
6007 return 0;
6008 }
6009 #endif /* MAC_SOCKET */
6010
6011 switch (kn->kn_filter) {
6012 case EVFILT_READ:
6013 kn->kn_filtid = EVFILTID_SOREAD;
6014 break;
6015 case EVFILT_WRITE:
6016 kn->kn_filtid = EVFILTID_SOWRITE;
6017 break;
6018 case EVFILT_SOCK:
6019 kn->kn_filtid = EVFILTID_SCK;
6020 break;
6021 case EVFILT_EXCEPT:
6022 kn->kn_filtid = EVFILTID_SOEXCEPT;
6023 break;
6024 default:
6025 socket_unlock(so, 1);
6026 kn->kn_flags = EV_ERROR;
6027 kn->kn_data = EINVAL;
6028 return 0;
6029 }
6030
6031 /*
6032 * call the appropriate sub-filter attach
6033 * with the socket still locked
6034 */
6035 result = knote_fops(kn)->f_attach(kn, kev);
6036
6037 socket_unlock(so, 1);
6038
6039 return result;
6040 }
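/*
 * Illustrative user-space sketch, not part of this file: the filter ids
 * selected above back the EVFILT_READ and EVFILT_WRITE filters that a
 * kevent(2) caller registers against a socket descriptor.  The helper
 * name watch_socket() is hypothetical.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
watch_socket(int kq, int fd)
{
	struct kevent changes[2];

	EV_SET(&changes[0], fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	EV_SET(&changes[1], fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);

	/* register both filters; events are harvested by a later kevent() call */
	return (kevent(kq, changes, 2, NULL, 0, NULL));
}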
6041
6042 static int
6043 filt_soread_common(struct knote *kn, struct socket *so)
6044 {
6045 if (so->so_options & SO_ACCEPTCONN) {
6046 int is_not_empty;
6047
6048 /*
6049 * Radar 6615193: handle the listen case dynamically for the
6050 * kqueue read filter. This allows listen() to be called
6051 * after registering the kqueue EVFILT_READ.
6052 */
6053
6054 kn->kn_data = so->so_qlen;
6055 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6056
6057 return (is_not_empty);
6058 }
6059
6060 /* socket isn't a listener */
6061 /*
6062 * NOTE_LOWAT specifies new low water mark in data, i.e.
6063 * the bytes of protocol data. We therefore exclude any
6064 * control bytes.
6065 */
6066 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6067
6068 if (kn->kn_sfflags & NOTE_OOB) {
6069 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6070 kn->kn_fflags |= NOTE_OOB;
6071 kn->kn_data -= so->so_oobmark;
6072 return (1);
6073 }
6074 }
6075
6076 if ((so->so_state & SS_CANTRCVMORE)
6077 #if CONTENT_FILTER
6078 && cfil_sock_data_pending(&so->so_rcv) == 0
6079 #endif /* CONTENT_FILTER */
6080 ) {
6081 kn->kn_flags |= EV_EOF;
6082 kn->kn_fflags = so->so_error;
6083 return (1);
6084 }
6085
6086 if (so->so_error) { /* temporary udp error */
6087 return (1);
6088 }
6089
6090 int64_t lowwat = so->so_rcv.sb_lowat;
6091 /*
6092 * Ensure that when NOTE_LOWAT is used, the derived
6093 * low water mark is bounded by socket's rcv buf's
6094 * high and low water mark values.
6095 */
6096 if (kn->kn_sfflags & NOTE_LOWAT) {
6097 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6098 lowwat = so->so_rcv.sb_hiwat;
6099 else if (kn->kn_sdata > lowwat)
6100 lowwat = kn->kn_sdata;
6101 }
6102
6103 /*
6104 * The order below is important. Since NOTE_LOWAT
6105 * overrides sb_lowat, check for NOTE_LOWAT case
6106 * first.
6107 */
6108 if (kn->kn_sfflags & NOTE_LOWAT)
6109 return (kn->kn_data >= lowwat);
6110
6111 return (so->so_rcv.sb_cc >= lowwat);
6112 }
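/*
 * Illustrative user-space sketch, not part of this file: the 'lowwat'
 * clamping above is what gives kevent(2)'s NOTE_LOWAT its meaning for
 * sockets -- the registered data value overrides the receive buffer's
 * low-water mark, bounded by the buffer's high-water mark.  For example,
 * to be woken only once at least 4 KB is buffered (hypothetical helper
 * name):
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
watch_bulk_read(int kq, int fd)
{
	struct kevent kev;

	/* fflags = NOTE_LOWAT, data = requested low-water mark in bytes */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}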
6113
6114 static int
6115 filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6116 {
6117 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6118
6119 /* socket locked */
6120
6121 /*
6122 * If the caller explicitly asked for OOB results (e.g. poll())
6123 * from EVFILT_READ, then save that off in the hookid field
6124 * and reserve the kn_flags EV_OOBAND bit for output only.
6125 */
6126 if (kn->kn_filter == EVFILT_READ &&
6127 kn->kn_flags & EV_OOBAND) {
6128 kn->kn_flags &= ~EV_OOBAND;
6129 kn->kn_hookid = EV_OOBAND;
6130 } else {
6131 kn->kn_hookid = 0;
6132 }
6133 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6134 so->so_rcv.sb_flags |= SB_KNOTE;
6135
6136 /* indicate whether the event has already fired */
6137 return filt_soread_common(kn, so);
6138 }
6139
6140 static void
6141 filt_sordetach(struct knote *kn)
6142 {
6143 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6144
6145 socket_lock(so, 1);
6146 if (so->so_rcv.sb_flags & SB_KNOTE)
6147 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6148 so->so_rcv.sb_flags &= ~SB_KNOTE;
6149 socket_unlock(so, 1);
6150 }
6151
6152 /*ARGSUSED*/
6153 static int
6154 filt_soread(struct knote *kn, long hint)
6155 {
6156 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6157 int retval;
6158
6159 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6160 socket_lock(so, 1);
6161
6162 retval = filt_soread_common(kn, so);
6163
6164 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6165 socket_unlock(so, 1);
6166
6167 return retval;
6168 }
6169
6170 static int
6171 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6172 {
6173 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6174 int retval;
6175
6176 socket_lock(so, 1);
6177
6178 /* save off the new input fflags and data */
6179 kn->kn_sfflags = kev->fflags;
6180 kn->kn_sdata = kev->data;
6181 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6182 kn->kn_udata = kev->udata;
6183
6184 /* determine if changes result in fired events */
6185 retval = filt_soread_common(kn, so);
6186
6187 socket_unlock(so, 1);
6188
6189 return retval;
6190 }
6191
6192 static int
6193 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6194 {
6195 #pragma unused(data)
6196 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6197 int retval;
6198
6199 socket_lock(so, 1);
6200 retval = filt_soread_common(kn, so);
6201 if (retval) {
6202 *kev = kn->kn_kevent;
6203 if (kn->kn_flags & EV_CLEAR) {
6204 kn->kn_fflags = 0;
6205 kn->kn_data = 0;
6206 }
6207 }
6208 socket_unlock(so, 1);
6209
6210 return retval;
6211 }
6212
6213 int
6214 so_wait_for_if_feedback(struct socket *so)
6215 {
6216 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6217 (so->so_state & SS_ISCONNECTED)) {
6218 struct inpcb *inp = sotoinpcb(so);
6219 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6220 return (1);
6221 }
6222 return (0);
6223 }
6224
6225 static int
6226 filt_sowrite_common(struct knote *kn, struct socket *so)
6227 {
6228 int ret = 0;
6229
6230 kn->kn_data = sbspace(&so->so_snd);
6231 if (so->so_state & SS_CANTSENDMORE) {
6232 kn->kn_flags |= EV_EOF;
6233 kn->kn_fflags = so->so_error;
6234 return 1;
6235 }
6236 if (so->so_error) { /* temporary udp error */
6237 return 1;
6238 }
6239 if (!socanwrite(so)) {
6240 return 0;
6241 }
6242 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6243 return 1;
6244 }
6245 int64_t lowwat = so->so_snd.sb_lowat;
6246 if (kn->kn_sfflags & NOTE_LOWAT) {
6247 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6248 lowwat = so->so_snd.sb_hiwat;
6249 else if (kn->kn_sdata > lowwat)
6250 lowwat = kn->kn_sdata;
6251 }
6252 if (kn->kn_data >= lowwat) {
6253 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6254 #if (DEBUG || DEVELOPMENT)
6255 && so_notsent_lowat_check == 1
6256 #endif /* DEBUG || DEVELOPMENT */
6257 ) {
6258 if ((SOCK_DOM(so) == PF_INET ||
6259 SOCK_DOM(so) == PF_INET6) &&
6260 so->so_type == SOCK_STREAM) {
6261 ret = tcp_notsent_lowat_check(so);
6262 }
6263 #if MPTCP
6264 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6265 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6266 ret = mptcp_notsent_lowat_check(so);
6267 }
6268 #endif
6269 else {
6270 return 1;
6271 }
6272 } else {
6273 ret = 1;
6274 }
6275 }
6276 if (so_wait_for_if_feedback(so))
6277 ret = 0;
6278 return (ret);
6279 }
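/*
 * Illustrative user-space sketch, not part of this file: a hedged reading
 * of the SOF_NOTSENT_LOWAT branch above is that the SO_NOTSENT_LOWAT
 * socket option is meant to be paired with EVFILT_WRITE, so the filter
 * fires only once the unsent backlog drops below the configured
 * threshold.  The helper name and the 16 KB value are hypothetical.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/event.h>
#include <sys/time.h>

static int
enable_notsent_lowat(int kq, int fd)
{
	int lowat = 16 * 1024;		/* wake the writer when < 16 KB is unsent */
	struct kevent kev;

	if (setsockopt(fd, SOL_SOCKET, SO_NOTSENT_LOWAT,
	    &lowat, sizeof (lowat)) == -1)
		return (-1);

	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD | EV_CLEAR, 0, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}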
6280
6281 static int
6282 filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6283 {
6284 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6285
6286 /* socket locked */
6287 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6288 so->so_snd.sb_flags |= SB_KNOTE;
6289
6290 /* determine if it's already fired */
6291 return filt_sowrite_common(kn, so);
6292 }
6293
6294 static void
6295 filt_sowdetach(struct knote *kn)
6296 {
6297 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6298 socket_lock(so, 1);
6299
6300 if (so->so_snd.sb_flags & SB_KNOTE)
6301 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6302 so->so_snd.sb_flags &= ~SB_KNOTE;
6303 socket_unlock(so, 1);
6304 }
6305
6306 /*ARGSUSED*/
6307 static int
6308 filt_sowrite(struct knote *kn, long hint)
6309 {
6310 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6311 int ret;
6312
6313 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6314 socket_lock(so, 1);
6315
6316 ret = filt_sowrite_common(kn, so);
6317
6318 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6319 socket_unlock(so, 1);
6320
6321 return ret;
6322 }
6323
6324 static int
6325 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6326 {
6327 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6328 int ret;
6329
6330 socket_lock(so, 1);
6331
6332 /* save off the new input fflags and data */
6333 kn->kn_sfflags = kev->fflags;
6334 kn->kn_sdata = kev->data;
6335 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6336 kn->kn_udata = kev->udata;
6337
6338 /* determine if these changes result in a triggered event */
6339 ret = filt_sowrite_common(kn, so);
6340
6341 socket_unlock(so, 1);
6342
6343 return ret;
6344 }
6345
6346 static int
6347 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6348 {
6349 #pragma unused(data)
6350 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6351 int ret;
6352
6353 socket_lock(so, 1);
6354 ret = filt_sowrite_common(kn, so);
6355 if (ret) {
6356 *kev = kn->kn_kevent;
6357 if (kn->kn_flags & EV_CLEAR) {
6358 kn->kn_fflags = 0;
6359 kn->kn_data = 0;
6360 }
6361 }
6362 socket_unlock(so, 1);
6363 return ret;
6364 }
6365
6366 static int
6367 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6368 {
6369 int ret = 0;
6370 uint32_t level_trigger = 0;
6371
6372 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6373 kn->kn_fflags |= NOTE_CONNRESET;
6374 }
6375 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6376 kn->kn_fflags |= NOTE_TIMEOUT;
6377 }
6378 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6379 kn->kn_fflags |= NOTE_NOSRCADDR;
6380 }
6381 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6382 kn->kn_fflags |= NOTE_IFDENIED;
6383 }
6384 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6385 kn->kn_fflags |= NOTE_KEEPALIVE;
6386 }
6387 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6388 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6389 }
6390 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6391 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6392 }
6393 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6394 (so->so_state & SS_ISCONNECTED)) {
6395 kn->kn_fflags |= NOTE_CONNECTED;
6396 level_trigger |= NOTE_CONNECTED;
6397 }
6398 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6399 (so->so_state & SS_ISDISCONNECTED)) {
6400 kn->kn_fflags |= NOTE_DISCONNECTED;
6401 level_trigger |= NOTE_DISCONNECTED;
6402 }
6403 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6404 if (so->so_proto != NULL &&
6405 (so->so_proto->pr_flags & PR_EVCONNINFO))
6406 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6407 }
6408
6409 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6410 tcp_notify_ack_active(so)) {
6411 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6412 }
6413
6414 if ((so->so_state & SS_CANTRCVMORE)
6415 #if CONTENT_FILTER
6416 && cfil_sock_data_pending(&so->so_rcv) == 0
6417 #endif /* CONTENT_FILTER */
6418 ) {
6419 kn->kn_fflags |= NOTE_READCLOSED;
6420 level_trigger |= NOTE_READCLOSED;
6421 }
6422
6423 if (so->so_state & SS_CANTSENDMORE) {
6424 kn->kn_fflags |= NOTE_WRITECLOSED;
6425 level_trigger |= NOTE_WRITECLOSED;
6426 }
6427
6428 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6429 (so->so_flags & SOF_SUSPENDED)) {
6430 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6431
6432 /* If resume event was delivered before, reset it */
6433 kn->kn_hookid &= ~NOTE_RESUME;
6434
6435 kn->kn_fflags |= NOTE_SUSPEND;
6436 level_trigger |= NOTE_SUSPEND;
6437 }
6438
6439 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6440 (so->so_flags & SOF_SUSPENDED) == 0) {
6441 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6442
6443 /* If suspend event was delivered before, reset it */
6444 kn->kn_hookid &= ~NOTE_SUSPEND;
6445
6446 kn->kn_fflags |= NOTE_RESUME;
6447 level_trigger |= NOTE_RESUME;
6448 }
6449
6450 if (so->so_error != 0) {
6451 ret = 1;
6452 kn->kn_data = so->so_error;
6453 kn->kn_flags |= EV_EOF;
6454 } else {
6455 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6456 }
6457
6458 /* Reset any events that are not requested on this knote */
6459 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6460 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6461
6462 /* Find the level-triggered events that are already delivered */
6463 level_trigger &= kn->kn_hookid;
6464 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6465
6466 /* Do not deliver level-triggered events more than once */
6467 if ((kn->kn_fflags & ~level_trigger) != 0)
6468 ret = 1;
6469
6470 return (ret);
6471 }
6472
6473 static int
6474 filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6475 {
6476 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6477
6478 /* socket locked */
6479 kn->kn_hookid = 0;
6480 if (KNOTE_ATTACH(&so->so_klist, kn))
6481 so->so_flags |= SOF_KNOTE;
6482
6483 /* determine if event already fired */
6484 return filt_sockev_common(kn, so, 0);
6485 }
6486
6487 static void
6488 filt_sockdetach(struct knote *kn)
6489 {
6490 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6491 socket_lock(so, 1);
6492
6493 if ((so->so_flags & SOF_KNOTE) != 0)
6494 if (KNOTE_DETACH(&so->so_klist, kn))
6495 so->so_flags &= ~SOF_KNOTE;
6496 socket_unlock(so, 1);
6497 }
6498
6499 static int
6500 filt_sockev(struct knote *kn, long hint)
6501 {
6502 int ret = 0, locked = 0;
6503 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6504 long ev_hint = (hint & SO_FILT_HINT_EV);
6505
6506 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6507 socket_lock(so, 1);
6508 locked = 1;
6509 }
6510
6511 ret = filt_sockev_common(kn, so, ev_hint);
6512
6513 if (locked)
6514 socket_unlock(so, 1);
6515
6516 return ret;
6517 }
6518
6519
6520
6521 /*
6522 * filt_socktouch - update event state
6523 */
6524 static int
6525 filt_socktouch(
6526 struct knote *kn,
6527 struct kevent_internal_s *kev)
6528 {
6529 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6530 uint32_t changed_flags;
6531 int ret;
6532
6533 socket_lock(so, 1);
6534
6535 /* note which interest bits changed relative to the delivered state */
6536 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6537
6538 /* save off the new input fflags and data */
6539 kn->kn_sfflags = kev->fflags;
6540 kn->kn_sdata = kev->data;
6541 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6542 kn->kn_udata = kev->udata;
6543
6544 /* restrict the current results to the (smaller?) set of new interest */
6545 /*
6546 * For compatibility with previous implementations, we leave kn_fflags
6547 * as they were before.
6548 */
6549 //kn->kn_fflags &= kev->fflags;
6550
6551 /*
6552 * Since we keep track of events that are already
6553 * delivered, if any of those events are not requested
6554 * anymore the state related to them can be reset
6555 */
6556 kn->kn_hookid &=
6557 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6558
6559 /* determine if we have events to deliver */
6560 ret = filt_sockev_common(kn, so, 0);
6561
6562 socket_unlock(so, 1);
6563
6564 return ret;
6565 }
6566
6567 /*
6568 * filt_sockprocess - query event fired state and return data
6569 */
6570 static int
6571 filt_sockprocess(
6572 struct knote *kn,
6573 struct filt_process_s *data,
6574 struct kevent_internal_s *kev)
6575 {
6576 #pragma unused(data)
6577
6578 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6579 int ret = 0;
6580
6581 socket_lock(so, 1);
6582
6583 ret = filt_sockev_common(kn, so, 0);
6584 if (ret) {
6585 *kev = kn->kn_kevent;
6586
6587 /*
6588 * Store the state of the events being delivered. This
6589 * state can be used to deliver level-triggered events
6590 * at least once and still avoid waking up the application
6591 * multiple times as long as the event is active.
6592 */
6593 if (kn->kn_fflags != 0)
6594 kn->kn_hookid |= (kn->kn_fflags &
6595 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6596
6597 /*
6598 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6599 * only one of them, and remember which one was
6600 * delivered last
6601 */
6602 if (kn->kn_fflags & NOTE_SUSPEND)
6603 kn->kn_hookid &= ~NOTE_RESUME;
6604 if (kn->kn_fflags & NOTE_RESUME)
6605 kn->kn_hookid &= ~NOTE_SUSPEND;
6606
6607 if (kn->kn_flags & EV_CLEAR) {
6608 kn->kn_data = 0;
6609 kn->kn_fflags = 0;
6610 }
6611 }
6612
6613 socket_unlock(so, 1);
6614
6615 return ret;
6616 }
6617
6618 void
6619 get_sockev_state(struct socket *so, u_int32_t *statep)
6620 {
6621 u_int32_t state = *(statep);
6622
6623 /*
6624 * If the state variable was already set by a previous event,
6625 * leave it untouched.
6626 */
6627 if (state != 0)
6628 return;
6629
6630 if (so->so_state & SS_ISCONNECTED)
6631 state |= SOCKEV_CONNECTED;
6632 else
6633 state &= ~(SOCKEV_CONNECTED);
6634 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6635 *(statep) = state;
6636 }
6637
6638 #define SO_LOCK_HISTORY_STR_LEN \
6639 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6640
6641 __private_extern__ const char *
6642 solockhistory_nr(struct socket *so)
6643 {
6644 size_t n = 0;
6645 int i;
6646 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6647
6648 bzero(lock_history_str, sizeof (lock_history_str));
6649 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6650 n += snprintf(lock_history_str + n,
6651 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6652 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6653 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6654 }
6655 return (lock_history_str);
6656 }
6657
6658 void
6659 socket_lock(struct socket *so, int refcount)
6660 {
6661 void *lr_saved;
6662
6663 lr_saved = __builtin_return_address(0);
6664
6665 if (so->so_proto->pr_lock) {
6666 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6667 } else {
6668 #ifdef MORE_LOCKING_DEBUG
6669 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
6670 LCK_MTX_ASSERT_NOTOWNED);
6671 #endif
6672 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6673 if (refcount)
6674 so->so_usecount++;
6675 so->lock_lr[so->next_lock_lr] = lr_saved;
6676 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6677 }
6678 }
6679
6680 void
6681 socket_lock_assert_owned(struct socket *so)
6682 {
6683 lck_mtx_t *mutex_held;
6684
6685 if (so->so_proto->pr_getlock != NULL)
6686 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6687 else
6688 mutex_held = so->so_proto->pr_domain->dom_mtx;
6689
6690 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6691 }
6692
6693 int
6694 socket_try_lock(struct socket *so)
6695 {
6696 lck_mtx_t *mtx;
6697
6698 if (so->so_proto->pr_getlock != NULL)
6699 mtx = (*so->so_proto->pr_getlock)(so, 0);
6700 else
6701 mtx = so->so_proto->pr_domain->dom_mtx;
6702
6703 return (lck_mtx_try_lock(mtx));
6704 }
6705
6706 void
6707 socket_unlock(struct socket *so, int refcount)
6708 {
6709 void *lr_saved;
6710 lck_mtx_t *mutex_held;
6711
6712 lr_saved = __builtin_return_address(0);
6713
6714 if (so->so_proto == NULL) {
6715 panic("%s: null so_proto so=%p\n", __func__, so);
6716 /* NOTREACHED */
6717 }
6718
6719 if (so && so->so_proto->pr_unlock) {
6720 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6721 } else {
6722 mutex_held = so->so_proto->pr_domain->dom_mtx;
6723 #ifdef MORE_LOCKING_DEBUG
6724 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6725 #endif
6726 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6727 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6728
6729 if (refcount) {
6730 if (so->so_usecount <= 0) {
6731 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6732 "lrh=%s", __func__, so->so_usecount, so,
6733 SOCK_DOM(so), so->so_type,
6734 SOCK_PROTO(so), solockhistory_nr(so));
6735 /* NOTREACHED */
6736 }
6737
6738 so->so_usecount--;
6739 if (so->so_usecount == 0)
6740 sofreelastref(so, 1);
6741 }
6742 lck_mtx_unlock(mutex_held);
6743 }
6744 }
6745
6746 /* Called with socket locked, will unlock socket */
6747 void
6748 sofree(struct socket *so)
6749 {
6750 lck_mtx_t *mutex_held;
6751
6752 if (so->so_proto->pr_getlock != NULL)
6753 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6754 else
6755 mutex_held = so->so_proto->pr_domain->dom_mtx;
6756 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6757
6758 sofreelastref(so, 0);
6759 }
6760
6761 void
6762 soreference(struct socket *so)
6763 {
6764 socket_lock(so, 1); /* locks & takes one reference on socket */
6765 socket_unlock(so, 0); /* unlock only */
6766 }
6767
6768 void
6769 sodereference(struct socket *so)
6770 {
6771 socket_lock(so, 0);
6772 socket_unlock(so, 1);
6773 }
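/*
 * Illustrative in-kernel sketch, not part of this file: soreference() and
 * sodereference() handle their own locking, so a caller that must work
 * without the socket lock can bracket that work with a use-count
 * reference.  The function name below is hypothetical and assumes the
 * caller does not already hold the socket lock.
 */
static void
do_unlocked_work(struct socket *so)
{
	soreference(so);	/* take a use count so 'so' cannot be freed */

	/* ... work that must not (or need not) hold the socket lock ... */

	sodereference(so);	/* drop the use count taken above */
}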
6774
6775 /*
6776 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6777 * possibility of using jumbo clusters. The caller must hold
6778 * the socket lock.
6779 */
6780 void
6781 somultipages(struct socket *so, boolean_t set)
6782 {
6783 if (set)
6784 so->so_flags |= SOF_MULTIPAGES;
6785 else
6786 so->so_flags &= ~SOF_MULTIPAGES;
6787 }
6788
6789 void
6790 soif2kcl(struct socket *so, boolean_t set)
6791 {
6792 if (set)
6793 so->so_flags1 |= SOF1_IF_2KCL;
6794 else
6795 so->so_flags1 &= ~SOF1_IF_2KCL;
6796 }
6797
6798 int
6799 so_isdstlocal(struct socket *so) {
6800
6801 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6802
6803 if (SOCK_DOM(so) == PF_INET)
6804 return (inaddr_local(inp->inp_faddr));
6805 else if (SOCK_DOM(so) == PF_INET6)
6806 return (in6addr_local(&inp->in6p_faddr));
6807
6808 return (0);
6809 }
6810
6811 int
6812 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6813 {
6814 struct sockbuf *rcv, *snd;
6815 int err = 0, defunct;
6816
6817 rcv = &so->so_rcv;
6818 snd = &so->so_snd;
6819
6820 defunct = (so->so_flags & SOF_DEFUNCT);
6821 if (defunct) {
6822 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6823 panic("%s: SB_DROP not set", __func__);
6824 /* NOTREACHED */
6825 }
6826 goto done;
6827 }
6828
6829 if (so->so_flags & SOF_NODEFUNCT) {
6830 if (noforce) {
6831 err = EOPNOTSUPP;
6832 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6833 "name %s level %d) so 0x%llx [%d,%d] "
6834 "is not eligible for defunct "
6835 "(%d)\n", __func__, proc_selfpid(),
6836 proc_best_name(current_proc()), proc_pid(p),
6837 proc_best_name(p), level,
6838 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6839 SOCK_DOM(so), SOCK_TYPE(so), err);
6840 return (err);
6841 }
6842 so->so_flags &= ~SOF_NODEFUNCT;
6843 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6844 "so 0x%llx [%d,%d] defunct by force\n", __func__,
6845 proc_selfpid(), proc_best_name(current_proc()),
6846 proc_pid(p), proc_best_name(p), level,
6847 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6848 SOCK_DOM(so), SOCK_TYPE(so));
6849 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6850 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6851 struct ifnet *ifp = inp->inp_last_outifp;
6852
6853 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6854 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6855 } else if (so->so_flags & SOF_DELEGATED) {
6856 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6857 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6858 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6859 } else if (noforce) {
6860 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6861
6862 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6863 so->so_extended_bk_start = net_uptime();
6864 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6865
6866 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6867
6868 err = EOPNOTSUPP;
6869 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6870 "level %d) extend bk idle so 0x%llx rcv hw %d "
6871 "cc %d\n",
6872 __func__, proc_selfpid(),
6873 proc_best_name(current_proc()), proc_pid(p),
6874 proc_best_name(p), level,
6875 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6876 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6877 return (err);
6878 } else {
6879 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6880 }
6881 }
6882
6883 so->so_flags |= SOF_DEFUNCT;
6884
6885 /* Prevent further data from being appended to the socket buffers */
6886 snd->sb_flags |= SB_DROP;
6887 rcv->sb_flags |= SB_DROP;
6888
6889 /* Flush any existing data in the socket buffers */
6890 if (rcv->sb_cc != 0) {
6891 rcv->sb_flags &= ~SB_SEL;
6892 selthreadclear(&rcv->sb_sel);
6893 sbrelease(rcv);
6894 }
6895 if (snd->sb_cc != 0) {
6896 snd->sb_flags &= ~SB_SEL;
6897 selthreadclear(&snd->sb_sel);
6898 sbrelease(snd);
6899 }
6900
6901 done:
6902 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6903 "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6904 proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6905 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6906 SOCK_TYPE(so), defunct ? "is already" : "marked as",
6907 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6908
6909 return (err);
6910 }
6911
6912 int
6913 sodefunct(struct proc *p, struct socket *so, int level)
6914 {
6915 struct sockbuf *rcv, *snd;
6916
6917 if (!(so->so_flags & SOF_DEFUNCT)) {
6918 panic("%s improperly called", __func__);
6919 /* NOTREACHED */
6920 }
6921 if (so->so_state & SS_DEFUNCT)
6922 goto done;
6923
6924 rcv = &so->so_rcv;
6925 snd = &so->so_snd;
6926
6927 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6928 char s[MAX_IPv6_STR_LEN];
6929 char d[MAX_IPv6_STR_LEN];
6930 struct inpcb *inp = sotoinpcb(so);
6931
6932 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6933 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6934 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6935 __func__, proc_selfpid(), proc_best_name(current_proc()),
6936 proc_pid(p), proc_best_name(p), level,
6937 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6938 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6939 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6940 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6941 s, sizeof (s)), ntohs(inp->in6p_lport),
6942 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6943 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6944 d, sizeof (d)), ntohs(inp->in6p_fport),
6945 (uint32_t)rcv->sb_sel.si_flags,
6946 (uint32_t)snd->sb_sel.si_flags,
6947 rcv->sb_flags, snd->sb_flags);
6948 } else {
6949 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6950 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6951 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6952 proc_selfpid(), proc_best_name(current_proc()),
6953 proc_pid(p), proc_best_name(p), level,
6954 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6955 SOCK_DOM(so), SOCK_TYPE(so),
6956 (uint32_t)rcv->sb_sel.si_flags,
6957 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6958 snd->sb_flags);
6959 }
6960
6961 /*
6962 * Unwedge threads blocked on sbwait() and sb_lock().
6963 */
6964 sbwakeup(rcv);
6965 sbwakeup(snd);
6966
6967 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6968 if (rcv->sb_flags & SB_LOCK)
6969 sbunlock(rcv, TRUE); /* keep socket locked */
6970 if (snd->sb_flags & SB_LOCK)
6971 sbunlock(snd, TRUE); /* keep socket locked */
6972
6973 /*
6974 * Flush the buffers and disconnect. We explicitly call shutdown
6975 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6976 * states are set for the socket. This would also flush out data
6977 * hanging off the receive list of this socket.
6978 */
6979 (void) soshutdownlock_final(so, SHUT_RD);
6980 (void) soshutdownlock_final(so, SHUT_WR);
6981 (void) sodisconnectlocked(so);
6982
6983 /*
6984 * Explicitly handle connectionless-protocol disconnection
6985 * and release any remaining data in the socket buffers.
6986 */
6987 if (!(so->so_state & SS_ISDISCONNECTED))
6988 (void) soisdisconnected(so);
6989
6990 if (so->so_error == 0)
6991 so->so_error = EBADF;
6992
6993 if (rcv->sb_cc != 0) {
6994 rcv->sb_flags &= ~SB_SEL;
6995 selthreadclear(&rcv->sb_sel);
6996 sbrelease(rcv);
6997 }
6998 if (snd->sb_cc != 0) {
6999 snd->sb_flags &= ~SB_SEL;
7000 selthreadclear(&snd->sb_sel);
7001 sbrelease(snd);
7002 }
7003 so->so_state |= SS_DEFUNCT;
7004 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7005
7006 done:
7007 return (0);
7008 }
7009
7010 int
7011 soresume(struct proc *p, struct socket *so, int locked)
7012 {
7013 if (locked == 0)
7014 socket_lock(so, 1);
7015
7016 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7017 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7018 "[%d,%d] resumed from bk idle\n",
7019 __func__, proc_selfpid(), proc_best_name(current_proc()),
7020 proc_pid(p), proc_best_name(p),
7021 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7022 SOCK_DOM(so), SOCK_TYPE(so));
7023
7024 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7025 so->so_extended_bk_start = 0;
7026 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7027
7028 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7029 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7030 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7031 }
7032 if (locked == 0)
7033 socket_unlock(so, 1);
7034
7035 return (0);
7036 }
7037
7038 /*
7039 * Does not attempt to account for sockets that are delegated from
7040 * the current process
7041 */
7042 int
7043 so_set_extended_bk_idle(struct socket *so, int optval)
7044 {
7045 int error = 0;
7046
7047 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7048 SOCK_PROTO(so) != IPPROTO_TCP) {
7049 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7050 error = EOPNOTSUPP;
7051 } else if (optval == 0) {
7052 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7053
7054 soresume(current_proc(), so, 1);
7055 } else {
7056 struct proc *p = current_proc();
7057 int i;
7058 struct filedesc *fdp;
7059 int count = 0;
7060
7061 /*
7062 * Unlock socket to avoid lock ordering issue with
7063 * the proc fd table lock
7064 */
7065 socket_unlock(so, 0);
7066
7067 proc_fdlock(p);
7068
7069 fdp = p->p_fd;
7070 for (i = 0; i < fdp->fd_nfiles; i++) {
7071 struct fileproc *fp = fdp->fd_ofiles[i];
7072 struct socket *so2;
7073
7074 if (fp == NULL ||
7075 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7076 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7077 continue;
7078
7079 so2 = (struct socket *)fp->f_fglob->fg_data;
7080 if (so != so2 &&
7081 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7082 count++;
7083 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7084 break;
7085 }
7086 proc_fdunlock(p);
7087
7088 socket_lock(so, 0);
7089
7090 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7091 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7092 error = EBUSY;
7093 } else if (so->so_flags & SOF_DELEGATED) {
7094 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7095 error = EBUSY;
7096 } else {
7097 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7098 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7099 }
7100 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7101 "%s marked for extended bk idle\n",
7102 __func__, proc_selfpid(), proc_best_name(current_proc()),
7103 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7104 SOCK_DOM(so), SOCK_TYPE(so),
7105 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7106 "is" : "not");
7107 }
7108
7109 return (error);
7110 }
7111
7112 static void
7113 so_stop_extended_bk_idle(struct socket *so)
7114 {
7115 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7116 so->so_extended_bk_start = 0;
7117
7118 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7119 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7120 /*
7121 * Force defunct
7122 */
7123 sosetdefunct(current_proc(), so,
7124 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7125 if (so->so_flags & SOF_DEFUNCT) {
7126 sodefunct(current_proc(), so,
7127 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7128 }
7129 }
7130
7131 void
7132 so_drain_extended_bk_idle(struct socket *so)
7133 {
7134 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7135 /*
7136 * Only penalize sockets that have outstanding data
7137 */
7138 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7139 so_stop_extended_bk_idle(so);
7140
7141 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7142 }
7143 }
7144 }
7145
7146 /*
7147 * Return value tells whether the socket is still in extended background idle
7148 */
7149 int
7150 so_check_extended_bk_idle_time(struct socket *so)
7151 {
7152 int ret = 1;
7153
7154 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7155 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7156 __func__, proc_selfpid(), proc_best_name(current_proc()),
7157 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7158 SOCK_DOM(so), SOCK_TYPE(so));
7159 if (net_uptime() - so->so_extended_bk_start >
7160 soextbkidlestat.so_xbkidle_time) {
7161 so_stop_extended_bk_idle(so);
7162
7163 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7164
7165 ret = 0;
7166 } else {
7167 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7168
7169 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7170 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7171 }
7172 }
7173
7174 return (ret);
7175 }
7176
7177 void
7178 resume_proc_sockets(proc_t p)
7179 {
7180 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7181 struct filedesc *fdp;
7182 int i;
7183
7184 proc_fdlock(p);
7185 fdp = p->p_fd;
7186 for (i = 0; i < fdp->fd_nfiles; i++) {
7187 struct fileproc *fp;
7188 struct socket *so;
7189
7190 fp = fdp->fd_ofiles[i];
7191 if (fp == NULL ||
7192 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7193 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7194 continue;
7195
7196 so = (struct socket *)fp->f_fglob->fg_data;
7197 (void) soresume(p, so, 0);
7198 }
7199 proc_fdunlock(p);
7200
7201 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7202 }
7203 }
7204
7205 __private_extern__ int
7206 so_set_recv_anyif(struct socket *so, int optval)
7207 {
7208 int ret = 0;
7209
7210 #if INET6
7211 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7212 #else
7213 if (SOCK_DOM(so) == PF_INET) {
7214 #endif /* !INET6 */
7215 if (optval)
7216 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7217 else
7218 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7219 }
7220
7221
7222 return (ret);
7223 }
7224
7225 __private_extern__ int
7226 so_get_recv_anyif(struct socket *so)
7227 {
7228 int ret = 0;
7229
7230 #if INET6
7231 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7232 #else
7233 if (SOCK_DOM(so) == PF_INET) {
7234 #endif /* !INET6 */
7235 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7236 }
7237
7238 return (ret);
7239 }
7240
7241 int
7242 so_set_restrictions(struct socket *so, uint32_t vals)
7243 {
7244 int nocell_old, nocell_new;
7245 int noexpensive_old, noexpensive_new;
7246
7247 /*
7248 * Deny-type restrictions are trapdoors; once set they cannot be
7249 * unset for the lifetime of the socket. This allows them to be
7250 * issued by a framework on behalf of the application without
7251 * having to worry that they can be undone.
7252 *
7253 * Note here that socket-level restrictions override any protocol-
7254 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7255 * restriction issued on the socket has a higher precedence
7256 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7257 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7258 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7259 */
7260 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7261 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7262 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7263 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7264 SO_RESTRICT_DENY_EXPENSIVE));
7265 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7266 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7267
7268 /* we can only set, not clear restrictions */
7269 if ((nocell_new - nocell_old) == 0 &&
7270 (noexpensive_new - noexpensive_old) == 0)
7271 return (0);
7272 #if INET6
7273 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7274 #else
7275 if (SOCK_DOM(so) == PF_INET) {
7276 #endif /* !INET6 */
7277 if (nocell_new - nocell_old != 0) {
7278 /*
7279 * if deny cellular is now set, do what's needed
7280 * for INPCB
7281 */
7282 inp_set_nocellular(sotoinpcb(so));
7283 }
7284 if (noexpensive_new - noexpensive_old != 0) {
7285 inp_set_noexpensive(sotoinpcb(so));
7286 }
7287 }
7288
7289 if (SOCK_DOM(so) == PF_MULTIPATH)
7290 mptcp_set_restrictions(so);
7291
7292 return (0);
7293 }
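/*
 * Illustrative sketch, not part of this file: the "trapdoor" semantics
 * described above reduce to a monotonic OR -- restriction bits may be
 * added but never cleared -- with the old/new comparison used only to
 * detect a 0 -> 1 transition that needs extra work.  A generic,
 * hypothetical version of that pattern:
 */
#include <stdint.h>

static uint32_t
trapdoor_set(uint32_t *flagsp, uint32_t allowed, uint32_t requested)
{
	uint32_t old = *flagsp;

	*flagsp |= (requested & allowed);	/* set-only; bits are never cleared */
	return (*flagsp & ~old);		/* bits that just transitioned on */
}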
7294
7295 uint32_t
7296 so_get_restrictions(struct socket *so)
7297 {
7298 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7299 SO_RESTRICT_DENY_OUT |
7300 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7301 }
7302
7303 int
7304 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7305 {
7306 struct proc *ep = PROC_NULL;
7307 int error = 0;
7308
7309 /* pid 0 is reserved for kernel */
7310 if (epid == 0) {
7311 error = EINVAL;
7312 goto done;
7313 }
7314
7315 /*
7316 * If this is an in-kernel socket, prevent its delegate
7317 * association from changing unless the socket option is
7318 * coming from within the kernel itself.
7319 */
7320 if (so->last_pid == 0 && p != kernproc) {
7321 error = EACCES;
7322 goto done;
7323 }
7324
7325 /*
7326 * If this is issued by a process that's recorded as the
7327 * real owner of the socket, or if the pid is the same as
7328 * the process's own pid, then proceed. Otherwise ensure
7329 * that the issuing process has the necessary privileges.
7330 */
7331 if (epid != so->last_pid || epid != proc_pid(p)) {
7332 if ((error = priv_check_cred(kauth_cred_get(),
7333 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7334 error = EACCES;
7335 goto done;
7336 }
7337 }
7338
7339 /* Find the process that corresponds to the effective pid */
7340 if ((ep = proc_find(epid)) == PROC_NULL) {
7341 error = ESRCH;
7342 goto done;
7343 }
7344
7345 /*
7346 * If a process tries to delegate the socket to itself, then
7347 * there's really nothing to do; treat it as a way for the
7348 * delegate association to be cleared. Note that we check
7349 * the passed-in proc rather than calling proc_selfpid(),
7350 * as we need to check the process issuing the socket option
7351 * which could be kernproc. Given that we don't allow 0 for
7352 * effective pid, it means that a delegated in-kernel socket
7353 * stays delegated during its lifetime (which is probably OK.)
7354 */
7355 if (epid == proc_pid(p)) {
7356 so->so_flags &= ~SOF_DELEGATED;
7357 so->e_upid = 0;
7358 so->e_pid = 0;
7359 uuid_clear(so->e_uuid);
7360 } else {
7361 so->so_flags |= SOF_DELEGATED;
7362 so->e_upid = proc_uniqueid(ep);
7363 so->e_pid = proc_pid(ep);
7364 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7365 }
7366 done:
7367 if (error == 0 && net_io_policy_log) {
7368 uuid_string_t buf;
7369
7370 uuid_unparse(so->e_uuid, buf);
7371 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7372 "euuid %s%s\n", __func__, proc_name_address(p),
7373 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7374 SOCK_DOM(so), SOCK_TYPE(so),
7375 so->e_pid, proc_name_address(ep), buf,
7376 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7377 } else if (error != 0 && net_io_policy_log) {
7378 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7379 "ERROR (%d)\n", __func__, proc_name_address(p),
7380 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7381 SOCK_DOM(so), SOCK_TYPE(so),
7382 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7383 proc_name_address(ep), error);
7384 }
7385
7386 /* Update this socket's policy upon success */
7387 if (error == 0) {
7388 so->so_policy_gencnt *= -1;
7389 so_update_policy(so);
7390 #if NECP
7391 so_update_necp_policy(so, NULL, NULL);
7392 #endif /* NECP */
7393 }
7394
7395 if (ep != PROC_NULL)
7396 proc_rele(ep);
7397
7398 return (error);
7399 }
7400
7401 int
7402 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7403 {
7404 uuid_string_t buf;
7405 uuid_t uuid;
7406 int error = 0;
7407
7408 /* UUID must not be all-zeroes (reserved for kernel) */
7409 if (uuid_is_null(euuid)) {
7410 error = EINVAL;
7411 goto done;
7412 }
7413
7414 /*
7415 * If this is an in-kernel socket, prevent its delegate
7416 * association from changing unless the socket option is
7417 * coming from within the kernel itself.
7418 */
7419 if (so->last_pid == 0 && p != kernproc) {
7420 error = EACCES;
7421 goto done;
7422 }
7423
7424 /* Get the UUID of the issuing process */
7425 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7426
7427 /*
7428 * If this is issued by a process that's recorded as the
7429 * real owner of the socket, or if the uuid is the same as
7430 * the process's own uuid, then proceed. Otherwise ensure
7431 * that the issuing process has the necessary privileges.
7432 */
7433 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7434 uuid_compare(euuid, uuid) != 0) {
7435 if ((error = priv_check_cred(kauth_cred_get(),
7436 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7437 error = EACCES;
7438 goto done;
7439 }
7440 }
7441
7442 /*
7443 * If a process tries to delegate the socket to itself, then
7444 * there's really nothing to do; treat it as a way for the
7445 * delegate association to be cleared. Note that we check
7446 * the uuid of the passed-in proc rather than that of the
7447 * current process, as we need to check the process issuing
7448 * the socket option which could be kernproc itself. Given
7449 * that we don't allow 0 for effective uuid, it means that
7450 * a delegated in-kernel socket stays delegated during its
7451 * lifetime (which is okay.)
7452 */
7453 if (uuid_compare(euuid, uuid) == 0) {
7454 so->so_flags &= ~SOF_DELEGATED;
7455 so->e_upid = 0;
7456 so->e_pid = 0;
7457 uuid_clear(so->e_uuid);
7458 } else {
7459 so->so_flags |= SOF_DELEGATED;
7460 /*
7461 * Unlike so_set_effective_pid(), we only have the UUID
7462 * here and the process ID is not known. Inherit the
7463 * real {pid,upid} of the socket.
7464 */
7465 so->e_upid = so->last_upid;
7466 so->e_pid = so->last_pid;
7467 uuid_copy(so->e_uuid, euuid);
7468 }
7469
7470 done:
7471 if (error == 0 && net_io_policy_log) {
7472 uuid_unparse(so->e_uuid, buf);
7473 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7474 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7475 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7476 SOCK_TYPE(so), so->e_pid, buf,
7477 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7478 } else if (error != 0 && net_io_policy_log) {
7479 uuid_unparse(euuid, buf);
7480 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7481 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7482 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7483 SOCK_TYPE(so), buf, error);
7484 }
7485
7486 /* Update this socket's policy upon success */
7487 if (error == 0) {
7488 so->so_policy_gencnt *= -1;
7489 so_update_policy(so);
7490 #if NECP
7491 so_update_necp_policy(so, NULL, NULL);
7492 #endif /* NECP */
7493 }
7494
7495 return (error);
7496 }
7497
7498 void
7499 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7500 uint32_t ev_datalen)
7501 {
7502 struct kev_msg ev_msg;
7503
7504 /*
7505 * A netpolicy event always starts with a netpolicy_event_data
7506 * structure, but the caller can provide for a longer event
7507 * structure to post, depending on the event code.
7508 */
7509 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7510
7511 bzero(&ev_msg, sizeof (ev_msg));
7512 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7513 ev_msg.kev_class = KEV_NETWORK_CLASS;
7514 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7515 ev_msg.event_code = ev_code;
7516
7517 ev_msg.dv[0].data_ptr = ev_data;
7518 ev_msg.dv[0].data_length = ev_datalen;
7519
7520 kev_post_msg(&ev_msg);
7521 }
7522
7523 void
7524 socket_post_kev_msg(uint32_t ev_code,
7525 struct kev_socket_event_data *ev_data,
7526 uint32_t ev_datalen)
7527 {
7528 struct kev_msg ev_msg;
7529
7530 bzero(&ev_msg, sizeof(ev_msg));
7531 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7532 ev_msg.kev_class = KEV_NETWORK_CLASS;
7533 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7534 ev_msg.event_code = ev_code;
7535
7536 ev_msg.dv[0].data_ptr = ev_data;
7537 ev_msg.dv[0].data_length = ev_datalen;
7538
7539 kev_post_msg(&ev_msg);
7540 }
7541
7542 void
7543 socket_post_kev_msg_closed(struct socket *so)
7544 {
7545 struct kev_socket_closed ev;
7546 struct sockaddr *socksa = NULL, *peersa = NULL;
7547 int err;
7548 bzero(&ev, sizeof(ev));
7549 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7550 if (err == 0) {
7551 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7552 &peersa);
7553 if (err == 0) {
7554 memcpy(&ev.ev_data.kev_sockname, socksa,
7555 min(socksa->sa_len,
7556 sizeof (ev.ev_data.kev_sockname)));
7557 memcpy(&ev.ev_data.kev_peername, peersa,
7558 min(peersa->sa_len,
7559 sizeof (ev.ev_data.kev_peername)));
7560 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7561 &ev.ev_data, sizeof (ev));
7562 }
7563 }
7564 if (socksa != NULL)
7565 FREE(socksa, M_SONAME);
7566 if (peersa != NULL)
7567 FREE(peersa, M_SONAME);
7568 }