]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/uipc_socket.c
c552c417506e20ce08114831f06e7446b9054bd8
[apple/xnu.git] / bsd / kern / uipc_socket.c
1 /*
2 * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/in_tclass.h>
106 #include <netinet/tcp_var.h>
107 #include <netinet/ip6.h>
108 #include <netinet6/ip6_var.h>
109 #include <netinet/flow_divert.h>
110 #include <kern/zalloc.h>
111 #include <kern/locks.h>
112 #include <machine/limits.h>
113 #include <libkern/OSAtomic.h>
114 #include <pexpert/pexpert.h>
115 #include <kern/assert.h>
116 #include <kern/task.h>
117 #include <kern/policy_internal.h>
118
119 #include <sys/kpi_mbuf.h>
120 #include <sys/mcache.h>
121 #include <sys/unpcb.h>
122
123 #if CONFIG_MACF
124 #include <security/mac.h>
125 #include <security/mac_framework.h>
126 #endif /* MAC */
127
128 #if MULTIPATH
129 #include <netinet/mp_pcb.h>
130 #include <netinet/mptcp_var.h>
131 #endif /* MULTIPATH */
132
/*
 * Round `a' up to a multiple of `b'.  The result is produced by masking,
 * so it is only meaningful when `b' is a power of two.
 */
#define	ROUNDUP(a, b)	(((a) + ((b) - 1)) & (~((b) - 1)))

/*
 * Wrapper for kernel addresses: on DEBUG or DEVELOPMENT builds the value
 * is passed through unchanged, on every other build it is routed through
 * VM_KERNEL_ADDRPERM().
 */
#if !(DEBUG || DEVELOPMENT)
#define	DEBUG_KERNEL_ADDRPERM(_v)	VM_KERNEL_ADDRPERM(_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v)	(_v)
#endif
140
141 /* TODO: this should be in a header file somewhere */
142 extern char *proc_name_address(void *p);
143 extern char *proc_best_name(proc_t);
144
145 static u_int32_t so_cache_hw; /* High water mark for socache */
146 static u_int32_t so_cache_timeouts; /* number of timeouts */
147 static u_int32_t so_cache_max_freed; /* max freed per timeout */
148 static u_int32_t cached_sock_count = 0;
149 STAILQ_HEAD(, socket) so_cache_head;
150 int max_cached_sock_count = MAX_CACHED_SOCKETS;
151 static u_int32_t so_cache_time;
152 static int socketinit_done;
153 static struct zone *so_cache_zone;
154
155 static lck_grp_t *so_cache_mtx_grp;
156 static lck_attr_t *so_cache_mtx_attr;
157 static lck_grp_attr_t *so_cache_mtx_grp_attr;
158 static lck_mtx_t *so_cache_mtx;
159
160 #include <machine/limits.h>
161
/* Callbacks referenced by soread_filtops and soexcept_filtops */
static int	filt_sorattach(struct knote *kn);
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static int	filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sorprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

/* Callbacks referenced by sowrite_filtops */
static int	filt_sowattach(struct knote *kn);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sowprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

/* Callbacks referenced by sock_filtops */
static int	filt_sockattach(struct knote *kn);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);
static int	filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sockprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

/* struct timeval marshalling helpers for socket options */
static int	sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int	sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183 struct filterops soread_filtops = {
184 .f_isfd = 1,
185 .f_attach = filt_sorattach,
186 .f_detach = filt_sordetach,
187 .f_event = filt_soread,
188 .f_touch = filt_sortouch,
189 .f_process = filt_sorprocess,
190 };
191
192 struct filterops sowrite_filtops = {
193 .f_isfd = 1,
194 .f_attach = filt_sowattach,
195 .f_detach = filt_sowdetach,
196 .f_event = filt_sowrite,
197 .f_touch = filt_sowtouch,
198 .f_process = filt_sowprocess,
199 };
200
201 struct filterops sock_filtops = {
202 .f_isfd = 1,
203 .f_attach = filt_sockattach,
204 .f_detach = filt_sockdetach,
205 .f_event = filt_sockev,
206 .f_touch = filt_socktouch,
207 .f_process = filt_sockprocess,
208 };
209
210 struct filterops soexcept_filtops = {
211 .f_isfd = 1,
212 .f_attach = filt_sorattach,
213 .f_detach = filt_sordetach,
214 .f_event = filt_soread,
215 .f_touch = filt_sortouch,
216 .f_process = filt_sorprocess,
217 };
218
219 SYSCTL_DECL(_kern_ipc);
220
221 #define EVEN_MORE_LOCKING_DEBUG 0
222
223 int socket_debug = 0;
224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
226
227 static unsigned long sodefunct_calls = 0;
228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229 &sodefunct_calls, "");
230
231 static int socket_zone = M_SOCKET;
232 so_gen_t so_gencnt; /* generation count for sockets */
233
234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending down packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger that 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 extern struct inpcbinfo tcbinfo;
325
326 /* TODO: these should be in header file */
327 extern int get_inpcb_str_size(void);
328 extern int get_tcp_str_size(void);
329
330 static unsigned int sl_zone_size; /* size of sockaddr_list */
331 static struct zone *sl_zone; /* zone for sockaddr_list */
332
333 static unsigned int se_zone_size; /* size of sockaddr_entry */
334 static struct zone *se_zone; /* zone for sockaddr_entry */
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, int);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum of extended background idle sockets per process
345 * Set to zero to disable further setting of the option
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371 /*
372 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
373 * setting the DSCP code on the packet based on the service class; see
374 * <rdar://problem/11277343> for details.
375 */
376 __private_extern__ u_int32_t sotcdb = 0;
377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
378 &sotcdb, 0, "");
379
380 void
381 socketinit(void)
382 {
383 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
384 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
385
386 #ifdef __LP64__
387 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
388 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
391 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
393 #else
394 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
395 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
398 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
400 #endif
401
402 if (socketinit_done) {
403 printf("socketinit: already called...\n");
404 return;
405 }
406 socketinit_done = 1;
407
408 PE_parse_boot_argn("socket_debug", &socket_debug,
409 sizeof (socket_debug));
410
411 /*
412 * allocate lock group attribute and group for socket cache mutex
413 */
414 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
415 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
416 so_cache_mtx_grp_attr);
417
418 /*
419 * allocate the lock attribute for socket cache mutex
420 */
421 so_cache_mtx_attr = lck_attr_alloc_init();
422
423 /* cached sockets mutex */
424 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
425 if (so_cache_mtx == NULL) {
426 panic("%s: unable to allocate so_cache_mtx\n", __func__);
427 /* NOTREACHED */
428 }
429 STAILQ_INIT(&so_cache_head);
430
431 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
432 + get_inpcb_str_size() + 4 + get_tcp_str_size());
433
434 so_cache_zone = zinit(so_cache_zone_element_size,
435 (120000 * so_cache_zone_element_size), 8192, "socache zone");
436 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
437 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
438
439 sl_zone_size = sizeof (struct sockaddr_list);
440 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
441 "sockaddr_list")) == NULL) {
442 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
443 /* NOTREACHED */
444 }
445 zone_change(sl_zone, Z_CALLERACCT, FALSE);
446 zone_change(sl_zone, Z_EXPAND, TRUE);
447
448 se_zone_size = sizeof (struct sockaddr_entry);
449 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
450 "sockaddr_entry")) == NULL) {
451 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
452 /* NOTREACHED */
453 }
454 zone_change(se_zone, Z_CALLERACCT, FALSE);
455 zone_change(se_zone, Z_EXPAND, TRUE);
456
457 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
458 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
459 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
460 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
461
462 in_pcbinit();
463 sflt_init();
464 socket_tclass_init();
465 #if MULTIPATH
466 mp_pcbinit();
467 #endif /* MULTIPATH */
468 }
469
470 static void
471 cached_sock_alloc(struct socket **so, int waitok)
472 {
473 caddr_t temp;
474 uintptr_t offset;
475
476 lck_mtx_lock(so_cache_mtx);
477
478 if (!STAILQ_EMPTY(&so_cache_head)) {
479 VERIFY(cached_sock_count > 0);
480
481 *so = STAILQ_FIRST(&so_cache_head);
482 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
483 STAILQ_NEXT((*so), so_cache_ent) = NULL;
484
485 cached_sock_count--;
486 lck_mtx_unlock(so_cache_mtx);
487
488 temp = (*so)->so_saved_pcb;
489 bzero((caddr_t)*so, sizeof (struct socket));
490
491 (*so)->so_saved_pcb = temp;
492 } else {
493
494 lck_mtx_unlock(so_cache_mtx);
495
496 if (waitok)
497 *so = (struct socket *)zalloc(so_cache_zone);
498 else
499 *so = (struct socket *)zalloc_noblock(so_cache_zone);
500
501 if (*so == NULL)
502 return;
503
504 bzero((caddr_t)*so, sizeof (struct socket));
505
506 /*
507 * Define offsets for extra structures into our
508 * single block of memory. Align extra structures
509 * on longword boundaries.
510 */
511
512 offset = (uintptr_t)*so;
513 offset += sizeof (struct socket);
514
515 offset = ALIGN(offset);
516
517 (*so)->so_saved_pcb = (caddr_t)offset;
518 offset += get_inpcb_str_size();
519
520 offset = ALIGN(offset);
521
522 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
523 (caddr_t)offset;
524 }
525
526 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
527 }
528
529 static void
530 cached_sock_free(struct socket *so)
531 {
532
533 lck_mtx_lock(so_cache_mtx);
534
535 so_cache_time = net_uptime();
536 if (++cached_sock_count > max_cached_sock_count) {
537 --cached_sock_count;
538 lck_mtx_unlock(so_cache_mtx);
539 zfree(so_cache_zone, so);
540 } else {
541 if (so_cache_hw < cached_sock_count)
542 so_cache_hw = cached_sock_count;
543
544 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
545
546 so->cache_timestamp = so_cache_time;
547 lck_mtx_unlock(so_cache_mtx);
548 }
549 }
550
551 void
552 so_update_last_owner_locked(struct socket *so, proc_t self)
553 {
554 if (so->last_pid != 0) {
555 /*
556 * last_pid and last_upid should remain zero for sockets
557 * created using sock_socket. The check above achieves that
558 */
559 if (self == PROC_NULL)
560 self = current_proc();
561
562 if (so->last_upid != proc_uniqueid(self) ||
563 so->last_pid != proc_pid(self)) {
564 so->last_upid = proc_uniqueid(self);
565 so->last_pid = proc_pid(self);
566 proc_getexecutableuuid(self, so->last_uuid,
567 sizeof (so->last_uuid));
568 }
569 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
570 }
571 }
572
573 void
574 so_update_policy(struct socket *so)
575 {
576 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
577 (void) inp_update_policy(sotoinpcb(so));
578 }
579
#if NECP
/*
 * Call inp_update_necp_policy() on the socket's inpcb, forwarding the two
 * optional override addresses and a final argument of 0.  Only PF_INET
 * and PF_INET6 sockets are handled.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	switch (SOCK_DOM(so)) {
	case PF_INET:
	case PF_INET6:
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
		break;
	default:
		break;
	}
}
#endif /* NECP */
590
591 boolean_t
592 so_cache_timer(void)
593 {
594 struct socket *p;
595 int n_freed = 0;
596 boolean_t rc = FALSE;
597
598 lck_mtx_lock(so_cache_mtx);
599 so_cache_timeouts++;
600 so_cache_time = net_uptime();
601
602 while (!STAILQ_EMPTY(&so_cache_head)) {
603 VERIFY(cached_sock_count > 0);
604 p = STAILQ_FIRST(&so_cache_head);
605 if ((so_cache_time - p->cache_timestamp) <
606 SO_CACHE_TIME_LIMIT)
607 break;
608
609 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
610 --cached_sock_count;
611
612 zfree(so_cache_zone, p);
613
614 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
615 so_cache_max_freed++;
616 break;
617 }
618 }
619
620 /* Schedule again if there is more to cleanup */
621 if (!STAILQ_EMPTY(&so_cache_head))
622 rc = TRUE;
623
624 lck_mtx_unlock(so_cache_mtx);
625 return (rc);
626 }
627
628 /*
629 * Get a socket structure from our zone, and initialize it.
630 * We don't implement `waitok' yet (see comments in uipc_domain.c).
631 * Note that it would probably be better to allocate socket
632 * and PCB at the same time, but I'm not convinced that all
633 * the protocols can be easily modified to do this.
634 */
635 struct socket *
636 soalloc(int waitok, int dom, int type)
637 {
638 struct socket *so;
639
640 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
641 cached_sock_alloc(&so, waitok);
642 } else {
643 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
644 M_WAITOK);
645 if (so != NULL)
646 bzero(so, sizeof (*so));
647 }
648 if (so != NULL) {
649 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
650 so->so_zone = socket_zone;
651 #if CONFIG_MACF_SOCKET
652 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
653 if (mac_socket_label_init(so, !waitok) != 0) {
654 sodealloc(so);
655 return (NULL);
656 }
657 #endif /* MAC_SOCKET */
658 }
659
660 return (so);
661 }
662
663 int
664 socreate_internal(int dom, struct socket **aso, int type, int proto,
665 struct proc *p, uint32_t flags, struct proc *ep)
666 {
667 struct protosw *prp;
668 struct socket *so;
669 int error = 0;
670
671 #if TCPDEBUG
672 extern int tcpconsdebug;
673 #endif
674
675 VERIFY(aso != NULL);
676 *aso = NULL;
677
678 if (proto != 0)
679 prp = pffindproto(dom, proto, type);
680 else
681 prp = pffindtype(dom, type);
682
683 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
684 if (pffinddomain(dom) == NULL)
685 return (EAFNOSUPPORT);
686 if (proto != 0) {
687 if (pffindprotonotype(dom, proto) != NULL)
688 return (EPROTOTYPE);
689 }
690 return (EPROTONOSUPPORT);
691 }
692 if (prp->pr_type != type)
693 return (EPROTOTYPE);
694 so = soalloc(1, dom, type);
695 if (so == NULL)
696 return (ENOBUFS);
697
698 if (flags & SOCF_ASYNC)
699 so->so_state |= SS_NBIO;
700 #if MULTIPATH
701 if (flags & SOCF_MP_SUBFLOW) {
702 /*
703 * A multipath subflow socket is used internally in the kernel,
704 * therefore it does not have a file desciptor associated by
705 * default.
706 */
707 so->so_state |= SS_NOFDREF;
708 so->so_flags |= SOF_MP_SUBFLOW;
709 }
710 #endif /* MULTIPATH */
711
712 TAILQ_INIT(&so->so_incomp);
713 TAILQ_INIT(&so->so_comp);
714 so->so_type = type;
715 so->last_upid = proc_uniqueid(p);
716 so->last_pid = proc_pid(p);
717 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
718 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
719
720 if (ep != PROC_NULL && ep != p) {
721 so->e_upid = proc_uniqueid(ep);
722 so->e_pid = proc_pid(ep);
723 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
724 so->so_flags |= SOF_DELEGATED;
725 }
726
727 so->so_cred = kauth_cred_proc_ref(p);
728 if (!suser(kauth_cred_get(), NULL))
729 so->so_state |= SS_PRIV;
730
731 so->so_proto = prp;
732 so->so_rcv.sb_flags |= SB_RECV;
733 so->so_rcv.sb_so = so->so_snd.sb_so = so;
734 so->next_lock_lr = 0;
735 so->next_unlock_lr = 0;
736
737 #if CONFIG_MACF_SOCKET
738 mac_socket_label_associate(kauth_cred_get(), so);
739 #endif /* MAC_SOCKET */
740
741 /*
742 * Attachment will create the per pcb lock if necessary and
743 * increase refcount for creation, make sure it's done before
744 * socket is inserted in lists.
745 */
746 so->so_usecount++;
747
748 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
749 if (error != 0) {
750 /*
751 * Warning:
752 * If so_pcb is not zero, the socket will be leaked,
753 * so protocol attachment handler must be coded carefuly
754 */
755 so->so_state |= SS_NOFDREF;
756 VERIFY(so->so_usecount > 0);
757 so->so_usecount--;
758 sofreelastref(so, 1); /* will deallocate the socket */
759 return (error);
760 }
761
762 atomic_add_32(&prp->pr_domain->dom_refs, 1);
763 TAILQ_INIT(&so->so_evlist);
764
765 /* Attach socket filters for this protocol */
766 sflt_initsock(so);
767 #if TCPDEBUG
768 if (tcpconsdebug == 2)
769 so->so_options |= SO_DEBUG;
770 #endif
771 so_set_default_traffic_class(so);
772
773 /*
774 * If this thread or task is marked to create backgrounded sockets,
775 * mark the socket as background.
776 */
777 if (proc_get_effective_thread_policy(current_thread(),
778 TASK_POLICY_NEW_SOCKETS_BG)) {
779 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
780 so->so_background_thread = current_thread();
781 }
782
783 switch (dom) {
784 /*
785 * Don't mark Unix domain, system or multipath sockets as
786 * eligible for defunct by default.
787 */
788 case PF_LOCAL:
789 case PF_SYSTEM:
790 case PF_MULTIPATH:
791 so->so_flags |= SOF_NODEFUNCT;
792 break;
793 default:
794 break;
795 }
796
797 /*
798 * Entitlements can't be checked at socket creation time except if the
799 * application requested a feature guarded by a privilege (c.f., socket
800 * delegation).
801 * The priv(9) and the Sandboxing APIs are designed with the idea that
802 * a privilege check should only be triggered by a userland request.
803 * A privilege check at socket creation time is time consuming and
804 * could trigger many authorisation error messages from the security
805 * APIs.
806 */
807
808 *aso = so;
809
810 return (0);
811 }
812
813 /*
814 * Returns: 0 Success
815 * EAFNOSUPPORT
816 * EPROTOTYPE
817 * EPROTONOSUPPORT
818 * ENOBUFS
819 * <pru_attach>:ENOBUFS[AF_UNIX]
820 * <pru_attach>:ENOBUFS[TCP]
821 * <pru_attach>:ENOMEM[TCP]
822 * <pru_attach>:??? [other protocol families, IPSEC]
823 */
824 int
825 socreate(int dom, struct socket **aso, int type, int proto)
826 {
827 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
828 PROC_NULL));
829 }
830
831 int
832 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
833 {
834 int error = 0;
835 struct proc *ep = PROC_NULL;
836
837 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
838 error = ESRCH;
839 goto done;
840 }
841
842 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
843
844 /*
845 * It might not be wise to hold the proc reference when calling
846 * socreate_internal since it calls soalloc with M_WAITOK
847 */
848 done:
849 if (ep != PROC_NULL)
850 proc_rele(ep);
851
852 return (error);
853 }
854
855 /*
856 * Returns: 0 Success
857 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
858 * <pru_bind>:EAFNOSUPPORT Address family not supported
859 * <pru_bind>:EADDRNOTAVAIL Address not available.
860 * <pru_bind>:EINVAL Invalid argument
861 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
862 * <pru_bind>:EACCES Permission denied
863 * <pru_bind>:EADDRINUSE Address in use
864 * <pru_bind>:EAGAIN Resource unavailable, try again
865 * <pru_bind>:EPERM Operation not permitted
866 * <pru_bind>:???
867 * <sf_bind>:???
868 *
869 * Notes: It's not possible to fully enumerate the return codes above,
870 * since socket filter authors and protocol family authors may
871 * not choose to limit their error returns to those listed, even
872 * though this may result in some software operating incorrectly.
873 *
874 * The error codes which are enumerated above are those known to
875 * be returned by the tcp_usr_bind function supplied.
876 */
877 int
878 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
879 {
880 struct proc *p = current_proc();
881 int error = 0;
882
883 if (dolock)
884 socket_lock(so, 1);
885 VERIFY(so->so_usecount > 1);
886
887 so_update_last_owner_locked(so, p);
888 so_update_policy(so);
889
890 #if NECP
891 so_update_necp_policy(so, nam, NULL);
892 #endif /* NECP */
893
894 /*
895 * If this is a bind request on a socket that has been marked
896 * as inactive, reject it now before we go any further.
897 */
898 if (so->so_flags & SOF_DEFUNCT) {
899 error = EINVAL;
900 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
901 __func__, proc_pid(p), proc_best_name(p),
902 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
903 SOCK_DOM(so), SOCK_TYPE(so), error);
904 goto out;
905 }
906
907 /* Socket filter */
908 error = sflt_bind(so, nam);
909
910 if (error == 0)
911 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
912 out:
913 if (dolock)
914 socket_unlock(so, 1);
915
916 if (error == EJUSTRETURN)
917 error = 0;
918
919 return (error);
920 }
921
922 void
923 sodealloc(struct socket *so)
924 {
925 kauth_cred_unref(&so->so_cred);
926
927 /* Remove any filters */
928 sflt_termsock(so);
929
930 #if CONTENT_FILTER
931 cfil_sock_detach(so);
932 #endif /* CONTENT_FILTER */
933
934 /* Delete the state allocated for msg queues on a socket */
935 if (so->so_flags & SOF_ENABLE_MSGS) {
936 FREE(so->so_msg_state, M_TEMP);
937 so->so_msg_state = NULL;
938 }
939 VERIFY(so->so_msg_state == NULL);
940
941 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
942
943 #if CONFIG_MACF_SOCKET
944 mac_socket_label_destroy(so);
945 #endif /* MAC_SOCKET */
946
947 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
948 cached_sock_free(so);
949 } else {
950 FREE_ZONE(so, sizeof (*so), so->so_zone);
951 }
952 }
953
954 /*
955 * Returns: 0 Success
956 * EINVAL
957 * EOPNOTSUPP
958 * <pru_listen>:EINVAL[AF_UNIX]
959 * <pru_listen>:EINVAL[TCP]
960 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
961 * <pru_listen>:EINVAL[TCP] Invalid argument
962 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
963 * <pru_listen>:EACCES[TCP] Permission denied
964 * <pru_listen>:EADDRINUSE[TCP] Address in use
965 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
966 * <pru_listen>:EPERM[TCP] Operation not permitted
967 * <sf_listen>:???
968 *
969 * Notes: Other <pru_listen> returns depend on the protocol family; all
970 * <sf_listen> returns depend on what the filter author causes
971 * their filter to return.
972 */
973 int
974 solisten(struct socket *so, int backlog)
975 {
976 struct proc *p = current_proc();
977 int error = 0;
978
979 socket_lock(so, 1);
980
981 so_update_last_owner_locked(so, p);
982 so_update_policy(so);
983
984 #if NECP
985 so_update_necp_policy(so, NULL, NULL);
986 #endif /* NECP */
987
988 if (so->so_proto == NULL) {
989 error = EINVAL;
990 goto out;
991 }
992 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
993 error = EOPNOTSUPP;
994 goto out;
995 }
996
997 /*
998 * If the listen request is made on a socket that is not fully
999 * disconnected, or on a socket that has been marked as inactive,
1000 * reject the request now.
1001 */
1002 if ((so->so_state &
1003 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1004 (so->so_flags & SOF_DEFUNCT)) {
1005 error = EINVAL;
1006 if (so->so_flags & SOF_DEFUNCT) {
1007 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1008 "(%d)\n", __func__, proc_pid(p),
1009 proc_best_name(p),
1010 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1011 SOCK_DOM(so), SOCK_TYPE(so), error);
1012 }
1013 goto out;
1014 }
1015
1016 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1017 error = EPERM;
1018 goto out;
1019 }
1020
1021 error = sflt_listen(so);
1022 if (error == 0)
1023 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1024
1025 if (error) {
1026 if (error == EJUSTRETURN)
1027 error = 0;
1028 goto out;
1029 }
1030
1031 if (TAILQ_EMPTY(&so->so_comp))
1032 so->so_options |= SO_ACCEPTCONN;
1033 /*
1034 * POSIX: The implementation may have an upper limit on the length of
1035 * the listen queue-either global or per accepting socket. If backlog
1036 * exceeds this limit, the length of the listen queue is set to the
1037 * limit.
1038 *
1039 * If listen() is called with a backlog argument value that is less
1040 * than 0, the function behaves as if it had been called with a backlog
1041 * argument value of 0.
1042 *
1043 * A backlog argument of 0 may allow the socket to accept connections,
1044 * in which case the length of the listen queue may be set to an
1045 * implementation-defined minimum value.
1046 */
1047 if (backlog <= 0 || backlog > somaxconn)
1048 backlog = somaxconn;
1049
1050 so->so_qlimit = backlog;
1051 out:
1052 socket_unlock(so, 1);
1053 return (error);
1054 }
1055
1056 void
1057 sofreelastref(struct socket *so, int dealloc)
1058 {
1059 struct socket *head = so->so_head;
1060
1061 /* Assume socket is locked */
1062
1063 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1064 selthreadclear(&so->so_snd.sb_sel);
1065 selthreadclear(&so->so_rcv.sb_sel);
1066 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1067 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1068 so->so_event = sonullevent;
1069 return;
1070 }
1071 if (head != NULL) {
1072 /*
1073 * Need to lock the listener when the protocol has
1074 * per socket locks
1075 */
1076 if (head->so_proto->pr_getlock != NULL)
1077 socket_lock(head, 1);
1078
1079 if (so->so_state & SS_INCOMP) {
1080 so->so_state &= ~SS_INCOMP;
1081 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1082 head->so_incqlen--;
1083 head->so_qlen--;
1084 so->so_head = NULL;
1085 } else if (so->so_state & SS_COMP) {
1086 /*
1087 * We must not decommission a socket that's
1088 * on the accept(2) queue. If we do, then
1089 * accept(2) may hang after select(2) indicated
1090 * that the listening socket was ready.
1091 */
1092 selthreadclear(&so->so_snd.sb_sel);
1093 selthreadclear(&so->so_rcv.sb_sel);
1094 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1095 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1096 so->so_event = sonullevent;
1097 if (head->so_proto->pr_getlock != NULL)
1098 socket_unlock(head, 1);
1099 return;
1100 } else {
1101 panic("sofree: not queued");
1102 }
1103 if (head->so_proto->pr_getlock != NULL)
1104 socket_unlock(head, 1);
1105 }
1106 sowflush(so);
1107 sorflush(so);
1108
1109 #if FLOW_DIVERT
1110 if (so->so_flags & SOF_FLOW_DIVERT) {
1111 flow_divert_detach(so);
1112 }
1113 #endif /* FLOW_DIVERT */
1114
1115 /* 3932268: disable upcall */
1116 so->so_rcv.sb_flags &= ~SB_UPCALL;
1117 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1118 so->so_event = sonullevent;
1119
1120 if (dealloc)
1121 sodealloc(so);
1122 }
1123
1124 void
1125 soclose_wait_locked(struct socket *so)
1126 {
1127 lck_mtx_t *mutex_held;
1128
1129 if (so->so_proto->pr_getlock != NULL)
1130 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1131 else
1132 mutex_held = so->so_proto->pr_domain->dom_mtx;
1133 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1134
1135 /*
1136 * Double check here and return if there's no outstanding upcall;
1137 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1138 */
1139 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1140 return;
1141 so->so_rcv.sb_flags &= ~SB_UPCALL;
1142 so->so_snd.sb_flags &= ~SB_UPCALL;
1143 so->so_flags |= SOF_CLOSEWAIT;
1144 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1145 "soclose_wait_locked", NULL);
1146 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1147 so->so_flags &= ~SOF_CLOSEWAIT;
1148 }
1149
1150 /*
1151 * Close a socket on last file table reference removal.
1152 * Initiate disconnect if connected.
1153 * Free socket when disconnect complete.
1154 */
1155 int
1156 soclose_locked(struct socket *so)
1157 {
1158 int error = 0;
1159 lck_mtx_t *mutex_held;
1160 struct timespec ts;
1161
1162 if (so->so_usecount == 0) {
1163 panic("soclose: so=%p refcount=0\n", so);
1164 /* NOTREACHED */
1165 }
1166
1167 sflt_notify(so, sock_evt_closing, NULL);
1168
1169 if (so->so_upcallusecount)
1170 soclose_wait_locked(so);
1171
1172 #if CONTENT_FILTER
1173 /*
1174 * We have to wait until the content filters are done
1175 */
1176 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1177 cfil_sock_close_wait(so);
1178 cfil_sock_is_closed(so);
1179 cfil_sock_detach(so);
1180 }
1181 #endif /* CONTENT_FILTER */
1182
1183 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1184 soresume(current_proc(), so, 1);
1185 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1186 }
1187
1188 if ((so->so_options & SO_ACCEPTCONN)) {
1189 struct socket *sp;
1190
1191 /*
1192 * We do not want new connection to be added
1193 * to the connection queues
1194 */
1195 so->so_options &= ~SO_ACCEPTCONN;
1196
1197 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
1198 int socklock = 0;
1199
1200 /*
1201 * Radar 5350314
1202 * skip sockets thrown away by tcpdropdropblreq
1203 * they will get cleanup by the garbage collection.
1204 * otherwise, remove the incomp socket from the queue
1205 * and let soabort trigger the appropriate cleanup.
1206 */
1207 if (sp->so_flags & SOF_OVERFLOW)
1208 continue;
1209
1210 if (so->so_proto->pr_getlock != NULL) {
1211 /*
1212 * Lock ordering for consistency with the
1213 * rest of the stack, we lock the socket
1214 * first and then grab the head.
1215 */
1216 socket_unlock(so, 0);
1217 socket_lock(sp, 1);
1218 socket_lock(so, 0);
1219 socklock = 1;
1220 }
1221
1222 /*
1223 * Radar 27945981
1224 * The extra reference for the list insure the
1225 * validity of the socket pointer when we perform the
1226 * unlock of the head above
1227 */
1228 if (sp->so_state & SS_INCOMP) {
1229 sp->so_state &= ~SS_INCOMP;
1230 sp->so_head = NULL;
1231 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1232 so->so_incqlen--;
1233 so->so_qlen--;
1234
1235 (void) soabort(sp);
1236 }
1237
1238 if (socklock != 0)
1239 socket_unlock(sp, 1);
1240 }
1241
1242 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1243 int socklock = 0;
1244
1245 /* Dequeue from so_comp since sofree() won't do it */
1246 if (so->so_proto->pr_getlock != NULL) {
1247 /*
1248 * Lock ordering for consistency with the
1249 * rest of the stack, we lock the socket
1250 * first and then grab the head.
1251 */
1252 socket_unlock(so, 0);
1253 socket_lock(sp, 1);
1254 socket_lock(so, 0);
1255 socklock = 1;
1256 }
1257
1258 if (sp->so_state & SS_COMP) {
1259 sp->so_state &= ~SS_COMP;
1260 sp->so_head = NULL;
1261 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1262 so->so_qlen--;
1263
1264 (void) soabort(sp);
1265 }
1266
1267 if (socklock)
1268 socket_unlock(sp, 1);
1269 }
1270 }
1271 if (so->so_pcb == NULL) {
1272 /* 3915887: mark the socket as ready for dealloc */
1273 so->so_flags |= SOF_PCBCLEARING;
1274 goto discard;
1275 }
1276 if (so->so_state & SS_ISCONNECTED) {
1277 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1278 error = sodisconnectlocked(so);
1279 if (error)
1280 goto drop;
1281 }
1282 if (so->so_options & SO_LINGER) {
1283 if ((so->so_state & SS_ISDISCONNECTING) &&
1284 (so->so_state & SS_NBIO))
1285 goto drop;
1286 if (so->so_proto->pr_getlock != NULL)
1287 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1288 else
1289 mutex_held = so->so_proto->pr_domain->dom_mtx;
1290 while (so->so_state & SS_ISCONNECTED) {
1291 ts.tv_sec = (so->so_linger/100);
1292 ts.tv_nsec = (so->so_linger % 100) *
1293 NSEC_PER_USEC * 1000 * 10;
1294 error = msleep((caddr_t)&so->so_timeo,
1295 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1296 if (error) {
1297 /*
1298 * It's OK when the time fires,
1299 * don't report an error
1300 */
1301 if (error == EWOULDBLOCK)
1302 error = 0;
1303 break;
1304 }
1305 }
1306 }
1307 }
1308 drop:
1309 if (so->so_usecount == 0) {
1310 panic("soclose: usecount is zero so=%p\n", so);
1311 /* NOTREACHED */
1312 }
1313 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1314 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1315 if (error == 0)
1316 error = error2;
1317 }
1318 if (so->so_usecount <= 0) {
1319 panic("soclose: usecount is zero so=%p\n", so);
1320 /* NOTREACHED */
1321 }
1322 discard:
1323 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1324 (so->so_state & SS_NOFDREF)) {
1325 panic("soclose: NOFDREF");
1326 /* NOTREACHED */
1327 }
1328 so->so_state |= SS_NOFDREF;
1329
1330 if (so->so_flags & SOF_MP_SUBFLOW)
1331 so->so_flags &= ~SOF_MP_SUBFLOW;
1332
1333 if ((so->so_flags & SOF_KNOTE) != 0)
1334 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1335
1336 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1337 evsofree(so);
1338
1339 VERIFY(so->so_usecount > 0);
1340 so->so_usecount--;
1341 sofree(so);
1342 return (error);
1343 }
1344
1345 int
1346 soclose(struct socket *so)
1347 {
1348 int error = 0;
1349 socket_lock(so, 1);
1350
1351 if (so->so_retaincnt == 0) {
1352 error = soclose_locked(so);
1353 } else {
1354 /*
1355 * if the FD is going away, but socket is
1356 * retained in kernel remove its reference
1357 */
1358 so->so_usecount--;
1359 if (so->so_usecount < 2)
1360 panic("soclose: retaincnt non null and so=%p "
1361 "usecount=%d\n", so, so->so_usecount);
1362 }
1363 socket_unlock(so, 1);
1364 return (error);
1365 }
1366
1367 /*
1368 * Must be called at splnet...
1369 */
1370 /* Should already be locked */
1371 int
1372 soabort(struct socket *so)
1373 {
1374 int error;
1375
1376 #ifdef MORE_LOCKING_DEBUG
1377 lck_mtx_t *mutex_held;
1378
1379 if (so->so_proto->pr_getlock != NULL)
1380 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1381 else
1382 mutex_held = so->so_proto->pr_domain->dom_mtx;
1383 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1384 #endif
1385
1386 if ((so->so_flags & SOF_ABORTED) == 0) {
1387 so->so_flags |= SOF_ABORTED;
1388 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1389 if (error) {
1390 sofree(so);
1391 return (error);
1392 }
1393 }
1394 return (0);
1395 }
1396
1397 int
1398 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1399 {
1400 int error;
1401
1402 if (dolock)
1403 socket_lock(so, 1);
1404
1405 so_update_last_owner_locked(so, PROC_NULL);
1406 so_update_policy(so);
1407 #if NECP
1408 so_update_necp_policy(so, NULL, NULL);
1409 #endif /* NECP */
1410
1411 if ((so->so_state & SS_NOFDREF) == 0)
1412 panic("soaccept: !NOFDREF");
1413 so->so_state &= ~SS_NOFDREF;
1414 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1415
1416 if (dolock)
1417 socket_unlock(so, 1);
1418 return (error);
1419 }
1420
/* Locking version of soacceptlock(): always takes the socket lock. */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);
	return (error);
}
1426
1427 int
1428 soacceptfilter(struct socket *so, struct socket *head)
1429 {
1430 struct sockaddr *local = NULL, *remote = NULL;
1431 int error = 0;
1432
1433 /*
1434 * Hold the lock even if this socket has not been made visible
1435 * to the filter(s). For sockets with global locks, this protects
1436 * against the head or peer going away
1437 */
1438 socket_lock(so, 1);
1439 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1440 sogetaddr_locked(so, &local, 0) != 0) {
1441 so->so_state &= ~SS_NOFDREF;
1442 socket_unlock(so, 1);
1443 soclose(so);
1444 /* Out of resources; try it again next time */
1445 error = ECONNABORTED;
1446 goto done;
1447 }
1448
1449 error = sflt_accept(head, so, local, remote);
1450
1451 /*
1452 * If we get EJUSTRETURN from one of the filters, mark this socket
1453 * as inactive and return it anyway. This newly accepted socket
1454 * will be disconnected later before we hand it off to the caller.
1455 */
1456 if (error == EJUSTRETURN) {
1457 error = 0;
1458 (void) sosetdefunct(current_proc(), so,
1459 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1460 }
1461
1462 if (error != 0) {
1463 /*
1464 * This may seem like a duplication to the above error
1465 * handling part when we return ECONNABORTED, except
1466 * the following is done while holding the lock since
1467 * the socket has been exposed to the filter(s) earlier.
1468 */
1469 so->so_state &= ~SS_COMP;
1470 socket_unlock(so, 1);
1471 soclose(so);
1472 /* Propagate socket filter's error code to the caller */
1473 } else {
1474 socket_unlock(so, 1);
1475 }
1476 done:
1477 /* Callee checks for NULL pointer */
1478 sock_freeaddr(remote);
1479 sock_freeaddr(local);
1480 return (error);
1481 }
1482
1483 /*
1484 * Returns: 0 Success
1485 * EOPNOTSUPP Operation not supported on socket
1486 * EISCONN Socket is connected
1487 * <pru_connect>:EADDRNOTAVAIL Address not available.
1488 * <pru_connect>:EINVAL Invalid argument
1489 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1490 * <pru_connect>:EACCES Permission denied
1491 * <pru_connect>:EADDRINUSE Address in use
1492 * <pru_connect>:EAGAIN Resource unavailable, try again
1493 * <pru_connect>:EPERM Operation not permitted
1494 * <sf_connect_out>:??? [anything a filter writer might set]
1495 */
1496 int
1497 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1498 {
1499 int error;
1500 struct proc *p = current_proc();
1501
1502 if (dolock)
1503 socket_lock(so, 1);
1504
1505 so_update_last_owner_locked(so, p);
1506 so_update_policy(so);
1507
1508 #if NECP
1509 so_update_necp_policy(so, NULL, nam);
1510 #endif /* NECP */
1511
1512 /*
1513 * If this is a listening socket or if this is a previously-accepted
1514 * socket that has been marked as inactive, reject the connect request.
1515 */
1516 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1517 error = EOPNOTSUPP;
1518 if (so->so_flags & SOF_DEFUNCT) {
1519 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1520 "(%d)\n", __func__, proc_pid(p),
1521 proc_best_name(p),
1522 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1523 SOCK_DOM(so), SOCK_TYPE(so), error);
1524 }
1525 if (dolock)
1526 socket_unlock(so, 1);
1527 return (error);
1528 }
1529
1530 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1531 if (dolock)
1532 socket_unlock(so, 1);
1533 return (EPERM);
1534 }
1535
1536 /*
1537 * If protocol is connection-based, can only connect once.
1538 * Otherwise, if connected, try to disconnect first.
1539 * This allows user to disconnect by connecting to, e.g.,
1540 * a null address.
1541 */
1542 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1543 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1544 (error = sodisconnectlocked(so)))) {
1545 error = EISCONN;
1546 } else {
1547 /*
1548 * Run connect filter before calling protocol:
1549 * - non-blocking connect returns before completion;
1550 */
1551 error = sflt_connectout(so, nam);
1552 if (error != 0) {
1553 if (error == EJUSTRETURN)
1554 error = 0;
1555 } else {
1556 error = (*so->so_proto->pr_usrreqs->pru_connect)
1557 (so, nam, p);
1558 }
1559 }
1560 if (dolock)
1561 socket_unlock(so, 1);
1562 return (error);
1563 }
1564
/* Locking version of soconnectlock(): always takes the socket lock. */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);
	return (error);
}
1570
1571 /*
1572 * Returns: 0 Success
1573 * <pru_connect2>:EINVAL[AF_UNIX]
1574 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1575 * <pru_connect2>:??? [other protocol families]
1576 *
1577 * Notes: <pru_connect2> is not supported by [TCP].
1578 */
1579 int
1580 soconnect2(struct socket *so1, struct socket *so2)
1581 {
1582 int error;
1583
1584 socket_lock(so1, 1);
1585 if (so2->so_proto->pr_lock)
1586 socket_lock(so2, 1);
1587
1588 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1589
1590 socket_unlock(so1, 1);
1591 if (so2->so_proto->pr_lock)
1592 socket_unlock(so2, 1);
1593 return (error);
1594 }
1595
1596 int
1597 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1598 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1599 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1600 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1601 {
1602 int error;
1603
1604 so_update_last_owner_locked(so, p);
1605 so_update_policy(so);
1606
1607 /*
1608 * If this is a listening socket or if this is a previously-accepted
1609 * socket that has been marked as inactive, reject the connect request.
1610 */
1611 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1612 error = EOPNOTSUPP;
1613 if (so->so_flags & SOF_DEFUNCT) {
1614 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1615 "(%d)\n", __func__, proc_pid(p),
1616 proc_best_name(p),
1617 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1618 SOCK_DOM(so), SOCK_TYPE(so), error);
1619 }
1620 return (error);
1621 }
1622
1623 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1624 return (EPERM);
1625
1626 /*
1627 * If protocol is connection-based, can only connect once
1628 * unless PR_MULTICONN is set. Otherwise, if connected,
1629 * try to disconnect first. This allows user to disconnect
1630 * by connecting to, e.g., a null address.
1631 */
1632 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1633 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1634 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1635 (error = sodisconnectlocked(so)) != 0)) {
1636 error = EISCONN;
1637 } else {
1638 /*
1639 * Run connect filter before calling protocol:
1640 * - non-blocking connect returns before completion;
1641 */
1642 error = sflt_connectxout(so, dst_sl);
1643 if (error != 0) {
1644 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1645 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1646 if (error == EJUSTRETURN)
1647 error = 0;
1648 } else {
1649 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1650 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1651 flags, arg, arglen, auio, bytes_written);
1652 }
1653 }
1654
1655 return (error);
1656 }
1657
1658 int
1659 sodisconnectlocked(struct socket *so)
1660 {
1661 int error;
1662
1663 if ((so->so_state & SS_ISCONNECTED) == 0) {
1664 error = ENOTCONN;
1665 goto bad;
1666 }
1667 if (so->so_state & SS_ISDISCONNECTING) {
1668 error = EALREADY;
1669 goto bad;
1670 }
1671
1672 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1673 if (error == 0)
1674 sflt_notify(so, sock_evt_disconnected, NULL);
1675
1676 bad:
1677 return (error);
1678 }
1679
/* Locking version of sodisconnectlocked() */
int
sodisconnect(struct socket *so)
{
	int rv;

	socket_lock(so, 1);
	rv = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return (rv);
}
1691
1692 int
1693 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1694 {
1695 int error;
1696
1697 /*
1698 * Call the protocol disconnectx handler; let it handle all
1699 * matters related to the connection state of this session.
1700 */
1701 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1702 if (error == 0) {
1703 /*
1704 * The event applies only for the session, not for
1705 * the disconnection of individual subflows.
1706 */
1707 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1708 sflt_notify(so, sock_evt_disconnected, NULL);
1709 }
1710 return (error);
1711 }
1712
1713 int
1714 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1715 {
1716 int error;
1717
1718 socket_lock(so, 1);
1719 error = sodisconnectxlocked(so, aid, cid);
1720 socket_unlock(so, 1);
1721 return (error);
1722 }
1723
1724 int
1725 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1726 {
1727 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1728 }
1729
/* sblock() wait flag for message flags `f': 0 if MSG_DONTWAIT is set, else SBL_WAIT */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)
1731
1732 /*
1733 * sosendcheck will lock the socket buffer if it isn't locked and
1734 * verify that there is space for the data being inserted.
1735 *
1736 * Returns: 0 Success
1737 * EPIPE
1738 * sblock:EWOULDBLOCK
1739 * sblock:EINTR
1740 * sbwait:EBADF
1741 * sbwait:EINTR
1742 * [so_error]:???
1743 */
1744 int
1745 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1746 int32_t clen, int32_t atomic, int flags, int *sblocked,
1747 struct mbuf *control)
1748 {
1749 int error = 0;
1750 int32_t space;
1751 int assumelock = 0;
1752
1753 restart:
1754 if (*sblocked == 0) {
1755 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1756 so->so_send_filt_thread != 0 &&
1757 so->so_send_filt_thread == current_thread()) {
1758 /*
1759 * We're being called recursively from a filter,
1760 * allow this to continue. Radar 4150520.
1761 * Don't set sblocked because we don't want
1762 * to perform an unlock later.
1763 */
1764 assumelock = 1;
1765 } else {
1766 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1767 if (error) {
1768 if (so->so_flags & SOF_DEFUNCT)
1769 goto defunct;
1770 return (error);
1771 }
1772 *sblocked = 1;
1773 }
1774 }
1775
1776 /*
1777 * If a send attempt is made on a socket that has been marked
1778 * as inactive (disconnected), reject the request.
1779 */
1780 if (so->so_flags & SOF_DEFUNCT) {
1781 defunct:
1782 error = EPIPE;
1783 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1784 __func__, proc_selfpid(), proc_best_name(current_proc()),
1785 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1786 SOCK_DOM(so), SOCK_TYPE(so), error);
1787 return (error);
1788 }
1789
1790 if (so->so_state & SS_CANTSENDMORE) {
1791 #if CONTENT_FILTER
1792 /*
1793 * Can re-inject data of half closed connections
1794 */
1795 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1796 so->so_snd.sb_cfil_thread == current_thread() &&
1797 cfil_sock_data_pending(&so->so_snd) != 0)
1798 CFIL_LOG(LOG_INFO,
1799 "so %llx ignore SS_CANTSENDMORE",
1800 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1801 else
1802 #endif /* CONTENT_FILTER */
1803 return (EPIPE);
1804 }
1805 if (so->so_error) {
1806 error = so->so_error;
1807 so->so_error = 0;
1808 return (error);
1809 }
1810
1811 if ((so->so_state & SS_ISCONNECTED) == 0) {
1812 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1813 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1814 (resid != 0 || clen == 0) &&
1815 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1816 #if MPTCP
1817 /*
1818 * MPTCP Fast Join sends data before the
1819 * socket is truly connected.
1820 */
1821 if ((so->so_flags & (SOF_MP_SUBFLOW |
1822 SOF_MPTCP_FASTJOIN)) !=
1823 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1824 #endif /* MPTCP */
1825 return (ENOTCONN);
1826 }
1827 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1828 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1829 ENOTCONN : EDESTADDRREQ);
1830 }
1831 }
1832
1833 if (so->so_flags & SOF_ENABLE_MSGS)
1834 space = msgq_sbspace(so, control);
1835 else
1836 space = sbspace(&so->so_snd);
1837
1838 if (flags & MSG_OOB)
1839 space += 1024;
1840 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1841 clen > so->so_snd.sb_hiwat)
1842 return (EMSGSIZE);
1843
1844 if ((space < resid + clen &&
1845 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1846 space < clen)) ||
1847 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1848 /*
1849 * don't block the connectx call when there's more data
1850 * than can be copied.
1851 */
1852 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1853 if (space == 0) {
1854 return (EWOULDBLOCK);
1855 }
1856 if (space < (int32_t)so->so_snd.sb_lowat) {
1857 return (0);
1858 }
1859 }
1860 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1861 assumelock) {
1862 return (EWOULDBLOCK);
1863 }
1864 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1865 *sblocked = 0;
1866 error = sbwait(&so->so_snd);
1867 if (error) {
1868 if (so->so_flags & SOF_DEFUNCT)
1869 goto defunct;
1870 return (error);
1871 }
1872 goto restart;
1873 }
1874 return (0);
1875 }
1876
1877 /*
1878 * Send on a socket.
1879 * If send must go all at once and message is larger than
1880 * send buffering, then hard error.
1881 * Lock against other senders.
1882 * If must go all at once and not enough room now, then
1883 * inform user that this would block and do nothing.
1884 * Otherwise, if nonblocking, send as much as possible.
1885 * The data to be sent is described by "uio" if nonzero,
1886 * otherwise by the mbuf chain "top" (which must be null
1887 * if uio is not). Data provided in mbuf chain must be small
1888 * enough to send all at once.
1889 *
1890 * Returns nonzero on error, timeout or signal; callers
1891 * must check for short counts if EINTR/ERESTART are returned.
1892 * Data and control buffers are freed on return.
1893 * Experiment:
1894 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1895 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1896 * point at the mbuf chain being constructed and go from there.
1897 *
1898 * Returns: 0 Success
1899 * EOPNOTSUPP
1900 * EINVAL
1901 * ENOBUFS
1902 * uiomove:EFAULT
1903 * sosendcheck:EPIPE
1904 * sosendcheck:EWOULDBLOCK
1905 * sosendcheck:EINTR
1906 * sosendcheck:EBADF
1907 * sosendcheck:EINTR
1908 * sosendcheck:??? [value from so_error]
1909 * <pru_send>:ECONNRESET[TCP]
1910 * <pru_send>:EINVAL[TCP]
1911 * <pru_send>:ENOBUFS[TCP]
1912 * <pru_send>:EADDRINUSE[TCP]
1913 * <pru_send>:EADDRNOTAVAIL[TCP]
1914 * <pru_send>:EAFNOSUPPORT[TCP]
1915 * <pru_send>:EACCES[TCP]
1916 * <pru_send>:EAGAIN[TCP]
1917 * <pru_send>:EPERM[TCP]
1918 * <pru_send>:EMSGSIZE[TCP]
1919 * <pru_send>:EHOSTUNREACH[TCP]
1920 * <pru_send>:ENETUNREACH[TCP]
1921 * <pru_send>:ENETDOWN[TCP]
1922 * <pru_send>:ENOMEM[TCP]
1923 * <pru_send>:ENOBUFS[TCP]
1924 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1925 * <pru_send>:EINVAL[AF_UNIX]
1926 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1927 * <pru_send>:EPIPE[AF_UNIX]
1928 * <pru_send>:ENOTCONN[AF_UNIX]
1929 * <pru_send>:EISCONN[AF_UNIX]
1930 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1931 * <sf_data_out>:??? [whatever a filter author chooses]
1932 *
1933 * Notes: Other <pru_send> returns depend on the protocol family; all
1934 * <sf_data_out> returns depend on what the filter author causes
1935 * their filter to return.
1936 */
1937 int
1938 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1939 struct mbuf *top, struct mbuf *control, int flags)
1940 {
1941 struct mbuf **mp;
1942 struct mbuf *m, *freelist = NULL;
1943 user_ssize_t space, len, resid, orig_resid;
1944 int clen = 0, error, dontroute, mlen, sendflags;
1945 int atomic = sosendallatonce(so) || top;
1946 int sblocked = 0;
1947 struct proc *p = current_proc();
1948 struct mbuf *control_copy = NULL;
1949 uint16_t headroom = 0;
1950 boolean_t en_tracing = FALSE;
1951
1952 if (uio != NULL)
1953 resid = uio_resid(uio);
1954 else
1955 resid = top->m_pkthdr.len;
1956
1957 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1958 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1959
1960 socket_lock(so, 1);
1961
1962 /*
1963 * trace if tracing & network (vs. unix) sockets & and
1964 * non-loopback
1965 */
1966 if (ENTR_SHOULDTRACE &&
1967 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1968 struct inpcb *inp = sotoinpcb(so);
1969 if (inp->inp_last_outifp != NULL &&
1970 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1971 en_tracing = TRUE;
1972 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1973 VM_KERNEL_ADDRPERM(so),
1974 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1975 (int64_t)resid);
1976 orig_resid = resid;
1977 }
1978 }
1979
1980 /*
1981 * Re-injection should not affect process accounting
1982 */
1983 if ((flags & MSG_SKIPCFIL) == 0) {
1984 so_update_last_owner_locked(so, p);
1985 so_update_policy(so);
1986
1987 #if NECP
1988 so_update_necp_policy(so, NULL, addr);
1989 #endif /* NECP */
1990 }
1991
1992 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1993 error = EOPNOTSUPP;
1994 socket_unlock(so, 1);
1995 goto out;
1996 }
1997
1998 /*
1999 * In theory resid should be unsigned.
2000 * However, space must be signed, as it might be less than 0
2001 * if we over-committed, and we must use a signed comparison
2002 * of space and resid. On the other hand, a negative resid
2003 * causes us to loop sending 0-length segments to the protocol.
2004 *
2005 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2006 * But it will be used by sockets doing message delivery.
2007 *
2008 * Note: We limit resid to be a positive int value as we use
2009 * imin() to set bytes_to_copy -- radr://14558484
2010 */
2011 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2012 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2013 error = EINVAL;
2014 socket_unlock(so, 1);
2015 goto out;
2016 }
2017
2018 dontroute = (flags & MSG_DONTROUTE) &&
2019 (so->so_options & SO_DONTROUTE) == 0 &&
2020 (so->so_proto->pr_flags & PR_ATOMIC);
2021 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2022
2023 if (control != NULL)
2024 clen = control->m_len;
2025
2026 if (soreserveheadroom != 0)
2027 headroom = so->so_pktheadroom;
2028
2029 do {
2030 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2031 &sblocked, control);
2032 if (error)
2033 goto release;
2034
2035 mp = &top;
2036 if (so->so_flags & SOF_ENABLE_MSGS)
2037 space = msgq_sbspace(so, control);
2038 else
2039 space = sbspace(&so->so_snd) - clen;
2040 space += ((flags & MSG_OOB) ? 1024 : 0);
2041
2042 do {
2043 if (uio == NULL) {
2044 /*
2045 * Data is prepackaged in "top".
2046 */
2047 resid = 0;
2048 if (flags & MSG_EOR)
2049 top->m_flags |= M_EOR;
2050 } else {
2051 int chainlength;
2052 int bytes_to_copy;
2053 boolean_t jumbocl;
2054 boolean_t bigcl;
2055 int bytes_to_alloc;
2056
2057 bytes_to_copy = imin(resid, space);
2058
2059 bytes_to_alloc = bytes_to_copy;
2060 if (top == NULL)
2061 bytes_to_alloc += headroom;
2062
2063 if (sosendminchain > 0)
2064 chainlength = 0;
2065 else
2066 chainlength = sosendmaxchain;
2067
2068 /*
2069 * Use big 4 KB cluster when the outgoing interface
2070 * does not prefer 2 KB clusters
2071 */
2072 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2073 sosendbigcl_ignore_capab;
2074
2075 /*
2076 * Attempt to use larger than system page-size
2077 * clusters for large writes only if there is
2078 * a jumbo cluster pool and if the socket is
2079 * marked accordingly.
2080 */
2081 jumbocl = sosendjcl && njcl > 0 &&
2082 ((so->so_flags & SOF_MULTIPAGES) ||
2083 sosendjcl_ignore_capab) &&
2084 bigcl;
2085
2086 socket_unlock(so, 0);
2087
2088 do {
2089 int num_needed;
2090 int hdrs_needed = (top == NULL) ? 1 : 0;
2091
2092 /*
2093 * Try to maintain a local cache of mbuf
2094 * clusters needed to complete this
2095 * write. The list is further limited to
2096 * the number that are currently needed
2097 * to fill the socket. This mechanism
2098 * allows a large number of mbufs/
2099 * clusters to be grabbed under a single
2100 * mbuf lock. If we can't get any
2101 * clusters, then fall back to trying
2102 * for mbufs. If we fail early (or
2103 * miscalculate the number needed), make
2104 * sure to release any clusters we
2105 * haven't yet consumed.
2106 */
2107 if (freelist == NULL &&
2108 bytes_to_alloc > MBIGCLBYTES &&
2109 jumbocl) {
2110 num_needed =
2111 bytes_to_alloc / M16KCLBYTES;
2112
2113 if ((bytes_to_alloc -
2114 (num_needed * M16KCLBYTES))
2115 >= MINCLSIZE)
2116 num_needed++;
2117
2118 freelist =
2119 m_getpackets_internal(
2120 (unsigned int *)&num_needed,
2121 hdrs_needed, M_WAIT, 0,
2122 M16KCLBYTES);
2123 /*
2124 * Fall back to 4K cluster size
2125 * if allocation failed
2126 */
2127 }
2128
2129 if (freelist == NULL &&
2130 bytes_to_alloc > MCLBYTES &&
2131 bigcl) {
2132 num_needed =
2133 bytes_to_alloc / MBIGCLBYTES;
2134
2135 if ((bytes_to_alloc -
2136 (num_needed * MBIGCLBYTES)) >=
2137 MINCLSIZE)
2138 num_needed++;
2139
2140 freelist =
2141 m_getpackets_internal(
2142 (unsigned int *)&num_needed,
2143 hdrs_needed, M_WAIT, 0,
2144 MBIGCLBYTES);
2145 /*
2146 * Fall back to cluster size
2147 * if allocation failed
2148 */
2149 }
2150
2151 /*
2152 * Allocate a cluster as we want to
2153 * avoid to split the data in more
2154 * that one segment and using MINCLSIZE
2155 * would lead us to allocate two mbufs
2156 */
2157 if (soreserveheadroom != 0 &&
2158 freelist == NULL &&
2159 ((top == NULL &&
2160 bytes_to_alloc > _MHLEN) ||
2161 bytes_to_alloc > _MLEN)) {
2162 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2163 MCLBYTES;
2164 freelist =
2165 m_getpackets_internal(
2166 (unsigned int *)&num_needed,
2167 hdrs_needed, M_WAIT, 0,
2168 MCLBYTES);
2169 /*
2170 * Fall back to a single mbuf
2171 * if allocation failed
2172 */
2173 } else if (freelist == NULL &&
2174 bytes_to_alloc > MINCLSIZE) {
2175 num_needed =
2176 bytes_to_alloc / MCLBYTES;
2177
2178 if ((bytes_to_alloc -
2179 (num_needed * MCLBYTES)) >=
2180 MINCLSIZE)
2181 num_needed++;
2182
2183 freelist =
2184 m_getpackets_internal(
2185 (unsigned int *)&num_needed,
2186 hdrs_needed, M_WAIT, 0,
2187 MCLBYTES);
2188 /*
2189 * Fall back to a single mbuf
2190 * if allocation failed
2191 */
2192 }
2193 /*
2194 * For datagram protocols, leave
2195 * headroom for protocol headers
2196 * in the first cluster of the chain
2197 */
2198 if (freelist != NULL && atomic &&
2199 top == NULL && headroom > 0) {
2200 freelist->m_data += headroom;
2201 }
2202
2203 /*
2204 * Fall back to regular mbufs without
2205 * reserving the socket headroom
2206 */
2207 if (freelist == NULL) {
2208 if (top == NULL)
2209 MGETHDR(freelist,
2210 M_WAIT, MT_DATA);
2211 else
2212 MGET(freelist,
2213 M_WAIT, MT_DATA);
2214
2215 if (freelist == NULL) {
2216 error = ENOBUFS;
2217 socket_lock(so, 0);
2218 goto release;
2219 }
2220 /*
2221 * For datagram protocols,
2222 * leave room for protocol
2223 * headers in first mbuf.
2224 */
2225 if (atomic && top == NULL &&
2226 bytes_to_copy < MHLEN) {
2227 MH_ALIGN(freelist,
2228 bytes_to_copy);
2229 }
2230 }
2231 m = freelist;
2232 freelist = m->m_next;
2233 m->m_next = NULL;
2234
2235 if ((m->m_flags & M_EXT))
2236 mlen = m->m_ext.ext_size -
2237 m_leadingspace(m);
2238 else if ((m->m_flags & M_PKTHDR))
2239 mlen =
2240 MHLEN - m_leadingspace(m);
2241 else
2242 mlen = MLEN - m_leadingspace(m);
2243 len = imin(mlen, bytes_to_copy);
2244
2245 chainlength += len;
2246
2247 space -= len;
2248
2249 error = uiomove(mtod(m, caddr_t),
2250 len, uio);
2251
2252 resid = uio_resid(uio);
2253
2254 m->m_len = len;
2255 *mp = m;
2256 top->m_pkthdr.len += len;
2257 if (error)
2258 break;
2259 mp = &m->m_next;
2260 if (resid <= 0) {
2261 if (flags & MSG_EOR)
2262 top->m_flags |= M_EOR;
2263 break;
2264 }
2265 bytes_to_copy = min(resid, space);
2266
2267 } while (space > 0 &&
2268 (chainlength < sosendmaxchain || atomic ||
2269 resid < MINCLSIZE));
2270
2271 socket_lock(so, 0);
2272
2273 if (error)
2274 goto release;
2275 }
2276
2277 if (flags & (MSG_HOLD|MSG_SEND)) {
2278 /* Enqueue for later, go away if HOLD */
2279 struct mbuf *mb1;
2280 if (so->so_temp && (flags & MSG_FLUSH)) {
2281 m_freem(so->so_temp);
2282 so->so_temp = NULL;
2283 }
2284 if (so->so_temp)
2285 so->so_tail->m_next = top;
2286 else
2287 so->so_temp = top;
2288 mb1 = top;
2289 while (mb1->m_next)
2290 mb1 = mb1->m_next;
2291 so->so_tail = mb1;
2292 if (flags & MSG_HOLD) {
2293 top = NULL;
2294 goto release;
2295 }
2296 top = so->so_temp;
2297 }
2298 if (dontroute)
2299 so->so_options |= SO_DONTROUTE;
2300
2301 /*
2302 * Compute flags here, for pru_send and NKEs
2303 *
2304 * If the user set MSG_EOF, the protocol
2305 * understands this flag and nothing left to
2306 * send then use PRU_SEND_EOF instead of PRU_SEND.
2307 */
2308 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2309 ((flags & MSG_EOF) &&
2310 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2311 (resid <= 0)) ? PRUS_EOF :
2312 /* If there is more to send set PRUS_MORETOCOME */
2313 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2314
2315 if ((flags & MSG_SKIPCFIL) == 0) {
2316 /*
2317 * Socket filter processing
2318 */
2319 error = sflt_data_out(so, addr, &top,
2320 &control, (sendflags & MSG_OOB) ?
2321 sock_data_filt_flag_oob : 0);
2322 if (error) {
2323 if (error == EJUSTRETURN) {
2324 error = 0;
2325 clen = 0;
2326 control = NULL;
2327 top = NULL;
2328 }
2329 goto release;
2330 }
2331 #if CONTENT_FILTER
2332 /*
2333 * Content filter processing
2334 */
2335 error = cfil_sock_data_out(so, addr, top,
2336 control, (sendflags & MSG_OOB) ?
2337 sock_data_filt_flag_oob : 0);
2338 if (error) {
2339 if (error == EJUSTRETURN) {
2340 error = 0;
2341 clen = 0;
2342 control = NULL;
2343 top = NULL;
2344 }
2345 goto release;
2346 }
2347 #endif /* CONTENT_FILTER */
2348 }
2349 if (so->so_flags & SOF_ENABLE_MSGS) {
2350 /*
2351 * Make a copy of control mbuf,
2352 * so that msg priority can be
2353 * passed to subsequent mbufs.
2354 */
2355 control_copy = m_dup(control, M_NOWAIT);
2356 }
2357 error = (*so->so_proto->pr_usrreqs->pru_send)
2358 (so, sendflags, top, addr, control, p);
2359
2360 if (flags & MSG_SEND)
2361 so->so_temp = NULL;
2362
2363 if (dontroute)
2364 so->so_options &= ~SO_DONTROUTE;
2365
2366 clen = 0;
2367 control = control_copy;
2368 control_copy = NULL;
2369 top = NULL;
2370 mp = &top;
2371 if (error)
2372 goto release;
2373 } while (resid && space > 0);
2374 } while (resid);
2375
2376 release:
2377 if (sblocked)
2378 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2379 else
2380 socket_unlock(so, 1);
2381 out:
2382 if (top != NULL)
2383 m_freem(top);
2384 if (control != NULL)
2385 m_freem(control);
2386 if (freelist != NULL)
2387 m_freem_list(freelist);
2388 if (control_copy != NULL)
2389 m_freem(control_copy);
2390
2391 /*
2392 * One write has been done. This was enough. Get back to "normal"
2393 * behavior.
2394 */
2395 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2396 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2397
2398 if (en_tracing) {
2399 /* resid passed here is the bytes left in uio */
2400 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2401 VM_KERNEL_ADDRPERM(so),
2402 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2403 (int64_t)(orig_resid - resid));
2404 }
2405 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2406 so->so_snd.sb_cc, space, error);
2407
2408 return (error);
2409 }
2410
2411 /*
2412 * Supported only connected sockets (no address) without ancillary data
2413 * (control mbuf) for atomic protocols
2414 */
2415 int
2416 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2417 {
2418 struct mbuf *m, *freelist = NULL;
2419 user_ssize_t len, resid;
2420 int error, dontroute, mlen;
2421 int atomic = sosendallatonce(so);
2422 int sblocked = 0;
2423 struct proc *p = current_proc();
2424 u_int uiofirst = 0;
2425 u_int uiolast = 0;
2426 struct mbuf *top = NULL;
2427 uint16_t headroom = 0;
2428 boolean_t bigcl;
2429
2430 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2431 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2432
2433 if (so->so_type != SOCK_DGRAM) {
2434 error = EINVAL;
2435 goto out;
2436 }
2437 if (atomic == 0) {
2438 error = EINVAL;
2439 goto out;
2440 }
2441 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2442 error = EPROTONOSUPPORT;
2443 goto out;
2444 }
2445 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2446 error = EINVAL;
2447 goto out;
2448 }
2449 resid = uio_array_resid(uioarray, uiocnt);
2450
2451 /*
2452 * In theory resid should be unsigned.
2453 * However, space must be signed, as it might be less than 0
2454 * if we over-committed, and we must use a signed comparison
2455 * of space and resid. On the other hand, a negative resid
2456 * causes us to loop sending 0-length segments to the protocol.
2457 *
2458 * Note: We limit resid to be a positive int value as we use
2459 * imin() to set bytes_to_copy -- radr://14558484
2460 */
2461 if (resid < 0 || resid > INT_MAX) {
2462 error = EINVAL;
2463 goto out;
2464 }
2465
2466 socket_lock(so, 1);
2467 so_update_last_owner_locked(so, p);
2468 so_update_policy(so);
2469
2470 #if NECP
2471 so_update_necp_policy(so, NULL, NULL);
2472 #endif /* NECP */
2473
2474 dontroute = (flags & MSG_DONTROUTE) &&
2475 (so->so_options & SO_DONTROUTE) == 0 &&
2476 (so->so_proto->pr_flags & PR_ATOMIC);
2477 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2478
2479 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2480 &sblocked, NULL);
2481 if (error)
2482 goto release;
2483
2484 /*
2485 * Use big 4 KB clusters when the outgoing interface does not prefer
2486 * 2 KB clusters
2487 */
2488 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2489
2490 if (soreserveheadroom != 0)
2491 headroom = so->so_pktheadroom;
2492
2493 do {
2494 int i;
2495 int num_needed = 0;
2496 int chainlength;
2497 size_t maxpktlen = 0;
2498 int bytes_to_alloc;
2499
2500 if (sosendminchain > 0)
2501 chainlength = 0;
2502 else
2503 chainlength = sosendmaxchain;
2504
2505 socket_unlock(so, 0);
2506
2507 /*
2508 * Find a set of uio that fit in a reasonable number
2509 * of mbuf packets
2510 */
2511 for (i = uiofirst; i < uiocnt; i++) {
2512 struct uio *auio = uioarray[i];
2513
2514 len = uio_resid(auio);
2515
2516 /* Do nothing for empty messages */
2517 if (len == 0)
2518 continue;
2519
2520 num_needed += 1;
2521 uiolast += 1;
2522
2523 if (len > maxpktlen)
2524 maxpktlen = len;
2525
2526 chainlength += len;
2527 if (chainlength > sosendmaxchain)
2528 break;
2529 }
2530 /*
2531 * Nothing left to send
2532 */
2533 if (num_needed == 0) {
2534 socket_lock(so, 0);
2535 break;
2536 }
2537 /*
2538 * Allocate buffer large enough to include headroom space for
2539 * network and link header
2540 *
2541 */
2542 bytes_to_alloc = maxpktlen + headroom;
2543
2544 /*
2545 * Allocate a single contiguous buffer of the smallest available
2546 * size when possible
2547 */
2548 if (bytes_to_alloc > MCLBYTES &&
2549 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2550 freelist = m_getpackets_internal(
2551 (unsigned int *)&num_needed,
2552 num_needed, M_WAIT, 1,
2553 MBIGCLBYTES);
2554 } else if (bytes_to_alloc > _MHLEN &&
2555 bytes_to_alloc <= MCLBYTES) {
2556 freelist = m_getpackets_internal(
2557 (unsigned int *)&num_needed,
2558 num_needed, M_WAIT, 1,
2559 MCLBYTES);
2560 } else {
2561 freelist = m_allocpacket_internal(
2562 (unsigned int *)&num_needed,
2563 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2564 }
2565
2566 if (freelist == NULL) {
2567 socket_lock(so, 0);
2568 error = ENOMEM;
2569 goto release;
2570 }
2571 /*
2572 * Copy each uio of the set into its own mbuf packet
2573 */
2574 for (i = uiofirst, m = freelist;
2575 i < uiolast && m != NULL;
2576 i++) {
2577 int bytes_to_copy;
2578 struct mbuf *n;
2579 struct uio *auio = uioarray[i];
2580
2581 bytes_to_copy = uio_resid(auio);
2582
2583 /* Do nothing for empty messages */
2584 if (bytes_to_copy == 0)
2585 continue;
2586 /*
2587 * Leave headroom for protocol headers
2588 * in the first mbuf of the chain
2589 */
2590 m->m_data += headroom;
2591
2592 for (n = m; n != NULL; n = n->m_next) {
2593 if ((m->m_flags & M_EXT))
2594 mlen = m->m_ext.ext_size -
2595 m_leadingspace(m);
2596 else if ((m->m_flags & M_PKTHDR))
2597 mlen =
2598 MHLEN - m_leadingspace(m);
2599 else
2600 mlen = MLEN - m_leadingspace(m);
2601 len = imin(mlen, bytes_to_copy);
2602
2603 /*
2604 * Note: uiomove() decrements the iovec
2605 * length
2606 */
2607 error = uiomove(mtod(n, caddr_t),
2608 len, auio);
2609 if (error != 0)
2610 break;
2611 n->m_len = len;
2612 m->m_pkthdr.len += len;
2613
2614 VERIFY(m->m_pkthdr.len <= maxpktlen);
2615
2616 bytes_to_copy -= len;
2617 resid -= len;
2618 }
2619 if (m->m_pkthdr.len == 0) {
2620 printf(
2621 "%s:%d so %llx pkt %llx type %u len null\n",
2622 __func__, __LINE__,
2623 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2624 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2625 m->m_type);
2626 }
2627 if (error != 0)
2628 break;
2629 m = m->m_nextpkt;
2630 }
2631
2632 socket_lock(so, 0);
2633
2634 if (error)
2635 goto release;
2636 top = freelist;
2637 freelist = NULL;
2638
2639 if (dontroute)
2640 so->so_options |= SO_DONTROUTE;
2641
2642 if ((flags & MSG_SKIPCFIL) == 0) {
2643 struct mbuf **prevnextp = NULL;
2644
2645 for (i = uiofirst, m = top;
2646 i < uiolast && m != NULL;
2647 i++) {
2648 struct mbuf *nextpkt = m->m_nextpkt;
2649
2650 /*
2651 * Socket filter processing
2652 */
2653 error = sflt_data_out(so, NULL, &m,
2654 NULL, 0);
2655 if (error != 0 && error != EJUSTRETURN)
2656 goto release;
2657
2658 #if CONTENT_FILTER
2659 if (error == 0) {
2660 /*
2661 * Content filter processing
2662 */
2663 error = cfil_sock_data_out(so, NULL, m,
2664 NULL, 0);
2665 if (error != 0 && error != EJUSTRETURN)
2666 goto release;
2667 }
2668 #endif /* CONTENT_FILTER */
2669 /*
2670 * Remove packet from the list when
2671 * swallowed by a filter
2672 */
2673 if (error == EJUSTRETURN) {
2674 error = 0;
2675 if (prevnextp != NULL)
2676 *prevnextp = nextpkt;
2677 else
2678 top = nextpkt;
2679 }
2680
2681 m = nextpkt;
2682 if (m != NULL)
2683 prevnextp = &m->m_nextpkt;
2684 }
2685 }
2686 if (top != NULL)
2687 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2688 (so, 0, top, NULL, NULL, p);
2689
2690 if (dontroute)
2691 so->so_options &= ~SO_DONTROUTE;
2692
2693 top = NULL;
2694 uiofirst = uiolast;
2695 } while (resid > 0 && error == 0);
2696 release:
2697 if (sblocked)
2698 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2699 else
2700 socket_unlock(so, 1);
2701 out:
2702 if (top != NULL)
2703 m_freem(top);
2704 if (freelist != NULL)
2705 m_freem_list(freelist);
2706
2707 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2708 so->so_snd.sb_cc, 0, error);
2709
2710 return (error);
2711 }
2712
2713 /*
2714 * May return ERESTART when packet is dropped by MAC policy check
2715 */
2716 static int
2717 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2718 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2719 {
2720 int error = 0;
2721 struct mbuf *m = *mp;
2722 struct mbuf *nextrecord = *nextrecordp;
2723
2724 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2725 #if CONFIG_MACF_SOCKET_SUBSET
2726 /*
2727 * Call the MAC framework for policy checking if we're in
2728 * the user process context and the socket isn't connected.
2729 */
2730 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2731 struct mbuf *m0 = m;
2732 /*
2733 * Dequeue this record (temporarily) from the receive
2734 * list since we're about to drop the socket's lock
2735 * where a new record may arrive and be appended to
2736 * the list. Upon MAC policy failure, the record
2737 * will be freed. Otherwise, we'll add it back to
2738 * the head of the list. We cannot rely on SB_LOCK
2739 * because append operation uses the socket's lock.
2740 */
2741 do {
2742 m->m_nextpkt = NULL;
2743 sbfree(&so->so_rcv, m);
2744 m = m->m_next;
2745 } while (m != NULL);
2746 m = m0;
2747 so->so_rcv.sb_mb = nextrecord;
2748 SB_EMPTY_FIXUP(&so->so_rcv);
2749 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2750 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2751 socket_unlock(so, 0);
2752
2753 if (mac_socket_check_received(proc_ucred(p), so,
2754 mtod(m, struct sockaddr *)) != 0) {
2755 /*
2756 * MAC policy failure; free this record and
2757 * process the next record (or block until
2758 * one is available). We have adjusted sb_cc
2759 * and sb_mbcnt above so there is no need to
2760 * call sbfree() again.
2761 */
2762 m_freem(m);
2763 /*
2764 * Clear SB_LOCK but don't unlock the socket.
2765 * Process the next record or wait for one.
2766 */
2767 socket_lock(so, 0);
2768 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2769 error = ERESTART;
2770 goto done;
2771 }
2772 socket_lock(so, 0);
2773 /*
2774 * If the socket has been defunct'd, drop it.
2775 */
2776 if (so->so_flags & SOF_DEFUNCT) {
2777 m_freem(m);
2778 error = ENOTCONN;
2779 goto done;
2780 }
2781 /*
2782 * Re-adjust the socket receive list and re-enqueue
2783 * the record in front of any packets which may have
2784 * been appended while we dropped the lock.
2785 */
2786 for (m = m0; m->m_next != NULL; m = m->m_next)
2787 sballoc(&so->so_rcv, m);
2788 sballoc(&so->so_rcv, m);
2789 if (so->so_rcv.sb_mb == NULL) {
2790 so->so_rcv.sb_lastrecord = m0;
2791 so->so_rcv.sb_mbtail = m;
2792 }
2793 m = m0;
2794 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2795 so->so_rcv.sb_mb = m;
2796 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2797 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2798 }
2799 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2800 if (psa != NULL) {
2801 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2802 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2803 error = EWOULDBLOCK;
2804 goto done;
2805 }
2806 }
2807 if (flags & MSG_PEEK) {
2808 m = m->m_next;
2809 } else {
2810 sbfree(&so->so_rcv, m);
2811 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2812 panic("%s: about to create invalid socketbuf",
2813 __func__);
2814 /* NOTREACHED */
2815 }
2816 MFREE(m, so->so_rcv.sb_mb);
2817 m = so->so_rcv.sb_mb;
2818 if (m != NULL) {
2819 m->m_nextpkt = nextrecord;
2820 } else {
2821 so->so_rcv.sb_mb = nextrecord;
2822 SB_EMPTY_FIXUP(&so->so_rcv);
2823 }
2824 }
2825 done:
2826 *mp = m;
2827 *nextrecordp = nextrecord;
2828
2829 return (error);
2830 }
2831
2832 /*
2833 * Process one or more MT_CONTROL mbufs present before any data mbufs
2834 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2835 * just copy the data; if !MSG_PEEK, we call into the protocol to
2836 * perform externalization.
2837 */
2838 static int
2839 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2840 struct mbuf **mp, struct mbuf **nextrecordp)
2841 {
2842 int error = 0;
2843 struct mbuf *cm = NULL, *cmn;
2844 struct mbuf **cme = &cm;
2845 struct sockbuf *sb_rcv = &so->so_rcv;
2846 struct mbuf **msgpcm = NULL;
2847 struct mbuf *m = *mp;
2848 struct mbuf *nextrecord = *nextrecordp;
2849 struct protosw *pr = so->so_proto;
2850
2851 /*
2852 * Externalizing the control messages would require us to
2853 * drop the socket's lock below. Once we re-acquire the
2854 * lock, the mbuf chain might change. In order to preserve
2855 * consistency, we unlink all control messages from the
2856 * first mbuf chain in one shot and link them separately
2857 * onto a different chain.
2858 */
2859 do {
2860 if (flags & MSG_PEEK) {
2861 if (controlp != NULL) {
2862 if (*controlp == NULL) {
2863 msgpcm = controlp;
2864 }
2865 *controlp = m_copy(m, 0, m->m_len);
2866
2867 /*
2868 * If we failed to allocate an mbuf,
2869 * release any previously allocated
2870 * mbufs for control data. Return
2871 * an error. Keep the mbufs in the
2872 * socket as this is using
2873 * MSG_PEEK flag.
2874 */
2875 if (*controlp == NULL) {
2876 m_freem(*msgpcm);
2877 error = ENOBUFS;
2878 goto done;
2879 }
2880 controlp = &(*controlp)->m_next;
2881 }
2882 m = m->m_next;
2883 } else {
2884 m->m_nextpkt = NULL;
2885 sbfree(sb_rcv, m);
2886 sb_rcv->sb_mb = m->m_next;
2887 m->m_next = NULL;
2888 *cme = m;
2889 cme = &(*cme)->m_next;
2890 m = sb_rcv->sb_mb;
2891 }
2892 } while (m != NULL && m->m_type == MT_CONTROL);
2893
2894 if (!(flags & MSG_PEEK)) {
2895 if (sb_rcv->sb_mb != NULL) {
2896 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2897 } else {
2898 sb_rcv->sb_mb = nextrecord;
2899 SB_EMPTY_FIXUP(sb_rcv);
2900 }
2901 if (nextrecord == NULL)
2902 sb_rcv->sb_lastrecord = m;
2903 }
2904
2905 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2906 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2907
2908 while (cm != NULL) {
2909 int cmsg_type;
2910
2911 cmn = cm->m_next;
2912 cm->m_next = NULL;
2913 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2914
2915 /*
2916 * Call the protocol to externalize SCM_RIGHTS message
2917 * and return the modified message to the caller upon
2918 * success. Otherwise, all other control messages are
2919 * returned unmodified to the caller. Note that we
2920 * only get into this loop if MSG_PEEK is not set.
2921 */
2922 if (pr->pr_domain->dom_externalize != NULL &&
2923 cmsg_type == SCM_RIGHTS) {
2924 /*
2925 * Release socket lock: see 3903171. This
2926 * would also allow more records to be appended
2927 * to the socket buffer. We still have SB_LOCK
2928 * set on it, so we can be sure that the head
2929 * of the mbuf chain won't change.
2930 */
2931 socket_unlock(so, 0);
2932 error = (*pr->pr_domain->dom_externalize)(cm);
2933 socket_lock(so, 0);
2934 } else {
2935 error = 0;
2936 }
2937
2938 if (controlp != NULL && error == 0) {
2939 *controlp = cm;
2940 controlp = &(*controlp)->m_next;
2941 } else {
2942 (void) m_free(cm);
2943 }
2944 cm = cmn;
2945 }
2946 /*
2947 * Update the value of nextrecord in case we received new
2948 * records when the socket was unlocked above for
2949 * externalizing SCM_RIGHTS.
2950 */
2951 if (m != NULL)
2952 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2953 else
2954 nextrecord = sb_rcv->sb_mb;
2955
2956 done:
2957 *mp = m;
2958 *nextrecordp = nextrecord;
2959
2960 return (error);
2961 }
2962
2963 /*
2964 * Implement receive operations on a socket.
2965 * We depend on the way that records are added to the sockbuf
2966 * by sbappend*. In particular, each record (mbufs linked through m_next)
2967 * must begin with an address if the protocol so specifies,
2968 * followed by an optional mbuf or mbufs containing ancillary data,
2969 * and then zero or more mbufs of data.
2970 * In order to avoid blocking network interrupts for the entire time here,
2971 * we splx() while doing the actual copy to user space.
2972 * Although the sockbuf is locked, new data may still be appended,
2973 * and thus we must maintain consistency of the sockbuf during that time.
2974 *
2975 * The caller may receive the data as a single mbuf chain by supplying
2976 * an mbuf **mp0 for use in returning the chain. The uio is then used
2977 * only for the count in uio_resid.
2978 *
2979 * Returns: 0 Success
2980 * ENOBUFS
2981 * ENOTCONN
2982 * EWOULDBLOCK
2983 * uiomove:EFAULT
2984 * sblock:EWOULDBLOCK
2985 * sblock:EINTR
2986 * sbwait:EBADF
2987 * sbwait:EINTR
2988 * sodelayed_copy:EFAULT
2989 * <pru_rcvoob>:EINVAL[TCP]
2990 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2991 * <pru_rcvoob>:???
2992 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2993 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2994 * <pr_domain->dom_externalize>:???
2995 *
2996 * Notes: Additional return values from calls through <pru_rcvoob> and
2997 * <pr_domain->dom_externalize> depend on protocols other than
2998 * TCP or AF_UNIX, which are documented above.
2999 */
3000 int
3001 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3002 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3003 {
3004 struct mbuf *m, **mp, *ml = NULL;
3005 struct mbuf *nextrecord, *free_list;
3006 int flags, error, offset;
3007 user_ssize_t len;
3008 struct protosw *pr = so->so_proto;
3009 int moff, type = 0;
3010 user_ssize_t orig_resid = uio_resid(uio);
3011 user_ssize_t delayed_copy_len;
3012 int can_delay;
3013 int need_event;
3014 struct proc *p = current_proc();
3015 boolean_t en_tracing = FALSE;
3016
3017 /*
3018 * Sanity check on the length passed by caller as we are making 'int'
3019 * comparisons
3020 */
3021 if (orig_resid < 0 || orig_resid > INT_MAX)
3022 return (EINVAL);
3023
3024 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3025 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3026 so->so_rcv.sb_hiwat);
3027
3028 socket_lock(so, 1);
3029 so_update_last_owner_locked(so, p);
3030 so_update_policy(so);
3031
3032 #ifdef MORE_LOCKING_DEBUG
3033 if (so->so_usecount == 1) {
3034 panic("%s: so=%x no other reference on socket\n", __func__, so);
3035 /* NOTREACHED */
3036 }
3037 #endif
3038 mp = mp0;
3039 if (psa != NULL)
3040 *psa = NULL;
3041 if (controlp != NULL)
3042 *controlp = NULL;
3043 if (flagsp != NULL)
3044 flags = *flagsp &~ MSG_EOR;
3045 else
3046 flags = 0;
3047
3048 /*
3049 * If a recv attempt is made on a previously-accepted socket
3050 * that has been marked as inactive (disconnected), reject
3051 * the request.
3052 */
3053 if (so->so_flags & SOF_DEFUNCT) {
3054 struct sockbuf *sb = &so->so_rcv;
3055
3056 error = ENOTCONN;
3057 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3058 __func__, proc_pid(p), proc_best_name(p),
3059 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3060 SOCK_DOM(so), SOCK_TYPE(so), error);
3061 /*
3062 * This socket should have been disconnected and flushed
3063 * prior to being returned from sodefunct(); there should
3064 * be no data on its receive list, so panic otherwise.
3065 */
3066 if (so->so_state & SS_DEFUNCT)
3067 sb_empty_assert(sb, __func__);
3068 socket_unlock(so, 1);
3069 return (error);
3070 }
3071
3072 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3073 pr->pr_usrreqs->pru_preconnect) {
3074 /*
3075 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3076 * call write() right after this. *If* the app calls a read,
3077 * we do not want to block this read indefinitely. Thus,
3078 * we trigger a connect so that the session gets initiated.
3079 */
3080 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3081
3082 if (error) {
3083 socket_unlock(so, 1);
3084 return (error);
3085 }
3086 }
3087
3088 if (ENTR_SHOULDTRACE &&
3089 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3090 /*
3091 * enable energy tracing for inet sockets that go over
3092 * non-loopback interfaces only.
3093 */
3094 struct inpcb *inp = sotoinpcb(so);
3095 if (inp->inp_last_outifp != NULL &&
3096 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3097 en_tracing = TRUE;
3098 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3099 VM_KERNEL_ADDRPERM(so),
3100 ((so->so_state & SS_NBIO) ?
3101 kEnTrFlagNonBlocking : 0),
3102 (int64_t)orig_resid);
3103 }
3104 }
3105
3106 /*
3107 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3108 * regardless of the flags argument. Here is the case where
3109 * out-of-band data is not inline.
3110 */
3111 if ((flags & MSG_OOB) ||
3112 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3113 (so->so_options & SO_OOBINLINE) == 0 &&
3114 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3115 m = m_get(M_WAIT, MT_DATA);
3116 if (m == NULL) {
3117 socket_unlock(so, 1);
3118 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3119 ENOBUFS, 0, 0, 0, 0);
3120 return (ENOBUFS);
3121 }
3122 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3123 if (error)
3124 goto bad;
3125 socket_unlock(so, 0);
3126 do {
3127 error = uiomove(mtod(m, caddr_t),
3128 imin(uio_resid(uio), m->m_len), uio);
3129 m = m_free(m);
3130 } while (uio_resid(uio) && error == 0 && m != NULL);
3131 socket_lock(so, 0);
3132 bad:
3133 if (m != NULL)
3134 m_freem(m);
3135
3136 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3137 if (error == EWOULDBLOCK || error == EINVAL) {
3138 /*
3139 * Let's try to get normal data:
3140 * EWOULDBLOCK: out-of-band data not
3141 * received yet. EINVAL: out-of-band data
3142 * already read.
3143 */
3144 error = 0;
3145 goto nooob;
3146 } else if (error == 0 && flagsp != NULL) {
3147 *flagsp |= MSG_OOB;
3148 }
3149 }
3150 socket_unlock(so, 1);
3151 if (en_tracing) {
3152 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3153 VM_KERNEL_ADDRPERM(so), 0,
3154 (int64_t)(orig_resid - uio_resid(uio)));
3155 }
3156 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3157 0, 0, 0, 0);
3158
3159 return (error);
3160 }
3161 nooob:
3162 if (mp != NULL)
3163 *mp = NULL;
3164
3165 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3166 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3167 }
3168
3169 free_list = NULL;
3170 delayed_copy_len = 0;
3171 restart:
3172 #ifdef MORE_LOCKING_DEBUG
3173 if (so->so_usecount <= 1)
3174 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3175 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3176 #endif
3177 /*
3178 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3179 * and if so just return to the caller. This could happen when
3180 * soreceive() is called by a socket upcall function during the
3181 * time the socket is freed. The socket buffer would have been
3182 * locked across the upcall, therefore we cannot put this thread
3183 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3184 * we may livelock), because the lock on the socket buffer will
3185 * only be released when the upcall routine returns to its caller.
3186 * Because the socket has been officially closed, there can be
3187 * no further read on it.
3188 *
3189 * A multipath subflow socket would have its SS_NOFDREF set by
3190 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3191 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3192 */
3193 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3194 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3195 socket_unlock(so, 1);
3196 return (0);
3197 }
3198
3199 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3200 if (error) {
3201 socket_unlock(so, 1);
3202 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3203 0, 0, 0, 0);
3204 if (en_tracing) {
3205 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3206 VM_KERNEL_ADDRPERM(so), 0,
3207 (int64_t)(orig_resid - uio_resid(uio)));
3208 }
3209 return (error);
3210 }
3211
3212 m = so->so_rcv.sb_mb;
3213 /*
3214 * If we have less data than requested, block awaiting more
3215 * (subject to any timeout) if:
3216 * 1. the current count is less than the low water mark, or
3217 * 2. MSG_WAITALL is set, and it is possible to do the entire
3218 * receive operation at once if we block (resid <= hiwat).
3219 * 3. MSG_DONTWAIT is not set
3220 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3221 * we have to do the receive in sections, and thus risk returning
3222 * a short count if a timeout or signal occurs after we start.
3223 */
3224 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3225 so->so_rcv.sb_cc < uio_resid(uio)) &&
3226 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3227 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3228 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3229 /*
3230 * Panic if we notice inconsistencies in the socket's
3231 * receive list; both sb_mb and sb_cc should correctly
3232 * reflect the contents of the list, otherwise we may
3233 * end up with false positives during select() or poll()
3234 * which could put the application in a bad state.
3235 */
3236 SB_MB_CHECK(&so->so_rcv);
3237
3238 if (so->so_error) {
3239 if (m != NULL)
3240 goto dontblock;
3241 error = so->so_error;
3242 if ((flags & MSG_PEEK) == 0)
3243 so->so_error = 0;
3244 goto release;
3245 }
3246 if (so->so_state & SS_CANTRCVMORE) {
3247 #if CONTENT_FILTER
3248 /*
3249 * Deal with half closed connections
3250 */
3251 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3252 cfil_sock_data_pending(&so->so_rcv) != 0)
3253 CFIL_LOG(LOG_INFO,
3254 "so %llx ignore SS_CANTRCVMORE",
3255 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3256 else
3257 #endif /* CONTENT_FILTER */
3258 if (m != NULL)
3259 goto dontblock;
3260 else
3261 goto release;
3262 }
3263 for (; m != NULL; m = m->m_next)
3264 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3265 m = so->so_rcv.sb_mb;
3266 goto dontblock;
3267 }
3268 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3269 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3270 error = ENOTCONN;
3271 goto release;
3272 }
3273 if (uio_resid(uio) == 0)
3274 goto release;
3275
3276 if ((so->so_state & SS_NBIO) ||
3277 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3278 error = EWOULDBLOCK;
3279 goto release;
3280 }
3281 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3282 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3283 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3284 #if EVEN_MORE_LOCKING_DEBUG
3285 if (socket_debug)
3286 printf("Waiting for socket data\n");
3287 #endif
3288
3289 error = sbwait(&so->so_rcv);
3290 #if EVEN_MORE_LOCKING_DEBUG
3291 if (socket_debug)
3292 printf("SORECEIVE - sbwait returned %d\n", error);
3293 #endif
3294 if (so->so_usecount < 1) {
3295 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3296 __func__, so, so->so_usecount);
3297 /* NOTREACHED */
3298 }
3299 if (error) {
3300 socket_unlock(so, 1);
3301 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3302 0, 0, 0, 0);
3303 if (en_tracing) {
3304 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3305 VM_KERNEL_ADDRPERM(so), 0,
3306 (int64_t)(orig_resid - uio_resid(uio)));
3307 }
3308 return (error);
3309 }
3310 goto restart;
3311 }
3312 dontblock:
3313 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3314 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3315 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3316 nextrecord = m->m_nextpkt;
3317
3318 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3319 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3320 mp0 == NULL);
3321 if (error == ERESTART)
3322 goto restart;
3323 else if (error != 0)
3324 goto release;
3325 orig_resid = 0;
3326 }
3327
3328 /*
3329 * Process one or more MT_CONTROL mbufs present before any data mbufs
3330 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3331 * just copy the data; if !MSG_PEEK, we call into the protocol to
3332 * perform externalization.
3333 */
3334 if (m != NULL && m->m_type == MT_CONTROL) {
3335 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3336 if (error != 0)
3337 goto release;
3338 orig_resid = 0;
3339 }
3340
3341 /*
3342 * If the socket is a TCP socket with message delivery
3343 * enabled, then create a control msg to deliver the
3344 * relative TCP sequence number for this data. Waiting
3345 * until this point will protect against failures to
3346 * allocate an mbuf for control msgs.
3347 */
3348 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3349 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3350 struct mbuf *seq_cm;
3351
3352 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3353 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3354 if (seq_cm == NULL) {
3355 /* unable to allocate a control mbuf */
3356 error = ENOBUFS;
3357 goto release;
3358 }
3359 *controlp = seq_cm;
3360 controlp = &seq_cm->m_next;
3361 }
3362
3363 if (m != NULL) {
3364 if (!(flags & MSG_PEEK)) {
3365 /*
3366 * We get here because m points to an mbuf following
3367 * any MT_SONAME or MT_CONTROL mbufs which have been
3368 * processed above. In any case, m should be pointing
3369 * to the head of the mbuf chain, and the nextrecord
3370 * should be either NULL or equal to m->m_nextpkt.
3371 * See comments above about SB_LOCK.
3372 */
3373 if (m != so->so_rcv.sb_mb ||
3374 m->m_nextpkt != nextrecord) {
3375 panic("%s: post-control !sync so=%p m=%p "
3376 "nextrecord=%p\n", __func__, so, m,
3377 nextrecord);
3378 /* NOTREACHED */
3379 }
3380 if (nextrecord == NULL)
3381 so->so_rcv.sb_lastrecord = m;
3382 }
3383 type = m->m_type;
3384 if (type == MT_OOBDATA)
3385 flags |= MSG_OOB;
3386 } else {
3387 if (!(flags & MSG_PEEK)) {
3388 SB_EMPTY_FIXUP(&so->so_rcv);
3389 }
3390 }
3391 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3392 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3393
3394 moff = 0;
3395 offset = 0;
3396
3397 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3398 can_delay = 1;
3399 else
3400 can_delay = 0;
3401
3402 need_event = 0;
3403
3404 while (m != NULL &&
3405 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3406 if (m->m_type == MT_OOBDATA) {
3407 if (type != MT_OOBDATA)
3408 break;
3409 } else if (type == MT_OOBDATA) {
3410 break;
3411 }
3412 /*
3413 * Make sure to always set MSG_OOB event when getting
3414 * out of band data inline.
3415 */
3416 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3417 (so->so_options & SO_OOBINLINE) != 0 &&
3418 (so->so_state & SS_RCVATMARK) != 0) {
3419 flags |= MSG_OOB;
3420 }
3421 so->so_state &= ~SS_RCVATMARK;
3422 len = uio_resid(uio) - delayed_copy_len;
3423 if (so->so_oobmark && len > so->so_oobmark - offset)
3424 len = so->so_oobmark - offset;
3425 if (len > m->m_len - moff)
3426 len = m->m_len - moff;
3427 /*
3428 * If mp is set, just pass back the mbufs.
3429 * Otherwise copy them out via the uio, then free.
3430 * Sockbuf must be consistent here (points to current mbuf,
3431 * it points to next record) when we drop priority;
3432 * we must note any additions to the sockbuf when we
3433 * block interrupts again.
3434 */
3435 if (mp == NULL) {
3436 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3437 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3438 if (can_delay && len == m->m_len) {
3439 /*
3440 * only delay the copy if we're consuming the
3441 * mbuf and we're NOT in MSG_PEEK mode
3442 * and we have enough data to make it worthwhile
3443 * to drop and retake the lock... can_delay
3444 * reflects the state of the 2 latter
3445 * constraints moff should always be zero
3446 * in these cases
3447 */
3448 delayed_copy_len += len;
3449 } else {
3450 if (delayed_copy_len) {
3451 error = sodelayed_copy(so, uio,
3452 &free_list, &delayed_copy_len);
3453
3454 if (error) {
3455 goto release;
3456 }
3457 /*
3458 * can only get here if MSG_PEEK is not
3459 * set therefore, m should point at the
3460 * head of the rcv queue; if it doesn't,
3461 * it means something drastically
3462 * changed while we were out from behind
3463 * the lock in sodelayed_copy. perhaps
3464 * a RST on the stream. in any event,
3465 * the stream has been interrupted. it's
3466 * probably best just to return whatever
3467 * data we've moved and let the caller
3468 * sort it out...
3469 */
3470 if (m != so->so_rcv.sb_mb) {
3471 break;
3472 }
3473 }
3474 socket_unlock(so, 0);
3475 error = uiomove(mtod(m, caddr_t) + moff,
3476 (int)len, uio);
3477 socket_lock(so, 0);
3478
3479 if (error)
3480 goto release;
3481 }
3482 } else {
3483 uio_setresid(uio, (uio_resid(uio) - len));
3484 }
3485 if (len == m->m_len - moff) {
3486 if (m->m_flags & M_EOR)
3487 flags |= MSG_EOR;
3488 if (flags & MSG_PEEK) {
3489 m = m->m_next;
3490 moff = 0;
3491 } else {
3492 nextrecord = m->m_nextpkt;
3493 sbfree(&so->so_rcv, m);
3494 m->m_nextpkt = NULL;
3495
3496 /*
3497 * If this packet is an unordered packet
3498 * (indicated by M_UNORDERED_DATA flag), remove
3499 * the additional bytes added to the
3500 * receive socket buffer size.
3501 */
3502 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3503 m->m_len &&
3504 (m->m_flags & M_UNORDERED_DATA) &&
3505 sbreserve(&so->so_rcv,
3506 so->so_rcv.sb_hiwat - m->m_len)) {
3507 if (so->so_msg_state->msg_uno_bytes >
3508 m->m_len) {
3509 so->so_msg_state->
3510 msg_uno_bytes -= m->m_len;
3511 } else {
3512 so->so_msg_state->
3513 msg_uno_bytes = 0;
3514 }
3515 m->m_flags &= ~M_UNORDERED_DATA;
3516 }
3517
3518 if (mp != NULL) {
3519 *mp = m;
3520 mp = &m->m_next;
3521 so->so_rcv.sb_mb = m = m->m_next;
3522 *mp = NULL;
3523 } else {
3524 if (free_list == NULL)
3525 free_list = m;
3526 else
3527 ml->m_next = m;
3528 ml = m;
3529 so->so_rcv.sb_mb = m = m->m_next;
3530 ml->m_next = NULL;
3531 }
3532 if (m != NULL) {
3533 m->m_nextpkt = nextrecord;
3534 if (nextrecord == NULL)
3535 so->so_rcv.sb_lastrecord = m;
3536 } else {
3537 so->so_rcv.sb_mb = nextrecord;
3538 SB_EMPTY_FIXUP(&so->so_rcv);
3539 }
3540 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3541 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3542 }
3543 } else {
3544 if (flags & MSG_PEEK) {
3545 moff += len;
3546 } else {
3547 if (mp != NULL) {
3548 int copy_flag;
3549
3550 if (flags & MSG_DONTWAIT)
3551 copy_flag = M_DONTWAIT;
3552 else
3553 copy_flag = M_WAIT;
3554 *mp = m_copym(m, 0, len, copy_flag);
3555 /*
3556 * Failed to allocate an mbuf?
3557 * Adjust uio_resid back, it was
3558 * adjusted down by len bytes which
3559 * we didn't copy over.
3560 */
3561 if (*mp == NULL) {
3562 uio_setresid(uio,
3563 (uio_resid(uio) + len));
3564 break;
3565 }
3566 }
3567 m->m_data += len;
3568 m->m_len -= len;
3569 so->so_rcv.sb_cc -= len;
3570 }
3571 }
3572 if (so->so_oobmark) {
3573 if ((flags & MSG_PEEK) == 0) {
3574 so->so_oobmark -= len;
3575 if (so->so_oobmark == 0) {
3576 so->so_state |= SS_RCVATMARK;
3577 /*
3578 * delay posting the actual event until
3579 * after any delayed copy processing
3580 * has finished
3581 */
3582 need_event = 1;
3583 break;
3584 }
3585 } else {
3586 offset += len;
3587 if (offset == so->so_oobmark)
3588 break;
3589 }
3590 }
3591 if (flags & MSG_EOR)
3592 break;
3593 /*
3594 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3595 * (for non-atomic socket), we must not quit until
3596 * "uio->uio_resid == 0" or an error termination.
3597 * If a signal/timeout occurs, return with a short
3598 * count but without error. Keep sockbuf locked
3599 * against other readers.
3600 */
3601 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3602 (uio_resid(uio) - delayed_copy_len) > 0 &&
3603 !sosendallatonce(so) && !nextrecord) {
3604 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3605 #if CONTENT_FILTER
3606 && cfil_sock_data_pending(&so->so_rcv) == 0
3607 #endif /* CONTENT_FILTER */
3608 ))
3609 goto release;
3610
3611 /*
3612 * Depending on the protocol (e.g. TCP), the following
3613 * might cause the socket lock to be dropped and later
3614 * be reacquired, and more data could have arrived and
3615 * have been appended to the receive socket buffer by
3616 * the time it returns. Therefore, we only sleep in
3617 * sbwait() below if and only if the socket buffer is
3618 * empty, in order to avoid a false sleep.
3619 */
3620 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3621 (((struct inpcb *)so->so_pcb)->inp_state !=
3622 INPCB_STATE_DEAD))
3623 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3624
3625 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3626 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3627
3628 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3629 error = 0;
3630 goto release;
3631 }
3632 /*
3633 * have to wait until after we get back from the sbwait
3634 * to do the copy because we will drop the lock if we
3635 * have enough data that has been delayed... by dropping
3636 * the lock we open up a window allowing the netisr
3637 * thread to process the incoming packets and to change
3638 * the state of this socket... we're issuing the sbwait
3639 * because the socket is empty and we're expecting the
3640 * netisr thread to wake us up when more packets arrive;
3641 * if we allow that processing to happen and then sbwait
3642 * we could stall forever with packets sitting in the
3643 * socket if no further packets arrive from the remote
3644 * side.
3645 *
3646 * we want to copy before we've collected all the data
3647 * to satisfy this request to allow the copy to overlap
3648 * the incoming packet processing on an MP system
3649 */
3650 if (delayed_copy_len > sorecvmincopy &&
3651 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3652 error = sodelayed_copy(so, uio,
3653 &free_list, &delayed_copy_len);
3654
3655 if (error)
3656 goto release;
3657 }
3658 m = so->so_rcv.sb_mb;
3659 if (m != NULL) {
3660 nextrecord = m->m_nextpkt;
3661 }
3662 SB_MB_CHECK(&so->so_rcv);
3663 }
3664 }
3665 #ifdef MORE_LOCKING_DEBUG
3666 if (so->so_usecount <= 1) {
3667 panic("%s: after big while so=%p ref=%d on socket\n",
3668 __func__, so, so->so_usecount);
3669 /* NOTREACHED */
3670 }
3671 #endif
3672
3673 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3674 if (so->so_options & SO_DONTTRUNC) {
3675 flags |= MSG_RCVMORE;
3676 } else {
3677 flags |= MSG_TRUNC;
3678 if ((flags & MSG_PEEK) == 0)
3679 (void) sbdroprecord(&so->so_rcv);
3680 }
3681 }
3682
3683 /*
3684 * pru_rcvd below (for TCP) may cause more data to be received
3685 * if the socket lock is dropped prior to sending the ACK; some
3686 * legacy OpenTransport applications don't handle this well
3687 * (if it receives less data than requested while MSG_HAVEMORE
3688 * is set), and so we set the flag now based on what we know
3689 * prior to calling pru_rcvd.
3690 */
3691 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3692 flags |= MSG_HAVEMORE;
3693
3694 if ((flags & MSG_PEEK) == 0) {
3695 if (m == NULL) {
3696 so->so_rcv.sb_mb = nextrecord;
3697 /*
3698 * First part is an inline SB_EMPTY_FIXUP(). Second
3699 * part makes sure sb_lastrecord is up-to-date if
3700 * there is still data in the socket buffer.
3701 */
3702 if (so->so_rcv.sb_mb == NULL) {
3703 so->so_rcv.sb_mbtail = NULL;
3704 so->so_rcv.sb_lastrecord = NULL;
3705 } else if (nextrecord->m_nextpkt == NULL) {
3706 so->so_rcv.sb_lastrecord = nextrecord;
3707 }
3708 SB_MB_CHECK(&so->so_rcv);
3709 }
3710 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3711 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3712 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3713 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3714 }
3715
3716 if (delayed_copy_len) {
3717 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3718 if (error)
3719 goto release;
3720 }
3721 if (free_list != NULL) {
3722 m_freem_list(free_list);
3723 free_list = NULL;
3724 }
3725 if (need_event)
3726 postevent(so, 0, EV_OOB);
3727
3728 if (orig_resid == uio_resid(uio) && orig_resid &&
3729 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3730 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3731 goto restart;
3732 }
3733
3734 if (flagsp != NULL)
3735 *flagsp |= flags;
3736 release:
3737 #ifdef MORE_LOCKING_DEBUG
3738 if (so->so_usecount <= 1) {
3739 panic("%s: release so=%p ref=%d on socket\n", __func__,
3740 so, so->so_usecount);
3741 /* NOTREACHED */
3742 }
3743 #endif
3744 if (delayed_copy_len)
3745 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3746
3747 if (free_list != NULL)
3748 m_freem_list(free_list);
3749
3750 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3751
3752 if (en_tracing) {
3753 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3754 VM_KERNEL_ADDRPERM(so),
3755 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3756 (int64_t)(orig_resid - uio_resid(uio)));
3757 }
3758 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3759 so->so_rcv.sb_cc, 0, error);
3760
3761 return (error);
3762 }
3763
3764 /*
3765 * Returns: 0 Success
3766 * uiomove:EFAULT
3767 */
3768 static int
3769 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3770 user_ssize_t *resid)
3771 {
3772 int error = 0;
3773 struct mbuf *m;
3774
3775 m = *free_list;
3776
3777 socket_unlock(so, 0);
3778
3779 while (m != NULL && error == 0) {
3780 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3781 m = m->m_next;
3782 }
3783 m_freem_list(*free_list);
3784
3785 *free_list = NULL;
3786 *resid = 0;
3787
3788 socket_lock(so, 0);
3789
3790 return (error);
3791 }
3792
3793 static int
3794 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3795 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3796 {
3797 #pragma unused(so)
3798 int error = 0;
3799 struct mbuf *ml, *m;
3800 int i = 0;
3801 struct uio *auio;
3802
3803 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3804 ml = ml->m_nextpkt, i++) {
3805 auio = msgarray[i].uio;
3806 for (m = ml; m != NULL; m = m->m_next) {
3807 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3808 if (error != 0)
3809 goto out;
3810 }
3811 }
3812 out:
3813 m_freem_list(*free_list);
3814
3815 *free_list = NULL;
3816 *resid = 0;
3817
3818 return (error);
3819 }
3820
3821 int
3822 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3823 int *flagsp)
3824 {
3825 struct mbuf *m;
3826 struct mbuf *nextrecord;
3827 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3828 int error;
3829 user_ssize_t len, pktlen, delayed_copy_len = 0;
3830 struct protosw *pr = so->so_proto;
3831 user_ssize_t resid;
3832 struct proc *p = current_proc();
3833 struct uio *auio = NULL;
3834 int npkts = 0;
3835 int sblocked = 0;
3836 struct sockaddr **psa = NULL;
3837 struct mbuf **controlp = NULL;
3838 int can_delay;
3839 int flags;
3840 struct mbuf *free_others = NULL;
3841
3842 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3843 so, uiocnt,
3844 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3845
3846 /*
3847 * Sanity checks:
3848 * - Only supports don't wait flags
3849 * - Only support datagram sockets (could be extended to raw)
3850 * - Must be atomic
3851 * - Protocol must support packet chains
3852 * - The uio array is NULL (should we panic?)
3853 */
3854 if (flagsp != NULL)
3855 flags = *flagsp;
3856 else
3857 flags = 0;
3858 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3859 MSG_NBIO)) {
3860 printf("%s invalid flags 0x%x\n", __func__, flags);
3861 error = EINVAL;
3862 goto out;
3863 }
3864 if (so->so_type != SOCK_DGRAM) {
3865 error = EINVAL;
3866 goto out;
3867 }
3868 if (sosendallatonce(so) == 0) {
3869 error = EINVAL;
3870 goto out;
3871 }
3872 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3873 error = EPROTONOSUPPORT;
3874 goto out;
3875 }
3876 if (msgarray == NULL) {
3877 printf("%s uioarray is NULL\n", __func__);
3878 error = EINVAL;
3879 goto out;
3880 }
3881 if (uiocnt == 0) {
3882 printf("%s uiocnt is 0\n", __func__);
3883 error = EINVAL;
3884 goto out;
3885 }
3886 /*
3887 * Sanity check on the length passed by caller as we are making 'int'
3888 * comparisons
3889 */
3890 resid = recv_msg_array_resid(msgarray, uiocnt);
3891 if (resid < 0 || resid > INT_MAX) {
3892 error = EINVAL;
3893 goto out;
3894 }
3895
3896 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3897 can_delay = 1;
3898 else
3899 can_delay = 0;
3900
3901 socket_lock(so, 1);
3902 so_update_last_owner_locked(so, p);
3903 so_update_policy(so);
3904
3905 #if NECP
3906 so_update_necp_policy(so, NULL, NULL);
3907 #endif /* NECP */
3908
3909 /*
3910 * If a recv attempt is made on a previously-accepted socket
3911 * that has been marked as inactive (disconnected), reject
3912 * the request.
3913 */
3914 if (so->so_flags & SOF_DEFUNCT) {
3915 struct sockbuf *sb = &so->so_rcv;
3916
3917 error = ENOTCONN;
3918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3919 __func__, proc_pid(p), proc_best_name(p),
3920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3921 SOCK_DOM(so), SOCK_TYPE(so), error);
3922 /*
3923 * This socket should have been disconnected and flushed
3924 * prior to being returned from sodefunct(); there should
3925 * be no data on its receive list, so panic otherwise.
3926 */
3927 if (so->so_state & SS_DEFUNCT)
3928 sb_empty_assert(sb, __func__);
3929 goto release;
3930 }
3931
3932 next:
3933 /*
3934 * The uio may be empty
3935 */
3936 if (npkts >= uiocnt) {
3937 error = 0;
3938 goto release;
3939 }
3940 restart:
3941 /*
3942 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3943 * and if so just return to the caller. This could happen when
3944 * soreceive() is called by a socket upcall function during the
3945 * time the socket is freed. The socket buffer would have been
3946 * locked across the upcall, therefore we cannot put this thread
3947 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3948 * we may livelock), because the lock on the socket buffer will
3949 * only be released when the upcall routine returns to its caller.
3950 * Because the socket has been officially closed, there can be
3951 * no further read on it.
3952 */
3953 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3954 (SS_NOFDREF | SS_CANTRCVMORE)) {
3955 error = 0;
3956 goto release;
3957 }
3958
3959 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3960 if (error) {
3961 goto release;
3962 }
3963 sblocked = 1;
3964
3965 m = so->so_rcv.sb_mb;
3966 /*
3967 * Block awaiting more datagram if needed
3968 */
3969 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3970 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3971 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3972 /*
3973 * Panic if we notice inconsistencies in the socket's
3974 * receive list; both sb_mb and sb_cc should correctly
3975 * reflect the contents of the list, otherwise we may
3976 * end up with false positives during select() or poll()
3977 * which could put the application in a bad state.
3978 */
3979 SB_MB_CHECK(&so->so_rcv);
3980
3981 if (so->so_error) {
3982 error = so->so_error;
3983 if ((flags & MSG_PEEK) == 0)
3984 so->so_error = 0;
3985 goto release;
3986 }
3987 if (so->so_state & SS_CANTRCVMORE) {
3988 goto release;
3989 }
3990 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3991 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3992 error = ENOTCONN;
3993 goto release;
3994 }
3995 if ((so->so_state & SS_NBIO) ||
3996 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3997 error = EWOULDBLOCK;
3998 goto release;
3999 }
4000 /*
4001 * Do not block if we got some data
4002 */
4003 if (free_list != NULL) {
4004 error = 0;
4005 goto release;
4006 }
4007
4008 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4009 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4010
4011 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4012 sblocked = 0;
4013
4014 error = sbwait(&so->so_rcv);
4015 if (error) {
4016 goto release;
4017 }
4018 goto restart;
4019 }
4020
4021 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4022 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4023 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4024
4025 /*
4026 * Consume the current uio index as we have a datagram
4027 */
4028 auio = msgarray[npkts].uio;
4029 resid = uio_resid(auio);
4030 msgarray[npkts].which |= SOCK_MSG_DATA;
4031 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4032 &msgarray[npkts].psa : NULL;
4033 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4034 &msgarray[npkts].controlp : NULL;
4035 npkts += 1;
4036 nextrecord = m->m_nextpkt;
4037
4038 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4039 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4040 if (error == ERESTART)
4041 goto restart;
4042 else if (error != 0)
4043 goto release;
4044 }
4045
4046 if (m != NULL && m->m_type == MT_CONTROL) {
4047 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4048 if (error != 0)
4049 goto release;
4050 }
4051
4052 if (m->m_pkthdr.len == 0) {
4053 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4054 __func__, __LINE__,
4055 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4056 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4057 m->m_type);
4058 }
4059
4060 /*
4061 * Loop to copy the mbufs of the current record
4062 * Support zero length packets
4063 */
4064 ml = NULL;
4065 pktlen = 0;
4066 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4067 if (m->m_len == 0)
4068 panic("%p m_len zero", m);
4069 if (m->m_type == 0)
4070 panic("%p m_type zero", m);
4071 /*
4072 * Clip to the residual length
4073 */
4074 if (len > m->m_len)
4075 len = m->m_len;
4076 pktlen += len;
4077 /*
4078 * Copy the mbufs via the uio or delay the copy
4079 * Sockbuf must be consistent here (points to current mbuf,
4080 * it points to next record) when we drop priority;
4081 * we must note any additions to the sockbuf when we
4082 * block interrupts again.
4083 */
4084 if (len > 0 && can_delay == 0) {
4085 socket_unlock(so, 0);
4086 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4087 socket_lock(so, 0);
4088 if (error)
4089 goto release;
4090 } else {
4091 delayed_copy_len += len;
4092 }
4093
4094 if (len == m->m_len) {
4095 /*
4096 * m was entirely copied
4097 */
4098 sbfree(&so->so_rcv, m);
4099 nextrecord = m->m_nextpkt;
4100 m->m_nextpkt = NULL;
4101
4102 /*
4103 * Set the first packet to the head of the free list
4104 */
4105 if (free_list == NULL)
4106 free_list = m;
4107 /*
4108 * Link current packet to tail of free list
4109 */
4110 if (ml == NULL) {
4111 if (free_tail != NULL)
4112 free_tail->m_nextpkt = m;
4113 free_tail = m;
4114 }
4115 /*
4116 * Link current mbuf to last mbuf of current packet
4117 */
4118 if (ml != NULL)
4119 ml->m_next = m;
4120 ml = m;
4121
4122 /*
4123 * Move next buf to head of socket buffer
4124 */
4125 so->so_rcv.sb_mb = m = ml->m_next;
4126 ml->m_next = NULL;
4127
4128 if (m != NULL) {
4129 m->m_nextpkt = nextrecord;
4130 if (nextrecord == NULL)
4131 so->so_rcv.sb_lastrecord = m;
4132 } else {
4133 so->so_rcv.sb_mb = nextrecord;
4134 SB_EMPTY_FIXUP(&so->so_rcv);
4135 }
4136 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4137 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4138 } else {
4139 /*
4140 * Stop the loop on partial copy
4141 */
4142 break;
4143 }
4144 }
4145 #ifdef MORE_LOCKING_DEBUG
4146 if (so->so_usecount <= 1) {
4147 panic("%s: after big while so=%llx ref=%d on socket\n",
4148 __func__,
4149 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4150 /* NOTREACHED */
4151 }
4152 #endif
4153 /*
4154 * Tell the caller we made a partial copy
4155 */
4156 if (m != NULL) {
4157 if (so->so_options & SO_DONTTRUNC) {
4158 /*
4159 * Copyout first the freelist then the partial mbuf
4160 */
4161 socket_unlock(so, 0);
4162 if (delayed_copy_len)
4163 error = sodelayed_copy_list(so, msgarray,
4164 uiocnt, &free_list, &delayed_copy_len);
4165
4166 if (error == 0) {
4167 error = uiomove(mtod(m, caddr_t), (int)len,
4168 auio);
4169 }
4170 socket_lock(so, 0);
4171 if (error)
4172 goto release;
4173
4174 m->m_data += len;
4175 m->m_len -= len;
4176 so->so_rcv.sb_cc -= len;
4177 flags |= MSG_RCVMORE;
4178 } else {
4179 (void) sbdroprecord(&so->so_rcv);
4180 nextrecord = so->so_rcv.sb_mb;
4181 m = NULL;
4182 flags |= MSG_TRUNC;
4183 }
4184 }
4185
4186 if (m == NULL) {
4187 so->so_rcv.sb_mb = nextrecord;
4188 /*
4189 * First part is an inline SB_EMPTY_FIXUP(). Second
4190 * part makes sure sb_lastrecord is up-to-date if
4191 * there is still data in the socket buffer.
4192 */
4193 if (so->so_rcv.sb_mb == NULL) {
4194 so->so_rcv.sb_mbtail = NULL;
4195 so->so_rcv.sb_lastrecord = NULL;
4196 } else if (nextrecord->m_nextpkt == NULL) {
4197 so->so_rcv.sb_lastrecord = nextrecord;
4198 }
4199 SB_MB_CHECK(&so->so_rcv);
4200 }
4201 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4202 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4203
4204 /*
4205 * We can continue to the next packet as long as:
4206 * - We haven't exhausted the uio array
4207 * - There was no error
4208 * - A packet was not truncated
4209 * - We can still receive more data
4210 */
4211 if (npkts < uiocnt && error == 0 &&
4212 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4213 (so->so_state & SS_CANTRCVMORE) == 0) {
4214 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4215 sblocked = 0;
4216
4217 goto next;
4218 }
4219 if (flagsp != NULL)
4220 *flagsp |= flags;
4221
4222 release:
4223 /*
4224 * pru_rcvd may cause more data to be received if the socket lock
4225 * is dropped so we set MSG_HAVEMORE now based on what we know.
4226 * That way the caller won't be surprised if it receives less data
4227 * than requested.
4228 */
4229 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4230 flags |= MSG_HAVEMORE;
4231
4232 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4233 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4234
4235 if (sblocked)
4236 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4237 else
4238 socket_unlock(so, 1);
4239
4240 if (delayed_copy_len)
4241 error = sodelayed_copy_list(so, msgarray, uiocnt,
4242 &free_list, &delayed_copy_len);
4243 out:
4244 /*
4245 * Amortize the cost of freeing the mbufs
4246 */
4247 if (free_list != NULL)
4248 m_freem_list(free_list);
4249 if (free_others != NULL)
4250 m_freem_list(free_others);
4251
4252 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4253 0, 0, 0, 0);
4254 return (error);
4255 }
4256
4257 /*
4258 * Returns: 0 Success
4259 * EINVAL
4260 * ENOTCONN
4261 * <pru_shutdown>:EINVAL
4262 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4263 * <pru_shutdown>:ENOBUFS[TCP]
4264 * <pru_shutdown>:EMSGSIZE[TCP]
4265 * <pru_shutdown>:EHOSTUNREACH[TCP]
4266 * <pru_shutdown>:ENETUNREACH[TCP]
4267 * <pru_shutdown>:ENETDOWN[TCP]
4268 * <pru_shutdown>:ENOMEM[TCP]
4269 * <pru_shutdown>:EACCES[TCP]
4270 * <pru_shutdown>:EMSGSIZE[TCP]
4271 * <pru_shutdown>:ENOBUFS[TCP]
4272 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4273 * <pru_shutdown>:??? [other protocol families]
4274 */
4275 int
4276 soshutdown(struct socket *so, int how)
4277 {
4278 int error;
4279
4280 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4281
4282 switch (how) {
4283 case SHUT_RD:
4284 case SHUT_WR:
4285 case SHUT_RDWR:
4286 socket_lock(so, 1);
4287 if ((so->so_state &
4288 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4289 error = ENOTCONN;
4290 } else {
4291 error = soshutdownlock(so, how);
4292 }
4293 socket_unlock(so, 1);
4294 break;
4295 default:
4296 error = EINVAL;
4297 break;
4298 }
4299
4300 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4301
4302 return (error);
4303 }
4304
4305 int
4306 soshutdownlock_final(struct socket *so, int how)
4307 {
4308 struct protosw *pr = so->so_proto;
4309 int error = 0;
4310
4311 sflt_notify(so, sock_evt_shutdown, &how);
4312
4313 if (how != SHUT_WR) {
4314 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4315 /* read already shut down */
4316 error = ENOTCONN;
4317 goto done;
4318 }
4319 sorflush(so);
4320 postevent(so, 0, EV_RCLOSED);
4321 }
4322 if (how != SHUT_RD) {
4323 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4324 /* write already shut down */
4325 error = ENOTCONN;
4326 goto done;
4327 }
4328 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4329 postevent(so, 0, EV_WCLOSED);
4330 }
4331 done:
4332 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4333 return (error);
4334 }
4335
/*
 * Shut down a socket, first giving an attached content filter (when
 * CONTENT_FILTER is built in) the opportunity to intervene.
 *
 * cfil_sock_shutdown() may modify `how'.  If it returns EJUSTRETURN, this
 * routine reports success without calling soshutdownlock_final(); any other
 * non-zero value is returned to the caller as is.
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int cfil_error = cfil_sock_shutdown(so, &how);

		if (cfil_error == EJUSTRETURN)
			return (0);
		if (cfil_error != 0)
			return (cfil_error);
	}
#endif /* CONTENT_FILTER */

	return (soshutdownlock_final(so, how));
}
4362
4363 void
4364 sowflush(struct socket *so)
4365 {
4366 struct sockbuf *sb = &so->so_snd;
4367
4368 /*
4369 * Obtain lock on the socket buffer (SB_LOCK). This is required
4370 * to prevent the socket buffer from being unexpectedly altered
4371 * while it is used by another thread in socket send/receive.
4372 *
4373 * sblock() must not fail here, hence the assertion.
4374 */
4375 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4376 VERIFY(sb->sb_flags & SB_LOCK);
4377
4378 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4379 sb->sb_flags |= SB_DROP;
4380 sb->sb_upcall = NULL;
4381 sb->sb_upcallarg = NULL;
4382
4383 sbunlock(sb, TRUE); /* keep socket locked */
4384
4385 selthreadclear(&sb->sb_sel);
4386 sbrelease(sb);
4387 }
4388
4389 void
4390 sorflush(struct socket *so)
4391 {
4392 struct sockbuf *sb = &so->so_rcv;
4393 struct protosw *pr = so->so_proto;
4394 struct sockbuf asb;
4395 #ifdef notyet
4396 lck_mtx_t *mutex_held;
4397 /*
4398 * XXX: This code is currently commented out, because we may get here
4399 * as part of sofreelastref(), and at that time, pr_getlock() may no
4400 * longer be able to return us the lock; this will be fixed in future.
4401 */
4402 if (so->so_proto->pr_getlock != NULL)
4403 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4404 else
4405 mutex_held = so->so_proto->pr_domain->dom_mtx;
4406
4407 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4408 #endif /* notyet */
4409
4410 sflt_notify(so, sock_evt_flush_read, NULL);
4411
4412 socantrcvmore(so);
4413
4414 /*
4415 * Obtain lock on the socket buffer (SB_LOCK). This is required
4416 * to prevent the socket buffer from being unexpectedly altered
4417 * while it is used by another thread in socket send/receive.
4418 *
4419 * sblock() must not fail here, hence the assertion.
4420 */
4421 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4422 VERIFY(sb->sb_flags & SB_LOCK);
4423
4424 /*
4425 * Copy only the relevant fields from "sb" to "asb" which we
4426 * need for sbrelease() to function. In particular, skip
4427 * sb_sel as it contains the wait queue linkage, which would
4428 * wreak havoc if we were to issue selthreadclear() on "asb".
4429 * Make sure to not carry over SB_LOCK in "asb", as we need
4430 * to acquire it later as part of sbrelease().
4431 */
4432 bzero(&asb, sizeof (asb));
4433 asb.sb_cc = sb->sb_cc;
4434 asb.sb_hiwat = sb->sb_hiwat;
4435 asb.sb_mbcnt = sb->sb_mbcnt;
4436 asb.sb_mbmax = sb->sb_mbmax;
4437 asb.sb_ctl = sb->sb_ctl;
4438 asb.sb_lowat = sb->sb_lowat;
4439 asb.sb_mb = sb->sb_mb;
4440 asb.sb_mbtail = sb->sb_mbtail;
4441 asb.sb_lastrecord = sb->sb_lastrecord;
4442 asb.sb_so = sb->sb_so;
4443 asb.sb_flags = sb->sb_flags;
4444 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4445 asb.sb_flags |= SB_DROP;
4446
4447 /*
4448 * Ideally we'd bzero() these and preserve the ones we need;
4449 * but to do that we'd need to shuffle things around in the
4450 * sockbuf, and we can't do it now because there are KEXTS
4451 * that are directly referring to the socket structure.
4452 *
4453 * Setting SB_DROP acts as a barrier to prevent further appends.
4454 * Clearing SB_SEL is done for selthreadclear() below.
4455 */
4456 sb->sb_cc = 0;
4457 sb->sb_hiwat = 0;
4458 sb->sb_mbcnt = 0;
4459 sb->sb_mbmax = 0;
4460 sb->sb_ctl = 0;
4461 sb->sb_lowat = 0;
4462 sb->sb_mb = NULL;
4463 sb->sb_mbtail = NULL;
4464 sb->sb_lastrecord = NULL;
4465 sb->sb_timeo.tv_sec = 0;
4466 sb->sb_timeo.tv_usec = 0;
4467 sb->sb_upcall = NULL;
4468 sb->sb_upcallarg = NULL;
4469 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4470 sb->sb_flags |= SB_DROP;
4471
4472 sbunlock(sb, TRUE); /* keep socket locked */
4473
4474 /*
4475 * Note that selthreadclear() is called on the original "sb" and
4476 * not the local "asb" because of the way wait queue linkage is
4477 * implemented. Given that selwakeup() may be triggered, SB_SEL
4478 * should no longer be set (cleared above.)
4479 */
4480 selthreadclear(&sb->sb_sel);
4481
4482 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4483 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4484
4485 sbrelease(&asb);
4486 }
4487
4488 /*
4489 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4490 * an additional variant to handle the case where the option value needs
4491 * to be some kind of integer, but not a specific size.
4492 * In addition to their use here, these functions are also called by the
4493 * protocol-level pr_ctloutput() routines.
4494 *
4495 * Returns: 0 Success
4496 * EINVAL
4497 * copyin:EFAULT
4498 */
4499 int
4500 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4501 {
4502 size_t valsize;
4503
4504 /*
4505 * If the user gives us more than we wanted, we ignore it,
4506 * but if we don't get the minimum length the caller
4507 * wants, we return EINVAL. On success, sopt->sopt_valsize
4508 * is set to however much we actually retrieved.
4509 */
4510 if ((valsize = sopt->sopt_valsize) < minlen)
4511 return (EINVAL);
4512 if (valsize > len)
4513 sopt->sopt_valsize = valsize = len;
4514
4515 if (sopt->sopt_p != kernproc)
4516 return (copyin(sopt->sopt_val, buf, valsize));
4517
4518 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4519 return (0);
4520 }
4521
4522 /*
4523 * sooptcopyin_timeval
4524 * Copy in a timeval value into tv_p, and take into account whether the
4525 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4526 * code here so that we can verify the 64-bit tv_sec value before we lose
4527 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4528 */
4529 static int
4530 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4531 {
4532 int error;
4533
4534 if (proc_is64bit(sopt->sopt_p)) {
4535 struct user64_timeval tv64;
4536
4537 if (sopt->sopt_valsize < sizeof (tv64))
4538 return (EINVAL);
4539
4540 sopt->sopt_valsize = sizeof (tv64);
4541 if (sopt->sopt_p != kernproc) {
4542 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4543 if (error != 0)
4544 return (error);
4545 } else {
4546 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4547 sizeof (tv64));
4548 }
4549 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4550 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4551 return (EDOM);
4552
4553 tv_p->tv_sec = tv64.tv_sec;
4554 tv_p->tv_usec = tv64.tv_usec;
4555 } else {
4556 struct user32_timeval tv32;
4557
4558 if (sopt->sopt_valsize < sizeof (tv32))
4559 return (EINVAL);
4560
4561 sopt->sopt_valsize = sizeof (tv32);
4562 if (sopt->sopt_p != kernproc) {
4563 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4564 if (error != 0) {
4565 return (error);
4566 }
4567 } else {
4568 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4569 sizeof (tv32));
4570 }
4571 #ifndef __LP64__
4572 /*
4573 * K64todo "comparison is always false due to
4574 * limited range of data type"
4575 */
4576 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4577 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4578 return (EDOM);
4579 #endif
4580 tv_p->tv_sec = tv32.tv_sec;
4581 tv_p->tv_usec = tv32.tv_usec;
4582 }
4583 return (0);
4584 }
4585
4586 static int
4587 soopt_cred_check(struct socket *so, int priv)
4588 {
4589 kauth_cred_t cred = NULL;
4590 proc_t ep = PROC_NULL;
4591 int error;
4592
4593 if (so->so_flags & SOF_DELEGATED) {
4594 ep = proc_find(so->e_pid);
4595 if (ep)
4596 cred = kauth_cred_proc_ref(ep);
4597 }
4598 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4599 if (cred)
4600 kauth_cred_unref(&cred);
4601 if (ep != PROC_NULL)
4602 proc_rele(ep);
4603
4604 return (error);
4605 }
4606
/*
 * sosetoptlock
 *	Set the socket option described by "sopt" on socket "so".
 *
 *	so	socket to operate on
 *	sopt	option request; sopt_dir is forced to SOPT_SET
 *	dolock	non-zero to take the socket lock on entry and release it
 *		on exit
 *
 *	The request is refused with EINVAL once both directions of the
 *	socket have been shut down, unless SOF_NPX_SETOPTSHUT is set.
 *	Socket filters see the request first.  Options at a level other
 *	than SOL_SOCKET are passed to the protocol's pr_ctloutput; options
 *	at SOL_SOCKET are handled by the switch below after the protocol's
 *	pru_socheckopt handler has accepted them.
 *
 *	Inside the switch, "break" and "goto out" are not interchangeable:
 *	only paths that leave the switch with error == 0 via "break" reach
 *	the trailing pr_ctloutput call.
 *
 *	Besides the values listed below, individual options in this routine
 *	also return EPERM (SO_NOTIFYCONFLICT, SO_DEFUNCTOK), EBADF
 *	(SO_DEFUNCTOK) and EOPNOTSUPP (SO_AWDL_UNRESTRICTED,
 *	SO_INTCOPROC_ALLOW, SO_LABEL).
 */
4607 /*
4608 * Returns: 0 Success
4609 * EINVAL
4610 * ENOPROTOOPT
4611 * ENOBUFS
4612 * EDOM
4613 * sooptcopyin:EINVAL
4614 * sooptcopyin:EFAULT
4615 * sooptcopyin_timeval:EINVAL
4616 * sooptcopyin_timeval:EFAULT
4617 * sooptcopyin_timeval:EDOM
4618 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4619 * <pr_ctloutput>:???
4620 * sflt_attach_private:??? [whatever a filter author chooses]
4621 * <sf_setoption>:??? [whatever a filter author chooses]
4622 *
4623 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4624 * <sf_setoption> returns depend on what the filter author causes
4625 * their filter to return.
4626 */
4627 int
4628 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4629 {
4630 int error, optval;
4631 struct linger l;
4632 struct timeval tv;
4633 #if CONFIG_MACF_SOCKET
4634 struct mac extmac;
4635 #endif /* CONFIG_MACF_SOCKET */
4636
/* Whatever direction the caller filled in, this path is always a set. */
4637 if (sopt->sopt_dir != SOPT_SET)
4638 sopt->sopt_dir = SOPT_SET;
4639
4640 if (dolock)
4641 socket_lock(so, 1);
4642
4643 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4644 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4645 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4646 /* the socket has been shutdown, no more sockopt's */
4647 error = EINVAL;
4648 goto out;
4649 }
4650
/*
 * Socket filters get the first look.  Any non-zero result ends
 * processing here; EJUSTRETURN is reported to the caller as success.
 */
4651 error = sflt_setsockopt(so, sopt);
4652 if (error != 0) {
4653 if (error == EJUSTRETURN)
4654 error = 0;
4655 goto out;
4656 }
4657
/*
 * Levels other than SOL_SOCKET belong to the protocol: its pr_ctloutput
 * result is returned as-is, or ENOPROTOOPT when there is no handler.
 */
4658 if (sopt->sopt_level != SOL_SOCKET) {
4659 if (so->so_proto != NULL &&
4660 so->so_proto->pr_ctloutput != NULL) {
4661 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4662 goto out;
4663 }
4664 error = ENOPROTOOPT;
4665 } else {
4666 /*
4667 * Allow socket-level (SOL_SOCKET) options to be filtered by
4668 * the protocol layer, if needed. A zero value returned from
4669 * the handler means use default socket-level processing as
4670 * done by the rest of this routine. Otherwise, any other
4671 * return value indicates that the option is unsupported.
4672 */
4673 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4674 pru_socheckopt(so, sopt)) != 0)
4675 goto out;
4676
4677 error = 0;
4678 switch (sopt->sopt_name) {
/* SO_LINGER stores l_linger unchanged; SO_LINGER_SEC scales it by hz. */
4679 case SO_LINGER:
4680 case SO_LINGER_SEC:
4681 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4682 if (error != 0)
4683 goto out;
4684
4685 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4686 l.l_linger : l.l_linger * hz;
4687 if (l.l_onoff != 0)
4688 so->so_options |= SO_LINGER;
4689 else
4690 so->so_options &= ~SO_LINGER;
4691 break;
4692
/* For the options below, the option name is also the so_options bit. */
4693 case SO_DEBUG:
4694 case SO_KEEPALIVE:
4695 case SO_DONTROUTE:
4696 case SO_USELOOPBACK:
4697 case SO_BROADCAST:
4698 case SO_REUSEADDR:
4699 case SO_REUSEPORT:
4700 case SO_OOBINLINE:
4701 case SO_TIMESTAMP:
4702 case SO_TIMESTAMP_MONOTONIC:
4703 case SO_DONTTRUNC:
4704 case SO_WANTMORE:
4705 case SO_WANTOOBFLAG:
4706 case SO_NOWAKEFROMSLEEP:
4707 case SO_NOAPNFALLBK:
4708 error = sooptcopyin(sopt, &optval, sizeof (optval),
4709 sizeof (optval));
4710 if (error != 0)
4711 goto out;
4712 if (optval)
4713 so->so_options |= sopt->sopt_name;
4714 else
4715 so->so_options &= ~sopt->sopt_name;
4716 break;
4717
4718 case SO_SNDBUF:
4719 case SO_RCVBUF:
4720 case SO_SNDLOWAT:
4721 case SO_RCVLOWAT:
4722 error = sooptcopyin(sopt, &optval, sizeof (optval),
4723 sizeof (optval));
4724 if (error != 0)
4725 goto out;
4726
4727 /*
4728 * Values < 1 make no sense for any of these
4729 * options, so disallow them.
4730 */
4731 if (optval < 1) {
4732 error = EINVAL;
4733 goto out;
4734 }
4735
4736 switch (sopt->sopt_name) {
4737 case SO_SNDBUF:
4738 case SO_RCVBUF: {
4739 struct sockbuf *sb =
4740 (sopt->sopt_name == SO_SNDBUF) ?
4741 &so->so_snd : &so->so_rcv;
4742 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4743 error = ENOBUFS;
4744 goto out;
4745 }
/* An explicit size turns off SB_AUTOSIZE and is recorded as the ideal size. */
4746 sb->sb_flags |= SB_USRSIZE;
4747 sb->sb_flags &= ~SB_AUTOSIZE;
4748 sb->sb_idealsize = (u_int32_t)optval;
4749 break;
4750 }
4751 /*
4752 * Make sure the low-water is never greater than
4753 * the high-water.
4754 */
4755 case SO_SNDLOWAT: {
4756 int space = sbspace(&so->so_snd);
4757 u_int32_t hiwat = so->so_snd.sb_hiwat;
4758
/* For SB_UNIX buffers the connected peer's unp_cc is added to the high-water mark. */
4759 if (so->so_snd.sb_flags & SB_UNIX) {
4760 struct unpcb *unp =
4761 (struct unpcb *)(so->so_pcb);
4762 if (unp != NULL &&
4763 unp->unp_conn != NULL) {
4764 hiwat += unp->unp_conn->unp_cc;
4765 }
4766 }
4767
4768 so->so_snd.sb_lowat =
4769 (optval > hiwat) ?
4770 hiwat : optval;
4771
/* Wake writers if the space measured above already satisfies the new low-water mark. */
4772 if (space >= so->so_snd.sb_lowat) {
4773 sowwakeup(so);
4774 }
4775 break;
4776 }
4777 case SO_RCVLOWAT: {
4778 int64_t data_len;
4779 so->so_rcv.sb_lowat =
4780 (optval > so->so_rcv.sb_hiwat) ?
4781 so->so_rcv.sb_hiwat : optval;
/* Wake readers if the queued bytes, less sb_ctl, already reach the new low-water mark. */
4782 data_len = so->so_rcv.sb_cc
4783 - so->so_rcv.sb_ctl;
4784 if (data_len >= so->so_rcv.sb_lowat)
4785 sorwakeup(so);
4786 break;
4787 }
4788 }
4789 break;
4790
4791 case SO_SNDTIMEO:
4792 case SO_RCVTIMEO:
4793 error = sooptcopyin_timeval(sopt, &tv);
4794 if (error != 0)
4795 goto out;
4796
4797 switch (sopt->sopt_name) {
4798 case SO_SNDTIMEO:
4799 so->so_snd.sb_timeo = tv;
4800 break;
4801 case SO_RCVTIMEO:
4802 so->so_rcv.sb_timeo = tv;
4803 break;
4804 }
4805 break;
4806
4807 case SO_NKE: {
4808 struct so_nke nke;
4809
4810 error = sooptcopyin(sopt, &nke, sizeof (nke),
4811 sizeof (nke));
4812 if (error != 0)
4813 goto out;
4814
4815 error = sflt_attach_internal(so, nke.nke_handle);
4816 break;
4817 }
4818
4819 case SO_NOSIGPIPE:
4820 error = sooptcopyin(sopt, &optval, sizeof (optval),
4821 sizeof (optval));
4822 if (error != 0)
4823 goto out;
4824 if (optval != 0)
4825 so->so_flags |= SOF_NOSIGPIPE;
4826 else
4827 so->so_flags &= ~SOF_NOSIGPIPE;
4828 break;
4829
4830 case SO_NOADDRERR:
4831 error = sooptcopyin(sopt, &optval, sizeof (optval),
4832 sizeof (optval));
4833 if (error != 0)
4834 goto out;
4835 if (optval != 0)
4836 so->so_flags |= SOF_NOADDRAVAIL;
4837 else
4838 so->so_flags &= ~SOF_NOADDRAVAIL;
4839 break;
4840
4841 case SO_REUSESHAREUID:
4842 error = sooptcopyin(sopt, &optval, sizeof (optval),
4843 sizeof (optval));
4844 if (error != 0)
4845 goto out;
4846 if (optval != 0)
4847 so->so_flags |= SOF_REUSESHAREUID;
4848 else
4849 so->so_flags &= ~SOF_REUSESHAREUID;
4850 break;
4851
/* Refused with EPERM when kauth_cred_issuser() reports 0 for the caller's credential. */
4852 case SO_NOTIFYCONFLICT:
4853 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4854 error = EPERM;
4855 goto out;
4856 }
4857 error = sooptcopyin(sopt, &optval, sizeof (optval),
4858 sizeof (optval));
4859 if (error != 0)
4860 goto out;
4861 if (optval != 0)
4862 so->so_flags |= SOF_NOTIFYCONFLICT;
4863 else
4864 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4865 break;
4866
4867 case SO_RESTRICTIONS:
4868 error = sooptcopyin(sopt, &optval, sizeof (optval),
4869 sizeof (optval));
4870 if (error != 0)
4871 goto out;
4872
4873 error = so_set_restrictions(so, optval);
4874 break;
4875
/* PF_INET/PF_INET6 only; setting (but not clearing) requires PRIV_NET_RESTRICTED_AWDL. */
4876 case SO_AWDL_UNRESTRICTED:
4877 if (SOCK_DOM(so) != PF_INET &&
4878 SOCK_DOM(so) != PF_INET6) {
4879 error = EOPNOTSUPP;
4880 goto out;
4881 }
4882 error = sooptcopyin(sopt, &optval, sizeof(optval),
4883 sizeof(optval));
4884 if (error != 0)
4885 goto out;
4886 if (optval != 0) {
4887 error = soopt_cred_check(so,
4888 PRIV_NET_RESTRICTED_AWDL);
4889 if (error == 0)
4890 inp_set_awdl_unrestricted(
4891 sotoinpcb(so));
4892 } else
4893 inp_clear_awdl_unrestricted(sotoinpcb(so));
4894 break;
/* PF_INET6 only; the privilege check is skipped when the flag is already set. */
4895 case SO_INTCOPROC_ALLOW:
4896 if (SOCK_DOM(so) != PF_INET6) {
4897 error = EOPNOTSUPP;
4898 goto out;
4899 }
4900 error = sooptcopyin(sopt, &optval, sizeof(optval),
4901 sizeof(optval));
4902 if (error != 0)
4903 goto out;
4904 if (optval != 0 &&
4905 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
4906 error = soopt_cred_check(so,
4907 PRIV_NET_RESTRICTED_INTCOPROC);
4908 if (error == 0)
4909 inp_set_intcoproc_allowed(
4910 sotoinpcb(so));
4911 } else if (optval == 0)
4912 inp_clear_intcoproc_allowed(sotoinpcb(so));
4913 break;
4914
4915 case SO_LABEL:
4916 #if CONFIG_MACF_SOCKET
4917 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4918 sizeof (extmac))) != 0)
4919 goto out;
4920
4921 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4922 so, &extmac);
4923 #else
4924 error = EOPNOTSUPP;
4925 #endif /* CONFIG_MACF_SOCKET */
4926 break;
4927
4928 case SO_UPCALLCLOSEWAIT:
4929 error = sooptcopyin(sopt, &optval, sizeof (optval),
4930 sizeof (optval));
4931 if (error != 0)
4932 goto out;
4933 if (optval != 0)
4934 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4935 else
4936 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4937 break;
4938
4939 case SO_RANDOMPORT:
4940 error = sooptcopyin(sopt, &optval, sizeof (optval),
4941 sizeof (optval));
4942 if (error != 0)
4943 goto out;
4944 if (optval != 0)
4945 so->so_flags |= SOF_BINDRANDOMPORT;
4946 else
4947 so->so_flags &= ~SOF_BINDRANDOMPORT;
4948 break;
4949
4950 case SO_NP_EXTENSIONS: {
4951 struct so_np_extensions sonpx;
4952
4953 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4954 sizeof (sonpx));
4955 if (error != 0)
4956 goto out;
4957 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4958 error = EINVAL;
4959 goto out;
4960 }
4961 /*
4962 * Only one bit defined for now
4963 */
4964 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4965 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4966 so->so_flags |= SOF_NPX_SETOPTSHUT;
4967 else
4968 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4969 }
4970 break;
4971 }
4972
/*
 * Values at or above SO_TC_NET_SERVICE_OFFSET select a net service type
 * instead of a traffic class; that path leaves via "out" and therefore
 * skips the trailing pr_ctloutput call.
 */
4973 case SO_TRAFFIC_CLASS: {
4974 error = sooptcopyin(sopt, &optval, sizeof (optval),
4975 sizeof (optval));
4976 if (error != 0)
4977 goto out;
4978 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
4979 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
4980 error = so_set_net_service_type(so, netsvc);
4981 goto out;
4982 }
4983 error = so_set_traffic_class(so, optval);
4984 if (error != 0)
4985 goto out;
/* A plain traffic class clears any previously recorded net service type. */
4986 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
4987 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
4988 break;
4989 }
4990
4991 case SO_RECV_TRAFFIC_CLASS: {
4992 error = sooptcopyin(sopt, &optval, sizeof (optval),
4993 sizeof (optval));
4994 if (error != 0)
4995 goto out;
4996 if (optval == 0)
4997 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4998 else
4999 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5000 break;
5001 }
5002
5003 #if (DEVELOPMENT || DEBUG)
5004 case SO_TRAFFIC_CLASS_DBG: {
5005 struct so_tcdbg so_tcdbg;
5006
5007 error = sooptcopyin(sopt, &so_tcdbg,
5008 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5009 if (error != 0)
5010 goto out;
5011 error = so_set_tcdbg(so, &so_tcdbg);
5012 if (error != 0)
5013 goto out;
5014 break;
5015 }
5016 #endif /* (DEVELOPMENT || DEBUG) */
5017
/* The privilege check comes before the option value is even copied in. */
5018 case SO_PRIVILEGED_TRAFFIC_CLASS:
5019 error = priv_check_cred(kauth_cred_get(),
5020 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5021 if (error != 0)
5022 goto out;
5023 error = sooptcopyin(sopt, &optval, sizeof (optval),
5024 sizeof (optval));
5025 if (error != 0)
5026 goto out;
5027 if (optval == 0)
5028 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5029 else
5030 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5031 break;
5032
/* A copyin failure is returned as-is; a socket already marked SOF_DEFUNCT yields EBADF. */
5033 case SO_DEFUNCTOK:
5034 error = sooptcopyin(sopt, &optval, sizeof (optval),
5035 sizeof (optval));
5036 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5037 if (error == 0)
5038 error = EBADF;
5039 goto out;
5040 }
5041 /*
5042 * Any process can set SO_DEFUNCTOK (clear
5043 * SOF_NODEFUNCT), but only root can clear
5044 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5045 */
5046 if (optval == 0 &&
5047 kauth_cred_issuser(kauth_cred_get()) == 0) {
5048 error = EPERM;
5049 goto out;
5050 }
5051 if (optval)
5052 so->so_flags &= ~SOF_NODEFUNCT;
5053 else
5054 so->so_flags |= SOF_NODEFUNCT;
5055
/* The remainder of this case only logs the new state; the format depends on the socket domain. */
5056 if (SOCK_DOM(so) == PF_INET ||
5057 SOCK_DOM(so) == PF_INET6) {
5058 char s[MAX_IPv6_STR_LEN];
5059 char d[MAX_IPv6_STR_LEN];
5060 struct inpcb *inp = sotoinpcb(so);
5061
5062 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5063 "[%s %s:%d -> %s:%d] is now marked "
5064 "as %seligible for "
5065 "defunct\n", __func__, proc_selfpid(),
5066 proc_best_name(current_proc()),
5067 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5068 (SOCK_TYPE(so) == SOCK_STREAM) ?
5069 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5070 ((SOCK_DOM(so) == PF_INET) ?
5071 (void *)&inp->inp_laddr.s_addr :
5072 (void *)&inp->in6p_laddr), s, sizeof (s)),
5073 ntohs(inp->in6p_lport),
5074 inet_ntop(SOCK_DOM(so),
5075 (SOCK_DOM(so) == PF_INET) ?
5076 (void *)&inp->inp_faddr.s_addr :
5077 (void *)&inp->in6p_faddr, d, sizeof (d)),
5078 ntohs(inp->in6p_fport),
5079 (so->so_flags & SOF_NODEFUNCT) ?
5080 "not " : "");
5081 } else {
5082 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5083 "is now marked as %seligible for "
5084 "defunct\n",
5085 __func__, proc_selfpid(),
5086 proc_best_name(current_proc()),
5087 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5088 SOCK_DOM(so), SOCK_TYPE(so),
5089 (so->so_flags & SOF_NODEFUNCT) ?
5090 "not " : "");
5091 }
5092 break;
5093
5094 case SO_ISDEFUNCT:
5095 /* This option is not settable */
5096 error = EINVAL;
5097 break;
5098
5099 case SO_OPPORTUNISTIC:
5100 error = sooptcopyin(sopt, &optval, sizeof (optval),
5101 sizeof (optval));
5102 if (error == 0)
5103 error = so_set_opportunistic(so, optval);
5104 break;
5105
5106 case SO_FLUSH:
5107 /* This option is handled by lower layer(s) */
5108 error = 0;
5109 break;
5110
5111 case SO_RECV_ANYIF:
5112 error = sooptcopyin(sopt, &optval, sizeof (optval),
5113 sizeof (optval));
5114 if (error == 0)
5115 error = so_set_recv_anyif(so, optval);
5116 break;
5117
5118 case SO_TRAFFIC_MGT_BACKGROUND: {
5119 /* This option is handled by lower layer(s) */
5120 error = 0;
5121 break;
5122 }
5123
5124 #if FLOW_DIVERT
5125 case SO_FLOW_DIVERT_TOKEN:
5126 error = flow_divert_token_set(so, sopt);
5127 break;
5128 #endif /* FLOW_DIVERT */
5129
5130
5131 case SO_DELEGATED:
5132 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5133 sizeof (optval))) != 0)
5134 break;
5135
5136 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5137 break;
5138
5139 case SO_DELEGATED_UUID: {
5140 uuid_t euuid;
5141
5142 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5143 sizeof (euuid))) != 0)
5144 break;
5145
5146 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5147 break;
5148 }
5149
5150 #if NECP
5151 case SO_NECP_ATTRIBUTES:
5152 error = necp_set_socket_attributes(so, sopt);
5153 break;
5154 #endif /* NECP */
5155
5156 #if MPTCP
/* Accepted only on an MPTCP subflow or on a PF_MULTIPATH/IPPROTO_TCP socket. */
5157 case SO_MPTCP_FASTJOIN:
5158 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5159 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5160 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5161 error = ENOPROTOOPT;
5162 break;
5163 }
5164
5165 error = sooptcopyin(sopt, &optval, sizeof (optval),
5166 sizeof (optval));
5167 if (error != 0)
5168 goto out;
5169 if (optval == 0)
5170 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5171 else
5172 so->so_flags |= SOF_MPTCP_FASTJOIN;
5173 break;
5174 #endif /* MPTCP */
5175
5176 case SO_EXTENDED_BK_IDLE:
5177 error = sooptcopyin(sopt, &optval, sizeof (optval),
5178 sizeof (optval));
5179 if (error == 0)
5180 error = so_set_extended_bk_idle(so, optval);
5181 break;
5182
5183 case SO_MARK_CELLFALLBACK:
5184 error = sooptcopyin(sopt, &optval, sizeof(optval),
5185 sizeof(optval));
5186 if (error != 0)
5187 goto out;
5188 if (optval < 0) {
5189 error = EINVAL;
5190 goto out;
5191 }
5192 if (optval == 0)
5193 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5194 else
5195 so->so_flags1 |= SOF1_CELLFALLBACK;
5196 break;
5197
5198 case SO_NET_SERVICE_TYPE: {
5199 error = sooptcopyin(sopt, &optval, sizeof(optval),
5200 sizeof(optval));
5201 if (error != 0)
5202 goto out;
5203 error = so_set_net_service_type(so, optval);
5204 break;
5205 }
5206
/* The privilege check comes before the option value is even copied in. */
5207 case SO_QOSMARKING_POLICY_OVERRIDE:
5208 error = priv_check_cred(kauth_cred_get(),
5209 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5210 if (error != 0)
5211 goto out;
5212 error = sooptcopyin(sopt, &optval, sizeof(optval),
5213 sizeof(optval));
5214 if (error != 0)
5215 goto out;
5216 if (optval == 0)
5217 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5218 else
5219 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5220 break;
5221
5222 default:
5223 error = ENOPROTOOPT;
5224 break;
5225 }
/*
 * After a successful socket-level set, the option is also handed to the
 * protocol's pr_ctloutput; its return value is discarded (note the void
 * cast).  Paths that left the switch via "goto out" never get here.
 */
5226 if (error == 0 && so->so_proto != NULL &&
5227 so->so_proto->pr_ctloutput != NULL) {
5228 (void) so->so_proto->pr_ctloutput(so, sopt);
5229 }
5230 }
5231 out:
5232 if (dolock)
5233 socket_unlock(so, 1);
5234 return (error);
5235 }
5236
5237 /* Helper routines for getsockopt */
5238 int
5239 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5240 {
5241 int error;
5242 size_t valsize;
5243
5244 error = 0;
5245
5246 /*
5247 * Documented get behavior is that we always return a value,
5248 * possibly truncated to fit in the user's buffer.
5249 * Traditional behavior is that we always tell the user
5250 * precisely how much we copied, rather than something useful
5251 * like the total amount we had available for her.
5252 * Note that this interface is not idempotent; the entire answer must
5253 * generated ahead of time.
5254 */
5255 valsize = min(len, sopt->sopt_valsize);
5256 sopt->sopt_valsize = valsize;
5257 if (sopt->sopt_val != USER_ADDR_NULL) {
5258 if (sopt->sopt_p != kernproc)
5259 error = copyout(buf, sopt->sopt_val, valsize);
5260 else
5261 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5262 }
5263 return (error);
5264 }
5265
5266 static int
5267 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5268 {
5269 int error;
5270 size_t len;
5271 struct user64_timeval tv64;
5272 struct user32_timeval tv32;
5273 const void * val;
5274 size_t valsize;
5275
5276 error = 0;
5277 if (proc_is64bit(sopt->sopt_p)) {
5278 len = sizeof (tv64);
5279 tv64.tv_sec = tv_p->tv_sec;
5280 tv64.tv_usec = tv_p->tv_usec;
5281 val = &tv64;
5282 } else {
5283 len = sizeof (tv32);
5284 tv32.tv_sec = tv_p->tv_sec;
5285 tv32.tv_usec = tv_p->tv_usec;
5286 val = &tv32;
5287 }
5288 valsize = min(len, sopt->sopt_valsize);
5289 sopt->sopt_valsize = valsize;
5290 if (sopt->sopt_val != USER_ADDR_NULL) {
5291 if (sopt->sopt_p != kernproc)
5292 error = copyout(val, sopt->sopt_val, valsize);
5293 else
5294 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5295 }
5296 return (error);
5297 }
5298
5299 /*
5300 * Return: 0 Success
5301 * ENOPROTOOPT
5302 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5303 * <pr_ctloutput>:???
5304 * <sf_getoption>:???
5305 */
5306 int
5307 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5308 {
5309 int error, optval;
5310 struct linger l;
5311 struct timeval tv;
5312 #if CONFIG_MACF_SOCKET
5313 struct mac extmac;
5314 #endif /* MAC_SOCKET */
5315
5316 if (sopt->sopt_dir != SOPT_GET)
5317 sopt->sopt_dir = SOPT_GET;
5318
5319 if (dolock)
5320 socket_lock(so, 1);
5321
5322 error = sflt_getsockopt(so, sopt);
5323 if (error != 0) {
5324 if (error == EJUSTRETURN)
5325 error = 0;
5326 goto out;
5327 }
5328
5329 if (sopt->sopt_level != SOL_SOCKET) {
5330 if (so->so_proto != NULL &&
5331 so->so_proto->pr_ctloutput != NULL) {
5332 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5333 goto out;
5334 }
5335 error = ENOPROTOOPT;
5336 } else {
5337 /*
5338 * Allow socket-level (SOL_SOCKET) options to be filtered by
5339 * the protocol layer, if needed. A zero value returned from
5340 * the handler means use default socket-level processing as
5341 * done by the rest of this routine. Otherwise, any other
5342 * return value indicates that the option is unsupported.
5343 */
5344 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5345 pru_socheckopt(so, sopt)) != 0)
5346 goto out;
5347
5348 error = 0;
5349 switch (sopt->sopt_name) {
5350 case SO_LINGER:
5351 case SO_LINGER_SEC:
5352 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5353 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5354 so->so_linger : so->so_linger / hz;
5355 error = sooptcopyout(sopt, &l, sizeof (l));
5356 break;
5357
5358 case SO_USELOOPBACK:
5359 case SO_DONTROUTE:
5360 case SO_DEBUG:
5361 case SO_KEEPALIVE:
5362 case SO_REUSEADDR:
5363 case SO_REUSEPORT:
5364 case SO_BROADCAST:
5365 case SO_OOBINLINE:
5366 case SO_TIMESTAMP:
5367 case SO_TIMESTAMP_MONOTONIC:
5368 case SO_DONTTRUNC:
5369 case SO_WANTMORE:
5370 case SO_WANTOOBFLAG:
5371 case SO_NOWAKEFROMSLEEP:
5372 case SO_NOAPNFALLBK:
5373 optval = so->so_options & sopt->sopt_name;
5374 integer:
5375 error = sooptcopyout(sopt, &optval, sizeof (optval));
5376 break;
5377
5378 case SO_TYPE:
5379 optval = so->so_type;
5380 goto integer;
5381
5382 case SO_NREAD:
5383 if (so->so_proto->pr_flags & PR_ATOMIC) {
5384 int pkt_total;
5385 struct mbuf *m1;
5386
5387 pkt_total = 0;
5388 m1 = so->so_rcv.sb_mb;
5389 while (m1 != NULL) {
5390 if (m1->m_type == MT_DATA ||
5391 m1->m_type == MT_HEADER ||
5392 m1->m_type == MT_OOBDATA)
5393 pkt_total += m1->m_len;
5394 m1 = m1->m_next;
5395 }
5396 optval = pkt_total;
5397 } else {
5398 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5399 }
5400 goto integer;
5401
5402 case SO_NUMRCVPKT:
5403 if (so->so_proto->pr_flags & PR_ATOMIC) {
5404 int cnt = 0;
5405 struct mbuf *m1;
5406
5407 m1 = so->so_rcv.sb_mb;
5408 while (m1 != NULL) {
5409 if (m1->m_type == MT_DATA ||
5410 m1->m_type == MT_HEADER ||
5411 m1->m_type == MT_OOBDATA)
5412 cnt += 1;
5413 m1 = m1->m_nextpkt;
5414 }
5415 optval = cnt;
5416 goto integer;
5417 } else {
5418 error = EINVAL;
5419 break;
5420 }
5421
5422 case SO_NWRITE:
5423 optval = so->so_snd.sb_cc;
5424 goto integer;
5425
5426 case SO_ERROR:
5427 optval = so->so_error;
5428 so->so_error = 0;
5429 goto integer;
5430
5431 case SO_SNDBUF: {
5432 u_int32_t hiwat = so->so_snd.sb_hiwat;
5433
5434 if (so->so_snd.sb_flags & SB_UNIX) {
5435 struct unpcb *unp =
5436 (struct unpcb *)(so->so_pcb);
5437 if (unp != NULL && unp->unp_conn != NULL) {
5438 hiwat += unp->unp_conn->unp_cc;
5439 }
5440 }
5441
5442 optval = hiwat;
5443 goto integer;
5444 }
5445 case SO_RCVBUF:
5446 optval = so->so_rcv.sb_hiwat;
5447 goto integer;
5448
5449 case SO_SNDLOWAT:
5450 optval = so->so_snd.sb_lowat;
5451 goto integer;
5452
5453 case SO_RCVLOWAT:
5454 optval = so->so_rcv.sb_lowat;
5455 goto integer;
5456
5457 case SO_SNDTIMEO:
5458 case SO_RCVTIMEO:
5459 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5460 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5461
5462 error = sooptcopyout_timeval(sopt, &tv);
5463 break;
5464
5465 case SO_NOSIGPIPE:
5466 optval = (so->so_flags & SOF_NOSIGPIPE);
5467 goto integer;
5468
5469 case SO_NOADDRERR:
5470 optval = (so->so_flags & SOF_NOADDRAVAIL);
5471 goto integer;
5472
5473 case SO_REUSESHAREUID:
5474 optval = (so->so_flags & SOF_REUSESHAREUID);
5475 goto integer;
5476
5477
5478 case SO_NOTIFYCONFLICT:
5479 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5480 goto integer;
5481
5482 case SO_RESTRICTIONS:
5483 optval = so_get_restrictions(so);
5484 goto integer;
5485
5486 case SO_AWDL_UNRESTRICTED:
5487 if (SOCK_DOM(so) == PF_INET ||
5488 SOCK_DOM(so) == PF_INET6) {
5489 optval = inp_get_awdl_unrestricted(
5490 sotoinpcb(so));
5491 goto integer;
5492 } else
5493 error = EOPNOTSUPP;
5494 break;
5495
5496 case SO_INTCOPROC_ALLOW:
5497 if (SOCK_DOM(so) == PF_INET6) {
5498 optval = inp_get_intcoproc_allowed(
5499 sotoinpcb(so));
5500 goto integer;
5501 } else
5502 error = EOPNOTSUPP;
5503 break;
5504
5505 case SO_LABEL:
5506 #if CONFIG_MACF_SOCKET
5507 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5508 sizeof (extmac))) != 0 ||
5509 (error = mac_socket_label_get(proc_ucred(
5510 sopt->sopt_p), so, &extmac)) != 0)
5511 break;
5512
5513 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5514 #else
5515 error = EOPNOTSUPP;
5516 #endif /* MAC_SOCKET */
5517 break;
5518
5519 case SO_PEERLABEL:
5520 #if CONFIG_MACF_SOCKET
5521 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5522 sizeof (extmac))) != 0 ||
5523 (error = mac_socketpeer_label_get(proc_ucred(
5524 sopt->sopt_p), so, &extmac)) != 0)
5525 break;
5526
5527 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5528 #else
5529 error = EOPNOTSUPP;
5530 #endif /* MAC_SOCKET */
5531 break;
5532
5533 #ifdef __APPLE_API_PRIVATE
5534 case SO_UPCALLCLOSEWAIT:
5535 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5536 goto integer;
5537 #endif
5538 case SO_RANDOMPORT:
5539 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5540 goto integer;
5541
5542 case SO_NP_EXTENSIONS: {
5543 struct so_np_extensions sonpx;
5544
5545 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5546 SONPX_SETOPTSHUT : 0;
5547 sonpx.npx_mask = SONPX_MASK_VALID;
5548
5549 error = sooptcopyout(sopt, &sonpx,
5550 sizeof (struct so_np_extensions));
5551 break;
5552 }
5553
5554 case SO_TRAFFIC_CLASS:
5555 optval = so->so_traffic_class;
5556 goto integer;
5557
5558 case SO_RECV_TRAFFIC_CLASS:
5559 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5560 goto integer;
5561
5562 case SO_TRAFFIC_CLASS_STATS:
5563 error = sooptcopyout(sopt, &so->so_tc_stats,
5564 sizeof (so->so_tc_stats));
5565 break;
5566
5567 #if (DEVELOPMENT || DEBUG)
5568 case SO_TRAFFIC_CLASS_DBG:
5569 error = sogetopt_tcdbg(so, sopt);
5570 break;
5571 #endif /* (DEVELOPMENT || DEBUG) */
5572
5573 case SO_PRIVILEGED_TRAFFIC_CLASS:
5574 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5575 goto integer;
5576
5577 case SO_DEFUNCTOK:
5578 optval = !(so->so_flags & SOF_NODEFUNCT);
5579 goto integer;
5580
5581 case SO_ISDEFUNCT:
5582 optval = (so->so_flags & SOF_DEFUNCT);
5583 goto integer;
5584
5585 case SO_OPPORTUNISTIC:
5586 optval = so_get_opportunistic(so);
5587 goto integer;
5588
5589 case SO_FLUSH:
5590 /* This option is not gettable */
5591 error = EINVAL;
5592 break;
5593
5594 case SO_RECV_ANYIF:
5595 optval = so_get_recv_anyif(so);
5596 goto integer;
5597
5598 case SO_TRAFFIC_MGT_BACKGROUND:
5599 /* This option is handled by lower layer(s) */
5600 if (so->so_proto != NULL &&
5601 so->so_proto->pr_ctloutput != NULL) {
5602 (void) so->so_proto->pr_ctloutput(so, sopt);
5603 }
5604 break;
5605
5606 #if FLOW_DIVERT
5607 case SO_FLOW_DIVERT_TOKEN:
5608 error = flow_divert_token_get(so, sopt);
5609 break;
5610 #endif /* FLOW_DIVERT */
5611
5612 #if NECP
5613 case SO_NECP_ATTRIBUTES:
5614 error = necp_get_socket_attributes(so, sopt);
5615 break;
5616 #endif /* NECP */
5617
5618 #if CONTENT_FILTER
5619 case SO_CFIL_SOCK_ID: {
5620 cfil_sock_id_t sock_id;
5621
5622 sock_id = cfil_sock_id_from_socket(so);
5623
5624 error = sooptcopyout(sopt, &sock_id,
5625 sizeof(cfil_sock_id_t));
5626 break;
5627 }
5628 #endif /* CONTENT_FILTER */
5629
5630 #if MPTCP
5631 case SO_MPTCP_FASTJOIN:
5632 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5633 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5634 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5635 error = ENOPROTOOPT;
5636 break;
5637 }
5638 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5639 /* Fixed along with rdar://19391339 */
5640 goto integer;
5641 #endif /* MPTCP */
5642
5643 case SO_EXTENDED_BK_IDLE:
5644 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5645 goto integer;
5646 case SO_MARK_CELLFALLBACK:
5647 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5648 ? 1 : 0;
5649 goto integer;
5650 case SO_NET_SERVICE_TYPE: {
5651 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5652 optval = so->so_netsvctype;
5653 else
5654 optval = NET_SERVICE_TYPE_BE;
5655 goto integer;
5656 }
5657 case SO_NETSVC_MARKING_LEVEL:
5658 optval = so_get_netsvc_marking_level(so);
5659 goto integer;
5660
5661 default:
5662 error = ENOPROTOOPT;
5663 break;
5664 }
5665 }
5666 out:
5667 if (dolock)
5668 socket_unlock(so, 1);
5669 return (error);
5670 }
5671
5672 /*
5673 * The size limits on our soopt_getm is different from that on FreeBSD.
5674 * We limit the size of options to MCLBYTES. This will have to change
5675 * if we need to define options that need more space than MCLBYTES.
5676 */
5677 int
5678 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5679 {
5680 struct mbuf *m, *m_prev;
5681 int sopt_size = sopt->sopt_valsize;
5682 int how;
5683
5684 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5685 return (EMSGSIZE);
5686
5687 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5688 MGET(m, how, MT_DATA);
5689 if (m == NULL)
5690 return (ENOBUFS);
5691 if (sopt_size > MLEN) {
5692 MCLGET(m, how);
5693 if ((m->m_flags & M_EXT) == 0) {
5694 m_free(m);
5695 return (ENOBUFS);
5696 }
5697 m->m_len = min(MCLBYTES, sopt_size);
5698 } else {
5699 m->m_len = min(MLEN, sopt_size);
5700 }
5701 sopt_size -= m->m_len;
5702 *mp = m;
5703 m_prev = m;
5704
5705 while (sopt_size > 0) {
5706 MGET(m, how, MT_DATA);
5707 if (m == NULL) {
5708 m_freem(*mp);
5709 return (ENOBUFS);
5710 }
5711 if (sopt_size > MLEN) {
5712 MCLGET(m, how);
5713 if ((m->m_flags & M_EXT) == 0) {
5714 m_freem(*mp);
5715 m_freem(m);
5716 return (ENOBUFS);
5717 }
5718 m->m_len = min(MCLBYTES, sopt_size);
5719 } else {
5720 m->m_len = min(MLEN, sopt_size);
5721 }
5722 sopt_size -= m->m_len;
5723 m_prev->m_next = m;
5724 m_prev = m;
5725 }
5726 return (0);
5727 }
5728
5729 /* copyin sopt data into mbuf chain */
5730 int
5731 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5732 {
5733 struct mbuf *m0 = m;
5734
5735 if (sopt->sopt_val == USER_ADDR_NULL)
5736 return (0);
5737 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5738 if (sopt->sopt_p != kernproc) {
5739 int error;
5740
5741 error = copyin(sopt->sopt_val, mtod(m, char *),
5742 m->m_len);
5743 if (error != 0) {
5744 m_freem(m0);
5745 return (error);
5746 }
5747 } else {
5748 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5749 mtod(m, char *), m->m_len);
5750 }
5751 sopt->sopt_valsize -= m->m_len;
5752 sopt->sopt_val += m->m_len;
5753 m = m->m_next;
5754 }
5755 /* should be allocated enoughly at ip6_sooptmcopyin() */
5756 if (m != NULL) {
5757 panic("soopt_mcopyin");
5758 /* NOTREACHED */
5759 }
5760 return (0);
5761 }
5762
5763 /* copyout mbuf chain data into soopt */
5764 int
5765 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5766 {
5767 struct mbuf *m0 = m;
5768 size_t valsize = 0;
5769
5770 if (sopt->sopt_val == USER_ADDR_NULL)
5771 return (0);
5772 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5773 if (sopt->sopt_p != kernproc) {
5774 int error;
5775
5776 error = copyout(mtod(m, char *), sopt->sopt_val,
5777 m->m_len);
5778 if (error != 0) {
5779 m_freem(m0);
5780 return (error);
5781 }
5782 } else {
5783 bcopy(mtod(m, char *),
5784 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5785 }
5786 sopt->sopt_valsize -= m->m_len;
5787 sopt->sopt_val += m->m_len;
5788 valsize += m->m_len;
5789 m = m->m_next;
5790 }
5791 if (m != NULL) {
5792 /* enough soopt buffer should be given from user-land */
5793 m_freem(m0);
5794 return (EINVAL);
5795 }
5796 sopt->sopt_valsize = valsize;
5797 return (0);
5798 }
5799
5800 void
5801 sohasoutofband(struct socket *so)
5802 {
5803 if (so->so_pgid < 0)
5804 gsignal(-so->so_pgid, SIGURG);
5805 else if (so->so_pgid > 0)
5806 proc_signal(so->so_pgid, SIGURG);
5807 selwakeup(&so->so_rcv.sb_sel);
5808 if (so->so_rcv.sb_flags & SB_KNOTE) {
5809 KNOTE(&so->so_rcv.sb_sel.si_note,
5810 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5811 }
5812 }
5813
5814 int
5815 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5816 {
5817 #pragma unused(cred)
5818 struct proc *p = current_proc();
5819 int revents = 0;
5820
5821 socket_lock(so, 1);
5822 so_update_last_owner_locked(so, PROC_NULL);
5823 so_update_policy(so);
5824
5825 if (events & (POLLIN | POLLRDNORM))
5826 if (soreadable(so))
5827 revents |= events & (POLLIN | POLLRDNORM);
5828
5829 if (events & (POLLOUT | POLLWRNORM))
5830 if (sowriteable(so))
5831 revents |= events & (POLLOUT | POLLWRNORM);
5832
5833 if (events & (POLLPRI | POLLRDBAND))
5834 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5835 revents |= events & (POLLPRI | POLLRDBAND);
5836
5837 if (revents == 0) {
5838 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5839 /*
5840 * Darwin sets the flag first,
5841 * BSD calls selrecord first
5842 */
5843 so->so_rcv.sb_flags |= SB_SEL;
5844 selrecord(p, &so->so_rcv.sb_sel, wql);
5845 }
5846
5847 if (events & (POLLOUT | POLLWRNORM)) {
5848 /*
5849 * Darwin sets the flag first,
5850 * BSD calls selrecord first
5851 */
5852 so->so_snd.sb_flags |= SB_SEL;
5853 selrecord(p, &so->so_snd.sb_sel, wql);
5854 }
5855 }
5856
5857 socket_unlock(so, 1);
5858 return (revents);
5859 }
5860
5861 int
5862 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5863 {
5864 #pragma unused(fp)
5865 #if !CONFIG_MACF_SOCKET
5866 #pragma unused(ctx)
5867 #endif /* MAC_SOCKET */
5868 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5869 int result;
5870
5871 socket_lock(so, 1);
5872 so_update_last_owner_locked(so, PROC_NULL);
5873 so_update_policy(so);
5874
5875 #if CONFIG_MACF_SOCKET
5876 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5877 kn, so) != 0) {
5878 socket_unlock(so, 1);
5879 kn->kn_flags = EV_ERROR;
5880 kn->kn_data = EPERM;
5881 return 0;
5882 }
5883 #endif /* MAC_SOCKET */
5884
5885 switch (kn->kn_filter) {
5886 case EVFILT_READ:
5887 kn->kn_filtid = EVFILTID_SOREAD;
5888 break;
5889 case EVFILT_WRITE:
5890 kn->kn_filtid = EVFILTID_SOWRITE;
5891 break;
5892 case EVFILT_SOCK:
5893 kn->kn_filtid = EVFILTID_SCK;
5894 break;
5895 case EVFILT_EXCEPT:
5896 kn->kn_filtid = EVFILTID_SOEXCEPT;
5897 break;
5898 default:
5899 socket_unlock(so, 1);
5900 kn->kn_flags = EV_ERROR;
5901 kn->kn_data = EINVAL;
5902 return 0;
5903 }
5904
5905 /*
5906 * call the appropriate sub-filter attach
5907 * with the socket still locked
5908 */
5909 result = knote_fops(kn)->f_attach(kn);
5910
5911 socket_unlock(so, 1);
5912
5913 return result;
5914 }
5915
/*
 * Evaluate the read filter for a socket knote.
 *
 * Updates kn->kn_data (and, depending on the path taken, kn_fflags and
 * kn_flags) and returns non-zero when the knote should fire.  The
 * checks are made in this order: listening socket, out-of-band data
 * (only when NOTE_OOB was requested), receive side shut down, pending
 * socket error, and finally the low water mark comparison.
 *
 * Every caller visible in this file invokes it with the socket locked.
 */
static int
filt_soread_common(struct knote *kn, struct socket *so)
{
	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;

		/*
		 * Radar 6615193: handle the listen case dynamically
		 * for the kqueue read filter.  This allows listen() to
		 * be called after registering the kqueue EVFILT_READ.
		 */

		/* report the queue length; fire when so_comp is non-empty */
		kn->kn_data = so->so_qlen;
		is_not_empty = ! TAILQ_EMPTY(&so->so_comp);

		return (is_not_empty);
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			/* so_oobmark is subtracted from the reported count */
			kn->kn_data -= so->so_oobmark;
			return (1);
		}
	}

	/*
	 * Receive side is shut down (and, with CONTENT_FILTER, no data
	 * is pending in the content filter): report EOF and hand the
	 * socket error back in kn_fflags.
	 */
	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}

	if (so->so_error) {	/* temporary udp error */
		return (1);
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
			lowwat = so->so_rcv.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}

	/*
	 * The order below is important. Since NOTE_LOWAT
	 * overrides sb_lowat, check for NOTE_LOWAT case
	 * first.  With NOTE_LOWAT the comparison uses kn_data (control
	 * bytes excluded); without it the raw sb_cc count is used.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= lowwat);

	return (so->so_rcv.sb_cc >= lowwat);
}
5987
5988 static int
5989 filt_sorattach(struct knote *kn)
5990 {
5991 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5992
5993 /* socket locked */
5994
5995 /*
5996 * If the caller explicitly asked for OOB results (e.g. poll())
5997 * from EVFILT_READ, then save that off in the hookid field
5998 * and reserve the kn_flags EV_OOBAND bit for output only.
5999 */
6000 if (kn->kn_filter == EVFILT_READ &&
6001 kn->kn_flags & EV_OOBAND) {
6002 kn->kn_flags &= ~EV_OOBAND;
6003 kn->kn_hookid = EV_OOBAND;
6004 } else {
6005 kn->kn_hookid = 0;
6006 }
6007 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6008 so->so_rcv.sb_flags |= SB_KNOTE;
6009
6010 /* indicate if event is already fired */
6011 return filt_soread_common(kn, so);
6012 }
6013
6014 static void
6015 filt_sordetach(struct knote *kn)
6016 {
6017 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6018
6019 socket_lock(so, 1);
6020 if (so->so_rcv.sb_flags & SB_KNOTE)
6021 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6022 so->so_rcv.sb_flags &= ~SB_KNOTE;
6023 socket_unlock(so, 1);
6024 }
6025
6026 /*ARGSUSED*/
6027 static int
6028 filt_soread(struct knote *kn, long hint)
6029 {
6030 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6031 int retval;
6032
6033 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6034 socket_lock(so, 1);
6035
6036 retval = filt_soread_common(kn, so);
6037
6038 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6039 socket_unlock(so, 1);
6040
6041 return retval;
6042 }
6043
6044 static int
6045 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6046 {
6047 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6048 int retval;
6049
6050 socket_lock(so, 1);
6051
6052 /* save off the new input fflags and data */
6053 kn->kn_sfflags = kev->fflags;
6054 kn->kn_sdata = kev->data;
6055 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6056 kn->kn_udata = kev->udata;
6057
6058 /* determine if changes result in fired events */
6059 retval = filt_soread_common(kn, so);
6060
6061 socket_unlock(so, 1);
6062
6063 return retval;
6064 }
6065
6066 static int
6067 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6068 {
6069 #pragma unused(data)
6070 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6071 int retval;
6072
6073 socket_lock(so, 1);
6074 retval = filt_soread_common(kn, so);
6075 if (retval) {
6076 *kev = kn->kn_kevent;
6077 if (kn->kn_flags & EV_CLEAR) {
6078 kn->kn_fflags = 0;
6079 kn->kn_data = 0;
6080 }
6081 }
6082 socket_unlock(so, 1);
6083
6084 return retval;
6085 }
6086
6087 int
6088 so_wait_for_if_feedback(struct socket *so)
6089 {
6090 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6091 (so->so_state & SS_ISCONNECTED)) {
6092 struct inpcb *inp = sotoinpcb(so);
6093 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6094 return (1);
6095 }
6096 return (0);
6097 }
6098
/*
 * Evaluate the write filter for a socket knote.
 *
 * Stores the send buffer space in kn->kn_data and returns non-zero when
 * the knote should fire.  The checks are made in this order: send side
 * shut down (EV_EOF, error in kn_fflags), pending socket error, socket
 * not writable, preconnect data allowed, and finally the low water
 * mark comparison, optionally refined by the not-sent low water mark
 * check and vetoed by so_wait_for_if_feedback().
 *
 * Every caller visible in this file invokes it with the socket locked.
 */
static int
filt_sowrite_common(struct knote *kn, struct socket *so)
{
	int ret = 0;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return 1;
	}
	if (so->so_error) {	/* temporary udp error */
		return 1;
	}
	if (!socanwrite(so)) {
		return 0;
	}
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		return 1;
	}
	/*
	 * Start from the send buffer's low water mark; NOTE_LOWAT may
	 * raise it, but never above the buffer's high water mark.
	 */
	int64_t lowwat = so->so_snd.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat)
			lowwat = so->so_snd.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}
	if (kn->kn_data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			/* defer to the protocol's not-sent low water mark check */
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				/*
				 * Note: this path returns directly and so
				 * skips the so_wait_for_if_feedback() check
				 * below.
				 */
				return 1;
			}
		} else {
			ret = 1;
		}
	}
	/* never report writable while interface feedback is awaited */
	if (so_wait_for_if_feedback(so))
		ret = 0;
	return (ret);
}
6154
6155 static int
6156 filt_sowattach(struct knote *kn)
6157 {
6158 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6159
6160 /* socket locked */
6161 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6162 so->so_snd.sb_flags |= SB_KNOTE;
6163
6164 /* determine if its already fired */
6165 return filt_sowrite_common(kn, so);
6166 }
6167
6168 static void
6169 filt_sowdetach(struct knote *kn)
6170 {
6171 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6172 socket_lock(so, 1);
6173
6174 if (so->so_snd.sb_flags & SB_KNOTE)
6175 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6176 so->so_snd.sb_flags &= ~SB_KNOTE;
6177 socket_unlock(so, 1);
6178 }
6179
6180 /*ARGSUSED*/
6181 static int
6182 filt_sowrite(struct knote *kn, long hint)
6183 {
6184 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6185 int ret;
6186
6187 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6188 socket_lock(so, 1);
6189
6190 ret = filt_sowrite_common(kn, so);
6191
6192 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6193 socket_unlock(so, 1);
6194
6195 return ret;
6196 }
6197
6198 static int
6199 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6200 {
6201 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6202 int ret;
6203
6204 socket_lock(so, 1);
6205
6206 /*save off the new input fflags and data */
6207 kn->kn_sfflags = kev->fflags;
6208 kn->kn_sdata = kev->data;
6209 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6210 kn->kn_udata = kev->udata;
6211
6212 /* determine if these changes result in a triggered event */
6213 ret = filt_sowrite_common(kn, so);
6214
6215 socket_unlock(so, 1);
6216
6217 return ret;
6218 }
6219
6220 static int
6221 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6222 {
6223 #pragma unused(data)
6224 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6225 int ret;
6226
6227 socket_lock(so, 1);
6228 ret = filt_sowrite_common(kn, so);
6229 if (ret) {
6230 *kev = kn->kn_kevent;
6231 if (kn->kn_flags & EV_CLEAR) {
6232 kn->kn_fflags = 0;
6233 kn->kn_data = 0;
6234 }
6235 }
6236 socket_unlock(so, 1);
6237 return ret;
6238 }
6239
/*
 * Evaluate the EVFILT_SOCK filter for a socket knote.
 *
 * Translates the SO_FILT_HINT_* bits in `ev_hint' and the current
 * socket state into NOTE_* bits in kn->kn_fflags, stores either the
 * socket error or the connection state in kn->kn_data, and returns
 * non-zero when there is something to deliver.
 *
 * Some events are level triggered: they are derived from socket state
 * rather than from a hint alone and are tracked in `level_trigger'.
 * kn_hookid holds the level-triggered events already delivered (it is
 * updated by filt_sockprocess() and filt_socktouch()); such events do
 * not cause another wakeup on their own.
 *
 * Every caller visible in this file invokes it with the socket locked.
 */
static int
filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
{
	int ret = 0;
	uint32_t level_trigger = 0;

	/* events reported purely from the hint */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* events reported from the hint or from the socket state */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	/* only reported for protocols that set PR_EVCONNINFO */
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/*
	 * NOTE_SUSPEND and NOTE_RESUME are mutually exclusive: each of
	 * the two blocks below first clears both bits and then sets its
	 * own.  When both conditions hold (e.g. a suspend hint on a
	 * socket that is not SOF_SUSPENDED), the resume block runs last
	 * and wins.
	 */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		/* a pending socket error always fires, flagged as EOF */
		ret = 1;
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		/*
		 * NOTE(review): kn_data is accessed through a u_int32_t
		 * pointer cast here; if the field is wider than 32 bits
		 * only part of it is read and written -- confirm that
		 * this is intended.
		 */
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hookid;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0)
		ret = 1;

	return (ret);
}
6346
6347 static int
6348 filt_sockattach(struct knote *kn)
6349 {
6350 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6351
6352 /* socket locked */
6353 kn->kn_hookid = 0;
6354 if (KNOTE_ATTACH(&so->so_klist, kn))
6355 so->so_flags |= SOF_KNOTE;
6356
6357 /* determine if event already fired */
6358 return filt_sockev_common(kn, so, 0);
6359 }
6360
6361 static void
6362 filt_sockdetach(struct knote *kn)
6363 {
6364 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6365 socket_lock(so, 1);
6366
6367 if ((so->so_flags & SOF_KNOTE) != 0)
6368 if (KNOTE_DETACH(&so->so_klist, kn))
6369 so->so_flags &= ~SOF_KNOTE;
6370 socket_unlock(so, 1);
6371 }
6372
6373 static int
6374 filt_sockev(struct knote *kn, long hint)
6375 {
6376 int ret = 0, locked = 0;
6377 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6378 long ev_hint = (hint & SO_FILT_HINT_EV);
6379
6380 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6381 socket_lock(so, 1);
6382 locked = 1;
6383 }
6384
6385 ret = filt_sockev_common(kn, so, ev_hint);
6386
6387 if (locked)
6388 socket_unlock(so, 1);
6389
6390 return ret;
6391 }
6392
6393
6394
6395 /*
6396 * filt_socktouch - update event state
6397 */
6398 static int
6399 filt_socktouch(
6400 struct knote *kn,
6401 struct kevent_internal_s *kev)
6402 {
6403 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6404 uint32_t changed_flags;
6405 int ret;
6406
6407 socket_lock(so, 1);
6408
6409 /* save off the [result] data and fflags */
6410 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6411
6412 /* save off the new input fflags and data */
6413 kn->kn_sfflags = kev->fflags;
6414 kn->kn_sdata = kev->data;
6415 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6416 kn->kn_udata = kev->udata;
6417
6418 /* restrict the current results to the (smaller?) set of new interest */
6419 /*
6420 * For compatibility with previous implementations, we leave kn_fflags
6421 * as they were before.
6422 */
6423 //kn->kn_fflags &= kev->fflags;
6424
6425 /*
6426 * Since we keep track of events that are already
6427 * delivered, if any of those events are not requested
6428 * anymore the state related to them can be reset
6429 */
6430 kn->kn_hookid &=
6431 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6432
6433 /* determine if we have events to deliver */
6434 ret = filt_sockev_common(kn, so, 0);
6435
6436 socket_unlock(so, 1);
6437
6438 return ret;
6439 }
6440
6441 /*
6442 * filt_sockprocess - query event fired state and return data
6443 */
6444 static int
6445 filt_sockprocess(
6446 struct knote *kn,
6447 struct filt_process_s *data,
6448 struct kevent_internal_s *kev)
6449 {
6450 #pragma unused(data)
6451
6452 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6453 int ret = 0;
6454
6455 socket_lock(so, 1);
6456
6457 ret = filt_sockev_common(kn, so, 0);
6458 if (ret) {
6459 *kev = kn->kn_kevent;
6460
6461 /*
6462 * Store the state of the events being delivered. This
6463 * state can be used to deliver level triggered events
6464 * ateast once and still avoid waking up the application
6465 * multiple times as long as the event is active.
6466 */
6467 if (kn->kn_fflags != 0)
6468 kn->kn_hookid |= (kn->kn_fflags &
6469 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6470
6471 /*
6472 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6473 * only one of them and remember the last one that was
6474 * delivered last
6475 */
6476 if (kn->kn_fflags & NOTE_SUSPEND)
6477 kn->kn_hookid &= ~NOTE_RESUME;
6478 if (kn->kn_fflags & NOTE_RESUME)
6479 kn->kn_hookid &= ~NOTE_SUSPEND;
6480
6481 if (kn->kn_flags & EV_CLEAR) {
6482 kn->kn_data = 0;
6483 kn->kn_fflags = 0;
6484 }
6485 }
6486
6487 socket_unlock(so, 1);
6488
6489 return ret;
6490 }
6491
6492 void
6493 get_sockev_state(struct socket *so, u_int32_t *statep)
6494 {
6495 u_int32_t state = *(statep);
6496
6497 /*
6498 * If the state variable is already used by a previous event,
6499 * reset it.
6500 */
6501 if (state != 0)
6502 return;
6503
6504 if (so->so_state & SS_ISCONNECTED)
6505 state |= SOCKEV_CONNECTED;
6506 else
6507 state &= ~(SOCKEV_CONNECTED);
6508 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6509 *(statep) = state;
6510 }
6511
6512 #define SO_LOCK_HISTORY_STR_LEN \
6513 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6514
6515 __private_extern__ const char *
6516 solockhistory_nr(struct socket *so)
6517 {
6518 size_t n = 0;
6519 int i;
6520 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6521
6522 bzero(lock_history_str, sizeof (lock_history_str));
6523 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6524 n += snprintf(lock_history_str + n,
6525 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6526 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6527 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6528 }
6529 return (lock_history_str);
6530 }
6531
6532 int
6533 socket_lock(struct socket *so, int refcount)
6534 {
6535 int error = 0;
6536 void *lr_saved;
6537
6538 lr_saved = __builtin_return_address(0);
6539
6540 if (so->so_proto->pr_lock) {
6541 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6542 } else {
6543 #ifdef MORE_LOCKING_DEBUG
6544 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6545 LCK_MTX_ASSERT_NOTOWNED);
6546 #endif
6547 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6548 if (refcount)
6549 so->so_usecount++;
6550 so->lock_lr[so->next_lock_lr] = lr_saved;
6551 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6552 }
6553
6554 return (error);
6555 }
6556
6557 int
6558 socket_unlock(struct socket *so, int refcount)
6559 {
6560 int error = 0;
6561 void *lr_saved;
6562 lck_mtx_t *mutex_held;
6563
6564 lr_saved = __builtin_return_address(0);
6565
6566 if (so->so_proto == NULL) {
6567 panic("%s: null so_proto so=%p\n", __func__, so);
6568 /* NOTREACHED */
6569 }
6570
6571 if (so && so->so_proto->pr_unlock) {
6572 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6573 } else {
6574 mutex_held = so->so_proto->pr_domain->dom_mtx;
6575 #ifdef MORE_LOCKING_DEBUG
6576 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6577 #endif
6578 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6579 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6580
6581 if (refcount) {
6582 if (so->so_usecount <= 0) {
6583 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6584 "lrh=%s", __func__, so->so_usecount, so,
6585 SOCK_DOM(so), so->so_type,
6586 SOCK_PROTO(so), solockhistory_nr(so));
6587 /* NOTREACHED */
6588 }
6589
6590 so->so_usecount--;
6591 if (so->so_usecount == 0)
6592 sofreelastref(so, 1);
6593 }
6594 lck_mtx_unlock(mutex_held);
6595 }
6596
6597 return (error);
6598 }
6599
6600 /* Called with socket locked, will unlock socket */
6601 void
6602 sofree(struct socket *so)
6603 {
6604 lck_mtx_t *mutex_held;
6605
6606 if (so->so_proto->pr_getlock != NULL)
6607 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6608 else
6609 mutex_held = so->so_proto->pr_domain->dom_mtx;
6610 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6611
6612 sofreelastref(so, 0);
6613 }
6614
/*
 * Take one reference on a socket: lock it with the reference argument
 * set, then unlock it without dropping that reference.
 */
void
soreference(struct socket *so)
{
	(void) socket_lock(so, 1);	/* lock and take the reference */
	(void) socket_unlock(so, 0);	/* unlock, keep the reference */
}
6621
/*
 * Drop one reference on a socket: lock it without taking a reference,
 * then unlock it with the reference argument set.
 */
void
sodereference(struct socket *so)
{
	(void) socket_lock(so, 0);	/* lock only */
	(void) socket_unlock(so, 1);	/* unlock and drop the reference */
}
6628
6629 /*
6630 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6631 * possibility of using jumbo clusters. Caller must ensure to hold
6632 * the socket lock.
6633 */
6634 void
6635 somultipages(struct socket *so, boolean_t set)
6636 {
6637 if (set)
6638 so->so_flags |= SOF_MULTIPAGES;
6639 else
6640 so->so_flags &= ~SOF_MULTIPAGES;
6641 }
6642
6643 void
6644 soif2kcl(struct socket *so, boolean_t set)
6645 {
6646 if (set)
6647 so->so_flags1 |= SOF1_IF_2KCL;
6648 else
6649 so->so_flags1 &= ~SOF1_IF_2KCL;
6650 }
6651
6652 int
6653 so_isdstlocal(struct socket *so) {
6654
6655 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6656
6657 if (SOCK_DOM(so) == PF_INET)
6658 return (inaddr_local(inp->inp_faddr));
6659 else if (SOCK_DOM(so) == PF_INET6)
6660 return (in6addr_local(&inp->in6p_faddr));
6661
6662 return (0);
6663 }
6664
/*
 * Mark a socket as defunct.
 *
 * Sets SOF_DEFUNCT, sets SB_DROP on both socket buffers and releases
 * any data they currently hold.  `p' and `level' are only used for
 * logging and for flagging the process when extended background idle
 * is started.
 *
 * Returns 0 when the socket has been marked (or already was) defunct.
 * Returns EOPNOTSUPP, leaving the socket alone, when `noforce' is set
 * and either the socket is exempt (SOF_NODEFUNCT) or extended
 * background idle time is granted to it.
 *
 * Panics if the socket is already SOF_DEFUNCT but SB_DROP is missing
 * on one of its buffers.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* already defunct: only sanity check and log */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		/* exempt socket: honored only when the caller does not force */
		if (noforce) {
			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "is not eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return (err);
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] defunct by force\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		/*
		 * NOTE(review): so_pcb is used as an inpcb here without a
		 * domain check; presumably SOF1_EXTEND_BK_IDLE_WANTED is
		 * only ever set on Internet sockets -- confirm.
		 */
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/*
		 * Extended background idle is refused (the socket goes
		 * defunct right away, with the matching statistic bumped)
		 * when the last output interface is cellular, the socket
		 * is delegated, the configured idle time is zero, or the
		 * caller forces the defunct.
		 */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			/* start the extended idle period instead of defuncting */
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
			    "level %d) extend bk idle so 0x%llx rcv hw %d "
			    "cc %d\n",
			    __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
			return (err);
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
	    "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
	    proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
	    level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
	    SOCK_TYPE(so), defunct ? "is already" : "marked as",
	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");

	return (err);
}
6765
6766 int
6767 sodefunct(struct proc *p, struct socket *so, int level)
6768 {
6769 struct sockbuf *rcv, *snd;
6770
6771 if (!(so->so_flags & SOF_DEFUNCT)) {
6772 panic("%s improperly called", __func__);
6773 /* NOTREACHED */
6774 }
6775 if (so->so_state & SS_DEFUNCT)
6776 goto done;
6777
6778 rcv = &so->so_rcv;
6779 snd = &so->so_snd;
6780
6781 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6782 char s[MAX_IPv6_STR_LEN];
6783 char d[MAX_IPv6_STR_LEN];
6784 struct inpcb *inp = sotoinpcb(so);
6785
6786 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6787 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6788 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6789 __func__, proc_selfpid(), proc_best_name(current_proc()),
6790 proc_pid(p), proc_best_name(p), level,
6791 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6792 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6793 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6794 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6795 s, sizeof (s)), ntohs(inp->in6p_lport),
6796 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6797 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6798 d, sizeof (d)), ntohs(inp->in6p_fport),
6799 (uint32_t)rcv->sb_sel.si_flags,
6800 (uint32_t)snd->sb_sel.si_flags,
6801 rcv->sb_flags, snd->sb_flags);
6802 } else {
6803 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6804 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6805 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6806 proc_selfpid(), proc_best_name(current_proc()),
6807 proc_pid(p), proc_best_name(p), level,
6808 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6809 SOCK_DOM(so), SOCK_TYPE(so),
6810 (uint32_t)rcv->sb_sel.si_flags,
6811 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6812 snd->sb_flags);
6813 }
6814
6815 /*
6816 * Unwedge threads blocked on sbwait() and sb_lock().
6817 */
6818 sbwakeup(rcv);
6819 sbwakeup(snd);
6820
6821 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6822 if (rcv->sb_flags & SB_LOCK)
6823 sbunlock(rcv, TRUE); /* keep socket locked */
6824 if (snd->sb_flags & SB_LOCK)
6825 sbunlock(snd, TRUE); /* keep socket locked */
6826
6827 /*
6828 * Flush the buffers and disconnect. We explicitly call shutdown
6829 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6830 * states are set for the socket. This would also flush out data
6831 * hanging off the receive list of this socket.
6832 */
6833 (void) soshutdownlock_final(so, SHUT_RD);
6834 (void) soshutdownlock_final(so, SHUT_WR);
6835 (void) sodisconnectlocked(so);
6836
6837 /*
6838 * Explicitly handle connectionless-protocol disconnection
6839 * and release any remaining data in the socket buffers.
6840 */
6841 if (!(so->so_flags & SS_ISDISCONNECTED))
6842 (void) soisdisconnected(so);
6843
6844 if (so->so_error == 0)
6845 so->so_error = EBADF;
6846
6847 if (rcv->sb_cc != 0) {
6848 rcv->sb_flags &= ~SB_SEL;
6849 selthreadclear(&rcv->sb_sel);
6850 sbrelease(rcv);
6851 }
6852 if (snd->sb_cc != 0) {
6853 snd->sb_flags &= ~SB_SEL;
6854 selthreadclear(&snd->sb_sel);
6855 sbrelease(snd);
6856 }
6857 so->so_state |= SS_DEFUNCT;
6858 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6859
6860 done:
6861 return (0);
6862 }
6863
6864 int
6865 soresume(struct proc *p, struct socket *so, int locked)
6866 {
6867 if (locked == 0)
6868 socket_lock(so, 1);
6869
6870 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6871 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6872 "[%d,%d] resumed from bk idle\n",
6873 __func__, proc_selfpid(), proc_best_name(current_proc()),
6874 proc_pid(p), proc_best_name(p),
6875 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6876 SOCK_DOM(so), SOCK_TYPE(so));
6877
6878 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6879 so->so_extended_bk_start = 0;
6880 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6881
6882 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6883 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6884 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6885 }
6886 if (locked == 0)
6887 socket_unlock(so, 1);
6888
6889 return (0);
6890 }
6891
6892 /*
6893 * Does not attempt to account for sockets that are delegated from
6894 * the current process
6895 */
6896 int
6897 so_set_extended_bk_idle(struct socket *so, int optval)
6898 {
6899 int error = 0;
6900
6901 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6902 SOCK_PROTO(so) != IPPROTO_TCP) {
6903 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6904 error = EOPNOTSUPP;
6905 } else if (optval == 0) {
6906 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6907
6908 soresume(current_proc(), so, 1);
6909 } else {
6910 struct proc *p = current_proc();
6911 int i;
6912 struct filedesc *fdp;
6913 int count = 0;
6914
6915 proc_fdlock(p);
6916
6917 fdp = p->p_fd;
6918 for (i = 0; i < fdp->fd_nfiles; i++) {
6919 struct fileproc *fp = fdp->fd_ofiles[i];
6920 struct socket *so2;
6921
6922 if (fp == NULL ||
6923 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6924 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6925 continue;
6926
6927 so2 = (struct socket *)fp->f_fglob->fg_data;
6928 if (so != so2 &&
6929 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6930 count++;
6931 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6932 break;
6933 }
6934 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6935 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6936 error = EBUSY;
6937 } else if (so->so_flags & SOF_DELEGATED) {
6938 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6939 error = EBUSY;
6940 } else {
6941 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6942 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6943 }
6944 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
6945 "%s marked for extended bk idle\n",
6946 __func__, proc_selfpid(), proc_best_name(current_proc()),
6947 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6948 SOCK_DOM(so), SOCK_TYPE(so),
6949 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6950 "is" : "not");
6951
6952 proc_fdunlock(p);
6953 }
6954
6955 return (error);
6956 }
6957
6958 static void
6959 so_stop_extended_bk_idle(struct socket *so)
6960 {
6961 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6962 so->so_extended_bk_start = 0;
6963
6964 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6965 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6966 /*
6967 * Force defunct
6968 */
6969 sosetdefunct(current_proc(), so,
6970 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6971 if (so->so_flags & SOF_DEFUNCT) {
6972 sodefunct(current_proc(), so,
6973 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6974 }
6975 }
6976
6977 void
6978 so_drain_extended_bk_idle(struct socket *so)
6979 {
6980 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6981 /*
6982 * Only penalize sockets that have outstanding data
6983 */
6984 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6985 so_stop_extended_bk_idle(so);
6986
6987 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6988 }
6989 }
6990 }
6991
6992 /*
6993 * Return values tells if socket is still in extended background idle
6994 */
6995 int
6996 so_check_extended_bk_idle_time(struct socket *so)
6997 {
6998 int ret = 1;
6999
7000 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7001 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7002 __func__, proc_selfpid(), proc_best_name(current_proc()),
7003 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7004 SOCK_DOM(so), SOCK_TYPE(so));
7005 if (net_uptime() - so->so_extended_bk_start >
7006 soextbkidlestat.so_xbkidle_time) {
7007 so_stop_extended_bk_idle(so);
7008
7009 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7010
7011 ret = 0;
7012 } else {
7013 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7014
7015 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7016 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7017 }
7018 }
7019
7020 return (ret);
7021 }
7022
7023 void
7024 resume_proc_sockets(proc_t p)
7025 {
7026 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7027 struct filedesc *fdp;
7028 int i;
7029
7030 proc_fdlock(p);
7031 fdp = p->p_fd;
7032 for (i = 0; i < fdp->fd_nfiles; i++) {
7033 struct fileproc *fp;
7034 struct socket *so;
7035
7036 fp = fdp->fd_ofiles[i];
7037 if (fp == NULL ||
7038 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7039 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7040 continue;
7041
7042 so = (struct socket *)fp->f_fglob->fg_data;
7043 (void) soresume(p, so, 0);
7044 }
7045 proc_fdunlock(p);
7046
7047 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7048 }
7049 }
7050
7051 __private_extern__ int
7052 so_set_recv_anyif(struct socket *so, int optval)
7053 {
7054 int ret = 0;
7055
7056 #if INET6
7057 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7058 #else
7059 if (SOCK_DOM(so) == PF_INET) {
7060 #endif /* !INET6 */
7061 if (optval)
7062 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7063 else
7064 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7065 }
7066
7067 return (ret);
7068 }
7069
7070 __private_extern__ int
7071 so_get_recv_anyif(struct socket *so)
7072 {
7073 int ret = 0;
7074
7075 #if INET6
7076 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7077 #else
7078 if (SOCK_DOM(so) == PF_INET) {
7079 #endif /* !INET6 */
7080 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7081 }
7082
7083 return (ret);
7084 }
7085
7086 int
7087 so_set_restrictions(struct socket *so, uint32_t vals)
7088 {
7089 int nocell_old, nocell_new;
7090 int noexpensive_old, noexpensive_new;
7091
7092 /*
7093 * Deny-type restrictions are trapdoors; once set they cannot be
7094 * unset for the lifetime of the socket. This allows them to be
7095 * issued by a framework on behalf of the application without
7096 * having to worry that they can be undone.
7097 *
7098 * Note here that socket-level restrictions overrides any protocol
7099 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
7100 * socket restriction issued on the socket has a higher precendence
7101 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7102 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7103 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7104 */
7105 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7106 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7107 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7108 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7109 SO_RESTRICT_DENY_EXPENSIVE));
7110 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7111 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7112
7113 /* we can only set, not clear restrictions */
7114 if ((nocell_new - nocell_old) == 0 &&
7115 (noexpensive_new - noexpensive_old) == 0)
7116 return (0);
7117 #if INET6
7118 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7119 #else
7120 if (SOCK_DOM(so) == PF_INET) {
7121 #endif /* !INET6 */
7122 if (nocell_new - nocell_old != 0) {
7123 /*
7124 * if deny cellular is now set, do what's needed
7125 * for INPCB
7126 */
7127 inp_set_nocellular(sotoinpcb(so));
7128 }
7129 if (noexpensive_new - noexpensive_old != 0) {
7130 inp_set_noexpensive(sotoinpcb(so));
7131 }
7132 }
7133
7134 return (0);
7135 }
7136
7137 uint32_t
7138 so_get_restrictions(struct socket *so)
7139 {
7140 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7141 SO_RESTRICT_DENY_OUT |
7142 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7143 }
7144
7145 struct sockaddr_entry *
7146 sockaddrentry_alloc(int how)
7147 {
7148 struct sockaddr_entry *se;
7149
7150 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
7151 if (se != NULL)
7152 bzero(se, se_zone_size);
7153
7154 return (se);
7155 }
7156
7157 void
7158 sockaddrentry_free(struct sockaddr_entry *se)
7159 {
7160 if (se->se_addr != NULL) {
7161 FREE(se->se_addr, M_SONAME);
7162 se->se_addr = NULL;
7163 }
7164 zfree(se_zone, se);
7165 }
7166
7167 struct sockaddr_entry *
7168 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
7169 {
7170 struct sockaddr_entry *dst_se;
7171
7172 dst_se = sockaddrentry_alloc(how);
7173 if (dst_se != NULL) {
7174 int len = src_se->se_addr->sa_len;
7175
7176 MALLOC(dst_se->se_addr, struct sockaddr *,
7177 len, M_SONAME, how | M_ZERO);
7178 if (dst_se->se_addr != NULL) {
7179 bcopy(src_se->se_addr, dst_se->se_addr, len);
7180 } else {
7181 sockaddrentry_free(dst_se);
7182 dst_se = NULL;
7183 }
7184 }
7185
7186 return (dst_se);
7187 }
7188
7189 struct sockaddr_list *
7190 sockaddrlist_alloc(int how)
7191 {
7192 struct sockaddr_list *sl;
7193
7194 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
7195 if (sl != NULL) {
7196 bzero(sl, sl_zone_size);
7197 TAILQ_INIT(&sl->sl_head);
7198 }
7199 return (sl);
7200 }
7201
7202 void
7203 sockaddrlist_free(struct sockaddr_list *sl)
7204 {
7205 struct sockaddr_entry *se, *tse;
7206
7207 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
7208 sockaddrlist_remove(sl, se);
7209 sockaddrentry_free(se);
7210 }
7211 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
7212 zfree(sl_zone, sl);
7213 }
7214
7215 void
7216 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
7217 {
7218 VERIFY(!(se->se_flags & SEF_ATTACHED));
7219 se->se_flags |= SEF_ATTACHED;
7220 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
7221 sl->sl_cnt++;
7222 VERIFY(sl->sl_cnt != 0);
7223 }
7224
7225 void
7226 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
7227 {
7228 VERIFY(se->se_flags & SEF_ATTACHED);
7229 se->se_flags &= ~SEF_ATTACHED;
7230 VERIFY(sl->sl_cnt != 0);
7231 sl->sl_cnt--;
7232 TAILQ_REMOVE(&sl->sl_head, se, se_link);
7233 }
7234
7235 struct sockaddr_list *
7236 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
7237 {
7238 struct sockaddr_entry *src_se, *tse;
7239 struct sockaddr_list *dst_sl;
7240
7241 dst_sl = sockaddrlist_alloc(how);
7242 if (dst_sl == NULL)
7243 return (NULL);
7244
7245 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
7246 struct sockaddr_entry *dst_se;
7247
7248 if (src_se->se_addr == NULL)
7249 continue;
7250
7251 dst_se = sockaddrentry_dup(src_se, how);
7252 if (dst_se == NULL) {
7253 sockaddrlist_free(dst_sl);
7254 return (NULL);
7255 }
7256
7257 sockaddrlist_insert(dst_sl, dst_se);
7258 }
7259 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
7260
7261 return (dst_sl);
7262 }
7263
7264 int
7265 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7266 {
7267 struct proc *ep = PROC_NULL;
7268 int error = 0;
7269
7270 /* pid 0 is reserved for kernel */
7271 if (epid == 0) {
7272 error = EINVAL;
7273 goto done;
7274 }
7275
7276 /*
7277 * If this is an in-kernel socket, prevent its delegate
7278 * association from changing unless the socket option is
7279 * coming from within the kernel itself.
7280 */
7281 if (so->last_pid == 0 && p != kernproc) {
7282 error = EACCES;
7283 goto done;
7284 }
7285
7286 /*
7287 * If this is issued by a process that's recorded as the
7288 * real owner of the socket, or if the pid is the same as
7289 * the process's own pid, then proceed. Otherwise ensure
7290 * that the issuing process has the necessary privileges.
7291 */
7292 if (epid != so->last_pid || epid != proc_pid(p)) {
7293 if ((error = priv_check_cred(kauth_cred_get(),
7294 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7295 error = EACCES;
7296 goto done;
7297 }
7298 }
7299
7300 /* Find the process that corresponds to the effective pid */
7301 if ((ep = proc_find(epid)) == PROC_NULL) {
7302 error = ESRCH;
7303 goto done;
7304 }
7305
7306 /*
7307 * If a process tries to delegate the socket to itself, then
7308 * there's really nothing to do; treat it as a way for the
7309 * delegate association to be cleared. Note that we check
7310 * the passed-in proc rather than calling proc_selfpid(),
7311 * as we need to check the process issuing the socket option
7312 * which could be kernproc. Given that we don't allow 0 for
7313 * effective pid, it means that a delegated in-kernel socket
7314 * stays delegated during its lifetime (which is probably OK.)
7315 */
7316 if (epid == proc_pid(p)) {
7317 so->so_flags &= ~SOF_DELEGATED;
7318 so->e_upid = 0;
7319 so->e_pid = 0;
7320 uuid_clear(so->e_uuid);
7321 } else {
7322 so->so_flags |= SOF_DELEGATED;
7323 so->e_upid = proc_uniqueid(ep);
7324 so->e_pid = proc_pid(ep);
7325 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7326 }
7327 done:
7328 if (error == 0 && net_io_policy_log) {
7329 uuid_string_t buf;
7330
7331 uuid_unparse(so->e_uuid, buf);
7332 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7333 "euuid %s%s\n", __func__, proc_name_address(p),
7334 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7335 SOCK_DOM(so), SOCK_TYPE(so),
7336 so->e_pid, proc_name_address(ep), buf,
7337 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7338 } else if (error != 0 && net_io_policy_log) {
7339 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7340 "ERROR (%d)\n", __func__, proc_name_address(p),
7341 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7342 SOCK_DOM(so), SOCK_TYPE(so),
7343 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7344 proc_name_address(ep), error);
7345 }
7346
7347 /* Update this socket's policy upon success */
7348 if (error == 0) {
7349 so->so_policy_gencnt *= -1;
7350 so_update_policy(so);
7351 #if NECP
7352 so_update_necp_policy(so, NULL, NULL);
7353 #endif /* NECP */
7354 }
7355
7356 if (ep != PROC_NULL)
7357 proc_rele(ep);
7358
7359 return (error);
7360 }
7361
7362 int
7363 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7364 {
7365 uuid_string_t buf;
7366 uuid_t uuid;
7367 int error = 0;
7368
7369 /* UUID must not be all-zeroes (reserved for kernel) */
7370 if (uuid_is_null(euuid)) {
7371 error = EINVAL;
7372 goto done;
7373 }
7374
7375 /*
7376 * If this is an in-kernel socket, prevent its delegate
7377 * association from changing unless the socket option is
7378 * coming from within the kernel itself.
7379 */
7380 if (so->last_pid == 0 && p != kernproc) {
7381 error = EACCES;
7382 goto done;
7383 }
7384
7385 /* Get the UUID of the issuing process */
7386 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7387
7388 /*
7389 * If this is issued by a process that's recorded as the
7390 * real owner of the socket, or if the uuid is the same as
7391 * the process's own uuid, then proceed. Otherwise ensure
7392 * that the issuing process has the necessary privileges.
7393 */
7394 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7395 uuid_compare(euuid, uuid) != 0) {
7396 if ((error = priv_check_cred(kauth_cred_get(),
7397 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7398 error = EACCES;
7399 goto done;
7400 }
7401 }
7402
7403 /*
7404 * If a process tries to delegate the socket to itself, then
7405 * there's really nothing to do; treat it as a way for the
7406 * delegate association to be cleared. Note that we check
7407 * the uuid of the passed-in proc rather than that of the
7408 * current process, as we need to check the process issuing
7409 * the socket option which could be kernproc itself. Given
7410 * that we don't allow 0 for effective uuid, it means that
7411 * a delegated in-kernel socket stays delegated during its
7412 * lifetime (which is okay.)
7413 */
7414 if (uuid_compare(euuid, uuid) == 0) {
7415 so->so_flags &= ~SOF_DELEGATED;
7416 so->e_upid = 0;
7417 so->e_pid = 0;
7418 uuid_clear(so->e_uuid);
7419 } else {
7420 so->so_flags |= SOF_DELEGATED;
7421 /*
7422 * Unlike so_set_effective_pid(), we only have the UUID
7423 * here and the process ID is not known. Inherit the
7424 * real {pid,upid} of the socket.
7425 */
7426 so->e_upid = so->last_upid;
7427 so->e_pid = so->last_pid;
7428 uuid_copy(so->e_uuid, euuid);
7429 }
7430
7431 done:
7432 if (error == 0 && net_io_policy_log) {
7433 uuid_unparse(so->e_uuid, buf);
7434 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7435 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7436 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7437 SOCK_TYPE(so), so->e_pid, buf,
7438 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7439 } else if (error != 0 && net_io_policy_log) {
7440 uuid_unparse(euuid, buf);
7441 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7442 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7443 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7444 SOCK_TYPE(so), buf, error);
7445 }
7446
7447 /* Update this socket's policy upon success */
7448 if (error == 0) {
7449 so->so_policy_gencnt *= -1;
7450 so_update_policy(so);
7451 #if NECP
7452 so_update_necp_policy(so, NULL, NULL);
7453 #endif /* NECP */
7454 }
7455
7456 return (error);
7457 }
7458
7459 void
7460 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7461 uint32_t ev_datalen)
7462 {
7463 struct kev_msg ev_msg;
7464
7465 /*
7466 * A netpolicy event always starts with a netpolicy_event_data
7467 * structure, but the caller can provide for a longer event
7468 * structure to post, depending on the event code.
7469 */
7470 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7471
7472 bzero(&ev_msg, sizeof (ev_msg));
7473 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7474 ev_msg.kev_class = KEV_NETWORK_CLASS;
7475 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7476 ev_msg.event_code = ev_code;
7477
7478 ev_msg.dv[0].data_ptr = ev_data;
7479 ev_msg.dv[0].data_length = ev_datalen;
7480
7481 kev_post_msg(&ev_msg);
7482 }
7483
7484 void
7485 socket_post_kev_msg(uint32_t ev_code,
7486 struct kev_socket_event_data *ev_data,
7487 uint32_t ev_datalen)
7488 {
7489 struct kev_msg ev_msg;
7490
7491 bzero(&ev_msg, sizeof(ev_msg));
7492 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7493 ev_msg.kev_class = KEV_NETWORK_CLASS;
7494 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7495 ev_msg.event_code = ev_code;
7496
7497 ev_msg.dv[0].data_ptr = ev_data;
7498 ev_msg.dv[0]. data_length = ev_datalen;
7499
7500 kev_post_msg(&ev_msg);
7501 }
7502
7503 void
7504 socket_post_kev_msg_closed(struct socket *so)
7505 {
7506 struct kev_socket_closed ev;
7507 struct sockaddr *socksa = NULL, *peersa = NULL;
7508 int err;
7509 bzero(&ev, sizeof(ev));
7510 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7511 if (err == 0) {
7512 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7513 &peersa);
7514 if (err == 0) {
7515 memcpy(&ev.ev_data.kev_sockname, socksa,
7516 min(socksa->sa_len,
7517 sizeof (ev.ev_data.kev_sockname)));
7518 memcpy(&ev.ev_data.kev_peername, peersa,
7519 min(peersa->sa_len,
7520 sizeof (ev.ev_data.kev_peername)));
7521 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7522 &ev.ev_data, sizeof (ev));
7523 }
7524 }
7525 if (socksa != NULL)
7526 FREE(socksa, M_SONAME);
7527 if (peersa != NULL)
7528 FREE(peersa, M_SONAME);
7529 }