1 /*
2 * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/in_tclass.h>
106 #include <netinet/tcp_var.h>
107 #include <netinet/ip6.h>
108 #include <netinet6/ip6_var.h>
109 #include <netinet/flow_divert.h>
110 #include <kern/zalloc.h>
111 #include <kern/locks.h>
112 #include <machine/limits.h>
113 #include <libkern/OSAtomic.h>
114 #include <pexpert/pexpert.h>
115 #include <kern/assert.h>
116 #include <kern/task.h>
117 #include <kern/policy_internal.h>
118
119 #include <sys/kpi_mbuf.h>
120 #include <sys/mcache.h>
121 #include <sys/unpcb.h>
122
123 #if CONFIG_MACF
124 #include <security/mac.h>
125 #include <security/mac_framework.h>
126 #endif /* MAC */
127
128 #if MULTIPATH
129 #include <netinet/mp_pcb.h>
130 #include <netinet/mptcp_var.h>
131 #endif /* MULTIPATH */
132
133 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
134
135 #if DEBUG || DEVELOPMENT
136 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
137 #else
138 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
139 #endif
140
141 /* TODO: this should be in a header file somewhere */
142 extern char *proc_name_address(void *p);
143 extern char *proc_best_name(proc_t);
144
145 static u_int32_t so_cache_hw; /* High water mark for socache */
146 static u_int32_t so_cache_timeouts; /* number of timeouts */
147 static u_int32_t so_cache_max_freed; /* max freed per timeout */
148 static u_int32_t cached_sock_count = 0;
149 STAILQ_HEAD(, socket) so_cache_head;
150 int max_cached_sock_count = MAX_CACHED_SOCKETS;
151 static u_int32_t so_cache_time;
152 static int socketinit_done;
153 static struct zone *so_cache_zone;
154
155 static lck_grp_t *so_cache_mtx_grp;
156 static lck_attr_t *so_cache_mtx_attr;
157 static lck_grp_attr_t *so_cache_mtx_grp_attr;
158 static lck_mtx_t *so_cache_mtx;
159
160 #include <machine/limits.h>
161
162 static int filt_sorattach(struct knote *kn);
163 static void filt_sordetach(struct knote *kn);
164 static int filt_soread(struct knote *kn, long hint);
165 static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
166 static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
167
168 static int filt_sowattach(struct knote *kn);
169 static void filt_sowdetach(struct knote *kn);
170 static int filt_sowrite(struct knote *kn, long hint);
171 static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
172 static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
173
174 static int filt_sockattach(struct knote *kn);
175 static void filt_sockdetach(struct knote *kn);
176 static int filt_sockev(struct knote *kn, long hint);
177 static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
178 static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
179
180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183 struct filterops soread_filtops = {
184 .f_isfd = 1,
185 .f_attach = filt_sorattach,
186 .f_detach = filt_sordetach,
187 .f_event = filt_soread,
188 .f_touch = filt_sortouch,
189 .f_process = filt_sorprocess,
190 };
191
192 struct filterops sowrite_filtops = {
193 .f_isfd = 1,
194 .f_attach = filt_sowattach,
195 .f_detach = filt_sowdetach,
196 .f_event = filt_sowrite,
197 .f_touch = filt_sowtouch,
198 .f_process = filt_sowprocess,
199 };
200
201 struct filterops sock_filtops = {
202 .f_isfd = 1,
203 .f_attach = filt_sockattach,
204 .f_detach = filt_sockdetach,
205 .f_event = filt_sockev,
206 .f_touch = filt_socktouch,
207 .f_process = filt_sockprocess,
208 };
209
210 struct filterops soexcept_filtops = {
211 .f_isfd = 1,
212 .f_attach = filt_sorattach,
213 .f_detach = filt_sordetach,
214 .f_event = filt_soread,
215 .f_touch = filt_sortouch,
216 .f_process = filt_sorprocess,
217 };
218
219 SYSCTL_DECL(_kern_ipc);
220
221 #define EVEN_MORE_LOCKING_DEBUG 0
222
223 int socket_debug = 0;
224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
226
227 static unsigned long sodefunct_calls = 0;
228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229 &sodefunct_calls, "");
230
231 static int socket_zone = M_SOCKET;
232 so_gen_t so_gencnt; /* generation count for sockets */
233
234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger than 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface.
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, int);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum number of extended background idle sockets per process.
345 * Set to zero to disable further setting of the option.
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum number of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371 /*
372 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
373 * setting the DSCP code on the packet based on the service class; see
374 * <rdar://problem/11277343> for details.
375 */
376 __private_extern__ u_int32_t sotcdb = 0;
377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
378 &sotcdb, 0, "");
379
380 void
381 socketinit(void)
382 {
383 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
384 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
385
386 #ifdef __LP64__
387 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
388 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
391 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
393 #else
394 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
395 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
398 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
400 #endif
401
402 if (socketinit_done) {
403 printf("socketinit: already called...\n");
404 return;
405 }
406 socketinit_done = 1;
407
408 PE_parse_boot_argn("socket_debug", &socket_debug,
409 sizeof (socket_debug));
410
411 /*
412 * allocate lock group attribute and group for socket cache mutex
413 */
414 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
415 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
416 so_cache_mtx_grp_attr);
417
418 /*
419 * allocate the lock attribute for socket cache mutex
420 */
421 so_cache_mtx_attr = lck_attr_alloc_init();
422
423 /* cached sockets mutex */
424 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
425 if (so_cache_mtx == NULL) {
426 panic("%s: unable to allocate so_cache_mtx\n", __func__);
427 /* NOTREACHED */
428 }
429 STAILQ_INIT(&so_cache_head);
430
431 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
432 + get_inpcb_str_size() + 4 + get_tcp_str_size());
433
434 so_cache_zone = zinit(so_cache_zone_element_size,
435 (120000 * so_cache_zone_element_size), 8192, "socache zone");
436 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
437 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
438
439 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
440 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
441 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
442 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
443
444 in_pcbinit();
445 sflt_init();
446 socket_tclass_init();
447 #if MULTIPATH
448 mp_pcbinit();
449 #endif /* MULTIPATH */
450 }
451
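/*
 * cached_sock_alloc: hand out a socket from the cache if one is
 * available (reusing its saved PCB block), otherwise carve a new
 * socket plus inpcb/tcpcb storage out of a single so_cache_zone
 * element and record the saved-PCB offsets for later reuse.
 */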
452 static void
453 cached_sock_alloc(struct socket **so, int waitok)
454 {
455 caddr_t temp;
456 uintptr_t offset;
457
458 lck_mtx_lock(so_cache_mtx);
459
460 if (!STAILQ_EMPTY(&so_cache_head)) {
461 VERIFY(cached_sock_count > 0);
462
463 *so = STAILQ_FIRST(&so_cache_head);
464 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
465 STAILQ_NEXT((*so), so_cache_ent) = NULL;
466
467 cached_sock_count--;
468 lck_mtx_unlock(so_cache_mtx);
469
470 temp = (*so)->so_saved_pcb;
471 bzero((caddr_t)*so, sizeof (struct socket));
472
473 (*so)->so_saved_pcb = temp;
474 } else {
475
476 lck_mtx_unlock(so_cache_mtx);
477
478 if (waitok)
479 *so = (struct socket *)zalloc(so_cache_zone);
480 else
481 *so = (struct socket *)zalloc_noblock(so_cache_zone);
482
483 if (*so == NULL)
484 return;
485
486 bzero((caddr_t)*so, sizeof (struct socket));
487
488 /*
489 * Define offsets for extra structures into our
490 * single block of memory. Align extra structures
491 * on longword boundaries.
492 */
493
494 offset = (uintptr_t)*so;
495 offset += sizeof (struct socket);
496
497 offset = ALIGN(offset);
498
499 (*so)->so_saved_pcb = (caddr_t)offset;
500 offset += get_inpcb_str_size();
501
502 offset = ALIGN(offset);
503
504 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
505 (caddr_t)offset;
506 }
507
508 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
509 }
510
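/*
 * cached_sock_free: return a socket allocated by cached_sock_alloc()
 * to the cache list, or free it back to the zone right away when the
 * cache already holds max_cached_sock_count entries. The timestamp
 * recorded here is what so_cache_timer() uses to age out entries.
 */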
511 static void
512 cached_sock_free(struct socket *so)
513 {
514
515 lck_mtx_lock(so_cache_mtx);
516
517 so_cache_time = net_uptime();
518 if (++cached_sock_count > max_cached_sock_count) {
519 --cached_sock_count;
520 lck_mtx_unlock(so_cache_mtx);
521 zfree(so_cache_zone, so);
522 } else {
523 if (so_cache_hw < cached_sock_count)
524 so_cache_hw = cached_sock_count;
525
526 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
527
528 so->cache_timestamp = so_cache_time;
529 lck_mtx_unlock(so_cache_mtx);
530 }
531 }
532
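/*
 * so_update_last_owner_locked: record the pid, unique pid and
 * executable UUID of the most recent process to use this socket,
 * for accounting and diagnostics. In-kernel sockets created via
 * sock_socket() keep last_pid == 0 and are left untouched.
 */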
533 void
534 so_update_last_owner_locked(struct socket *so, proc_t self)
535 {
536 if (so->last_pid != 0) {
537 /*
538 * last_pid and last_upid should remain zero for sockets
539 * created using sock_socket. The check above achieves that
540 */
541 if (self == PROC_NULL)
542 self = current_proc();
543
544 if (so->last_upid != proc_uniqueid(self) ||
545 so->last_pid != proc_pid(self)) {
546 so->last_upid = proc_uniqueid(self);
547 so->last_pid = proc_pid(self);
548 proc_getexecutableuuid(self, so->last_uuid,
549 sizeof (so->last_uuid));
550 }
551 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
552 }
553 }
554
555 void
556 so_update_policy(struct socket *so)
557 {
558 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
559 (void) inp_update_policy(sotoinpcb(so));
560 }
561
562 #if NECP
563 static void
564 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
565 struct sockaddr *override_remote_addr)
566 {
567 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
568 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
569 override_remote_addr, 0);
570 }
571 #endif /* NECP */
572
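/*
 * so_cache_timer: periodically free cached sockets that have been
 * sitting in the cache longer than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation. Returns TRUE when entries
 * remain so that the caller reschedules the timer.
 */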
573 boolean_t
574 so_cache_timer(void)
575 {
576 struct socket *p;
577 int n_freed = 0;
578 boolean_t rc = FALSE;
579
580 lck_mtx_lock(so_cache_mtx);
581 so_cache_timeouts++;
582 so_cache_time = net_uptime();
583
584 while (!STAILQ_EMPTY(&so_cache_head)) {
585 VERIFY(cached_sock_count > 0);
586 p = STAILQ_FIRST(&so_cache_head);
587 if ((so_cache_time - p->cache_timestamp) <
588 SO_CACHE_TIME_LIMIT)
589 break;
590
591 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
592 --cached_sock_count;
593
594 zfree(so_cache_zone, p);
595
596 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
597 so_cache_max_freed++;
598 break;
599 }
600 }
601
602 /* Schedule again if there is more to clean up */
603 if (!STAILQ_EMPTY(&so_cache_head))
604 rc = TRUE;
605
606 lck_mtx_unlock(so_cache_mtx);
607 return (rc);
608 }
609
610 /*
611 * Get a socket structure from our zone, and initialize it.
612 * We don't implement `waitok' yet (see comments in uipc_domain.c).
613 * Note that it would probably be better to allocate socket
614 * and PCB at the same time, but I'm not convinced that all
615 * the protocols can be easily modified to do this.
616 */
617 struct socket *
618 soalloc(int waitok, int dom, int type)
619 {
620 struct socket *so;
621
622 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
623 cached_sock_alloc(&so, waitok);
624 } else {
625 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
626 M_WAITOK);
627 if (so != NULL)
628 bzero(so, sizeof (*so));
629 }
630 if (so != NULL) {
631 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
632 so->so_zone = socket_zone;
633 #if CONFIG_MACF_SOCKET
634 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
635 if (mac_socket_label_init(so, !waitok) != 0) {
636 sodealloc(so);
637 return (NULL);
638 }
639 #endif /* MAC_SOCKET */
640 }
641
642 return (so);
643 }
644
645 int
646 socreate_internal(int dom, struct socket **aso, int type, int proto,
647 struct proc *p, uint32_t flags, struct proc *ep)
648 {
649 struct protosw *prp;
650 struct socket *so;
651 int error = 0;
652
653 #if TCPDEBUG
654 extern int tcpconsdebug;
655 #endif
656
657 VERIFY(aso != NULL);
658 *aso = NULL;
659
660 if (proto != 0)
661 prp = pffindproto(dom, proto, type);
662 else
663 prp = pffindtype(dom, type);
664
665 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
666 if (pffinddomain(dom) == NULL)
667 return (EAFNOSUPPORT);
668 if (proto != 0) {
669 if (pffindprotonotype(dom, proto) != NULL)
670 return (EPROTOTYPE);
671 }
672 return (EPROTONOSUPPORT);
673 }
674 if (prp->pr_type != type)
675 return (EPROTOTYPE);
676 so = soalloc(1, dom, type);
677 if (so == NULL)
678 return (ENOBUFS);
679
680 if (flags & SOCF_ASYNC)
681 so->so_state |= SS_NBIO;
682 #if MULTIPATH
683 if (flags & SOCF_MP_SUBFLOW) {
684 /*
685 * A multipath subflow socket is used internally in the kernel,
686 * therefore it does not have a file descriptor associated by
687 * default.
688 */
689 so->so_state |= SS_NOFDREF;
690 so->so_flags |= SOF_MP_SUBFLOW;
691 }
692 #endif /* MULTIPATH */
693
694 TAILQ_INIT(&so->so_incomp);
695 TAILQ_INIT(&so->so_comp);
696 so->so_type = type;
697 so->last_upid = proc_uniqueid(p);
698 so->last_pid = proc_pid(p);
699 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
700 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
701
702 if (ep != PROC_NULL && ep != p) {
703 so->e_upid = proc_uniqueid(ep);
704 so->e_pid = proc_pid(ep);
705 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
706 so->so_flags |= SOF_DELEGATED;
707 }
708
709 so->so_cred = kauth_cred_proc_ref(p);
710 if (!suser(kauth_cred_get(), NULL))
711 so->so_state |= SS_PRIV;
712
713 so->so_proto = prp;
714 so->so_rcv.sb_flags |= SB_RECV;
715 so->so_rcv.sb_so = so->so_snd.sb_so = so;
716 so->next_lock_lr = 0;
717 so->next_unlock_lr = 0;
718
719 #if CONFIG_MACF_SOCKET
720 mac_socket_label_associate(kauth_cred_get(), so);
721 #endif /* MAC_SOCKET */
722
723 /*
724 * Attachment will create the per-pcb lock if necessary and
725 * increase the refcount for creation; make sure this is done
726 * before the socket is inserted in the lists.
727 */
728 so->so_usecount++;
729
730 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
731 if (error != 0) {
732 /*
733 * Warning:
734 * If so_pcb is not zero, the socket will be leaked,
735 * so the protocol attachment handler must be coded carefully
736 */
737 so->so_state |= SS_NOFDREF;
738 VERIFY(so->so_usecount > 0);
739 so->so_usecount--;
740 sofreelastref(so, 1); /* will deallocate the socket */
741 return (error);
742 }
743
744 atomic_add_32(&prp->pr_domain->dom_refs, 1);
745 TAILQ_INIT(&so->so_evlist);
746
747 /* Attach socket filters for this protocol */
748 sflt_initsock(so);
749 #if TCPDEBUG
750 if (tcpconsdebug == 2)
751 so->so_options |= SO_DEBUG;
752 #endif
753 so_set_default_traffic_class(so);
754
755 /*
756 * If this thread or task is marked to create backgrounded sockets,
757 * mark the socket as background.
758 */
759 if (proc_get_effective_thread_policy(current_thread(),
760 TASK_POLICY_NEW_SOCKETS_BG)) {
761 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
762 so->so_background_thread = current_thread();
763 }
764
765 switch (dom) {
766 /*
767 * Don't mark Unix domain, system or multipath sockets as
768 * eligible for defunct by default.
769 */
770 case PF_LOCAL:
771 case PF_SYSTEM:
772 case PF_MULTIPATH:
773 so->so_flags |= SOF_NODEFUNCT;
774 break;
775 default:
776 break;
777 }
778
779 /*
780 * Entitlements can't be checked at socket creation time except if the
781 * application requested a feature guarded by a privilege (c.f., socket
782 * delegation).
783 * The priv(9) and the Sandboxing APIs are designed with the idea that
784 * a privilege check should only be triggered by a userland request.
785 * A privilege check at socket creation time is time consuming and
786 * could trigger many authorisation error messages from the security
787 * APIs.
788 */
789
790 *aso = so;
791
792 return (0);
793 }
794
795 /*
796 * Returns: 0 Success
797 * EAFNOSUPPORT
798 * EPROTOTYPE
799 * EPROTONOSUPPORT
800 * ENOBUFS
801 * <pru_attach>:ENOBUFS[AF_UNIX]
802 * <pru_attach>:ENOBUFS[TCP]
803 * <pru_attach>:ENOMEM[TCP]
804 * <pru_attach>:??? [other protocol families, IPSEC]
805 */
806 int
807 socreate(int dom, struct socket **aso, int type, int proto)
808 {
809 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
810 PROC_NULL));
811 }
812
813 int
814 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
815 {
816 int error = 0;
817 struct proc *ep = PROC_NULL;
818
819 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
820 error = ESRCH;
821 goto done;
822 }
823
824 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
825
826 /*
827 * It might not be wise to hold the proc reference when calling
828 * socreate_internal since it calls soalloc with M_WAITOK
829 */
830 done:
831 if (ep != PROC_NULL)
832 proc_rele(ep);
833
834 return (error);
835 }
836
837 /*
838 * Returns: 0 Success
839 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
840 * <pru_bind>:EAFNOSUPPORT Address family not supported
841 * <pru_bind>:EADDRNOTAVAIL Address not available.
842 * <pru_bind>:EINVAL Invalid argument
843 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
844 * <pru_bind>:EACCES Permission denied
845 * <pru_bind>:EADDRINUSE Address in use
846 * <pru_bind>:EAGAIN Resource unavailable, try again
847 * <pru_bind>:EPERM Operation not permitted
848 * <pru_bind>:???
849 * <sf_bind>:???
850 *
851 * Notes: It's not possible to fully enumerate the return codes above,
852 * since socket filter authors and protocol family authors may
853 * not choose to limit their error returns to those listed, even
854 * though this may result in some software operating incorrectly.
855 *
856 * The error codes which are enumerated above are those known to
857 * be returned by the tcp_usr_bind function supplied.
858 */
859 int
860 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
861 {
862 struct proc *p = current_proc();
863 int error = 0;
864
865 if (dolock)
866 socket_lock(so, 1);
867 VERIFY(so->so_usecount > 1);
868
869 so_update_last_owner_locked(so, p);
870 so_update_policy(so);
871
872 #if NECP
873 so_update_necp_policy(so, nam, NULL);
874 #endif /* NECP */
875
876 /*
877 * If this is a bind request on a socket that has been marked
878 * as inactive, reject it now before we go any further.
879 */
880 if (so->so_flags & SOF_DEFUNCT) {
881 error = EINVAL;
882 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
883 __func__, proc_pid(p), proc_best_name(p),
884 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
885 SOCK_DOM(so), SOCK_TYPE(so), error);
886 goto out;
887 }
888
889 /* Socket filter */
890 error = sflt_bind(so, nam);
891
892 if (error == 0)
893 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
894 out:
895 if (dolock)
896 socket_unlock(so, 1);
897
898 if (error == EJUSTRETURN)
899 error = 0;
900
901 return (error);
902 }
903
904 void
905 sodealloc(struct socket *so)
906 {
907 kauth_cred_unref(&so->so_cred);
908
909 /* Remove any filters */
910 sflt_termsock(so);
911
912 #if CONTENT_FILTER
913 cfil_sock_detach(so);
914 #endif /* CONTENT_FILTER */
915
916 /* Delete the state allocated for msg queues on a socket */
917 if (so->so_flags & SOF_ENABLE_MSGS) {
918 FREE(so->so_msg_state, M_TEMP);
919 so->so_msg_state = NULL;
920 }
921 VERIFY(so->so_msg_state == NULL);
922
923 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
924
925 #if CONFIG_MACF_SOCKET
926 mac_socket_label_destroy(so);
927 #endif /* MAC_SOCKET */
928
929 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
930 cached_sock_free(so);
931 } else {
932 FREE_ZONE(so, sizeof (*so), so->so_zone);
933 }
934 }
935
936 /*
937 * Returns: 0 Success
938 * EINVAL
939 * EOPNOTSUPP
940 * <pru_listen>:EINVAL[AF_UNIX]
941 * <pru_listen>:EINVAL[TCP]
942 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
943 * <pru_listen>:EINVAL[TCP] Invalid argument
944 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
945 * <pru_listen>:EACCES[TCP] Permission denied
946 * <pru_listen>:EADDRINUSE[TCP] Address in use
947 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
948 * <pru_listen>:EPERM[TCP] Operation not permitted
949 * <sf_listen>:???
950 *
951 * Notes: Other <pru_listen> returns depend on the protocol family; all
952 * <sf_listen> returns depend on what the filter author causes
953 * their filter to return.
954 */
955 int
956 solisten(struct socket *so, int backlog)
957 {
958 struct proc *p = current_proc();
959 int error = 0;
960
961 socket_lock(so, 1);
962
963 so_update_last_owner_locked(so, p);
964 so_update_policy(so);
965
966 #if NECP
967 so_update_necp_policy(so, NULL, NULL);
968 #endif /* NECP */
969
970 if (so->so_proto == NULL) {
971 error = EINVAL;
972 goto out;
973 }
974 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
975 error = EOPNOTSUPP;
976 goto out;
977 }
978
979 /*
980 * If the listen request is made on a socket that is not fully
981 * disconnected, or on a socket that has been marked as inactive,
982 * reject the request now.
983 */
984 if ((so->so_state &
985 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
986 (so->so_flags & SOF_DEFUNCT)) {
987 error = EINVAL;
988 if (so->so_flags & SOF_DEFUNCT) {
989 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
990 "(%d)\n", __func__, proc_pid(p),
991 proc_best_name(p),
992 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
993 SOCK_DOM(so), SOCK_TYPE(so), error);
994 }
995 goto out;
996 }
997
998 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
999 error = EPERM;
1000 goto out;
1001 }
1002
1003 error = sflt_listen(so);
1004 if (error == 0)
1005 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1006
1007 if (error) {
1008 if (error == EJUSTRETURN)
1009 error = 0;
1010 goto out;
1011 }
1012
1013 if (TAILQ_EMPTY(&so->so_comp))
1014 so->so_options |= SO_ACCEPTCONN;
1015 /*
1016 * POSIX: The implementation may have an upper limit on the length of
1017 * the listen queue, either global or per accepting socket. If backlog
1018 * exceeds this limit, the length of the listen queue is set to the
1019 * limit.
1020 *
1021 * If listen() is called with a backlog argument value that is less
1022 * than 0, the function behaves as if it had been called with a backlog
1023 * argument value of 0.
1024 *
1025 * A backlog argument of 0 may allow the socket to accept connections,
1026 * in which case the length of the listen queue may be set to an
1027 * implementation-defined minimum value.
1028 */
1029 if (backlog <= 0 || backlog > somaxconn)
1030 backlog = somaxconn;
1031
1032 so->so_qlimit = backlog;
1033 out:
1034 socket_unlock(so, 1);
1035 return (error);
1036 }
1037
1038 /*
1039 * The "accept list lock" protects the fields related to the listener queues
1040 * because we can unlock a socket to respect the lock ordering between
1041 * the listener socket and its client sockets. The lock ordering requires
1042 * acquiring the client socket before the listener socket.
1043 *
1044 * The accept list lock serializes access to the following fields:
1045 * - of the listener socket:
1046 * - so_comp
1047 * - so_incomp
1048 * - so_qlen
1049 * - so_inqlen
1050 * - of client sockets that are in so_comp or so_incomp:
1051 * - so_head
1052 * - so_list
1053 *
1054 * As one can see, the accept list lock protects the consistency of the
1055 * linkage of the client sockets.
1056 *
1057 * Note that those fields may be read without holding the accept list lock
1058 * for a preflight provided the accept list lock is taken when committing
1059 * to take an action based on the result of the preflight. The preflight
1060 * saves the cost of doing the unlock/lock dance.
1061 */
1062 void
1063 so_acquire_accept_list(struct socket *head, struct socket *so)
1064 {
1065 lck_mtx_t *mutex_held;
1066
1067 if (head->so_proto->pr_getlock == NULL) {
1068 return;
1069 }
1070 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1071 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1072
1073 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1074 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1075 return;
1076 }
1077 if (so != NULL) {
1078 socket_unlock(so, 0);
1079 }
1080 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1081 so_accept_list_waits += 1;
1082 msleep((caddr_t)&head->so_incomp, mutex_held,
1083 PSOCK | PCATCH, __func__, NULL);
1084 }
1085 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1086 if (so != NULL) {
1087 socket_unlock(head, 0);
1088 socket_lock(so, 0);
1089 socket_lock(head, 0);
1090 }
1091 }
1092
1093 void
1094 so_release_accept_list(struct socket *head)
1095 {
1096 if (head->so_proto->pr_getlock != NULL) {
1097 lck_mtx_t *mutex_held;
1098
1099 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1100 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1101
1102 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1103 wakeup((caddr_t)&head->so_incomp);
1104 }
1105 }
1106
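/*
 * sofreelastref: drop the last reference on a socket. A socket still
 * holding its pcb or file reference only has its select threads and
 * upcalls cleared; otherwise it is unlinked from its listener's
 * incomplete queue (sockets on the completed queue are left alone so
 * accept(2) can still return them), its buffers are flushed, and it
 * is deallocated when 'dealloc' is set.
 */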
1107 void
1108 sofreelastref(struct socket *so, int dealloc)
1109 {
1110 struct socket *head = so->so_head;
1111
1112 /* Assume socket is locked */
1113
1114 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1115 selthreadclear(&so->so_snd.sb_sel);
1116 selthreadclear(&so->so_rcv.sb_sel);
1117 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1118 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1119 so->so_event = sonullevent;
1120 return;
1121 }
1122 if (head != NULL) {
1123 /*
1124 * Need to lock the listener when the protocol has
1125 * per socket locks
1126 */
1127 if (head->so_proto->pr_getlock != NULL) {
1128 socket_lock(head, 1);
1129 so_acquire_accept_list(head, so);
1130 }
1131 if (so->so_state & SS_INCOMP) {
1132 so->so_state &= ~SS_INCOMP;
1133 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1134 head->so_incqlen--;
1135 head->so_qlen--;
1136 so->so_head = NULL;
1137
1138 if (head->so_proto->pr_getlock != NULL) {
1139 so_release_accept_list(head);
1140 socket_unlock(head, 1);
1141 }
1142 } else if (so->so_state & SS_COMP) {
1143 if (head->so_proto->pr_getlock != NULL) {
1144 so_release_accept_list(head);
1145 socket_unlock(head, 1);
1146 }
1147 /*
1148 * We must not decommission a socket that's
1149 * on the accept(2) queue. If we do, then
1150 * accept(2) may hang after select(2) indicated
1151 * that the listening socket was ready.
1152 */
1153 selthreadclear(&so->so_snd.sb_sel);
1154 selthreadclear(&so->so_rcv.sb_sel);
1155 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1156 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1157 so->so_event = sonullevent;
1158 return;
1159 } else {
1160 if (head->so_proto->pr_getlock != NULL) {
1161 so_release_accept_list(head);
1162 socket_unlock(head, 1);
1163 }
1164 printf("sofree: not queued\n");
1165 }
1166 }
1167 sowflush(so);
1168 sorflush(so);
1169
1170 #if FLOW_DIVERT
1171 if (so->so_flags & SOF_FLOW_DIVERT) {
1172 flow_divert_detach(so);
1173 }
1174 #endif /* FLOW_DIVERT */
1175
1176 /* 3932268: disable upcall */
1177 so->so_rcv.sb_flags &= ~SB_UPCALL;
1178 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1179 so->so_event = sonullevent;
1180
1181 if (dealloc)
1182 sodealloc(so);
1183 }
1184
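/*
 * soclose_wait_locked: when SOF_UPCALLCLOSEWAIT is set and an upcall
 * is still outstanding, disable further upcalls and sleep until the
 * last upcall reference is dropped before the close proceeds.
 */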
1185 void
1186 soclose_wait_locked(struct socket *so)
1187 {
1188 lck_mtx_t *mutex_held;
1189
1190 if (so->so_proto->pr_getlock != NULL)
1191 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1192 else
1193 mutex_held = so->so_proto->pr_domain->dom_mtx;
1194 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1195
1196 /*
1197 * Double check here and return if there's no outstanding upcall;
1198 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1199 */
1200 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1201 return;
1202 so->so_rcv.sb_flags &= ~SB_UPCALL;
1203 so->so_snd.sb_flags &= ~SB_UPCALL;
1204 so->so_flags |= SOF_CLOSEWAIT;
1205 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1206 "soclose_wait_locked", NULL);
1207 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1208 so->so_flags &= ~SOF_CLOSEWAIT;
1209 }
1210
1211 /*
1212 * Close a socket on last file table reference removal.
1213 * Initiate disconnect if connected.
1214 * Free socket when disconnect complete.
1215 */
1216 int
1217 soclose_locked(struct socket *so)
1218 {
1219 int error = 0;
1220 struct timespec ts;
1221
1222 if (so->so_usecount == 0) {
1223 panic("soclose: so=%p refcount=0\n", so);
1224 /* NOTREACHED */
1225 }
1226
1227 sflt_notify(so, sock_evt_closing, NULL);
1228
1229 if (so->so_upcallusecount)
1230 soclose_wait_locked(so);
1231
1232 #if CONTENT_FILTER
1233 /*
1234 * We have to wait until the content filters are done
1235 */
1236 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1237 cfil_sock_close_wait(so);
1238 cfil_sock_is_closed(so);
1239 cfil_sock_detach(so);
1240 }
1241 #endif /* CONTENT_FILTER */
1242
1243 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1244 soresume(current_proc(), so, 1);
1245 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1246 }
1247
1248 if ((so->so_options & SO_ACCEPTCONN)) {
1249 struct socket *sp, *sonext;
1250 int persocklock = 0;
1251 int incomp_overflow_only;
1252
1253 /*
1254 * We do not want new connections to be added
1255 * to the connection queues
1256 */
1257 so->so_options &= ~SO_ACCEPTCONN;
1258
1259 /*
1260 * We can drop the lock on the listener once
1261 * we've acquired the incoming list
1262 */
1263 if (so->so_proto->pr_getlock != NULL) {
1264 persocklock = 1;
1265 so_acquire_accept_list(so, NULL);
1266 socket_unlock(so, 0);
1267 }
1268 again:
1269 incomp_overflow_only = 1;
1270
1271 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1272 /*
1273 * Radar 5350314
1274 * Skip sockets thrown away by tcp_dropdropablreq();
1275 * they will get cleaned up by the garbage collection.
1276 * Otherwise, remove the incomp socket from the queue
1277 * and let soabort() trigger the appropriate cleanup.
1278 */
1279 if (sp->so_flags & SOF_OVERFLOW)
1280 continue;
1281
1282 if (persocklock != 0)
1283 socket_lock(sp, 1);
1284
1285 /*
1286 * Radar 27945981
1287 * The extra reference for the list ensures the
1288 * validity of the socket pointer when we perform the
1289 * unlock of the head above.
1290 */
1291 if (sp->so_state & SS_INCOMP) {
1292 sp->so_state &= ~SS_INCOMP;
1293 sp->so_head = NULL;
1294 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1295 so->so_incqlen--;
1296 so->so_qlen--;
1297
1298 (void) soabort(sp);
1299 } else {
1300 panic("%s sp %p in so_incomp but !SS_INCOMP",
1301 __func__, sp);
1302 }
1303
1304 if (persocklock != 0)
1305 socket_unlock(sp, 1);
1306 }
1307
1308 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1309 /* Dequeue from so_comp since sofree() won't do it */
1310 if (persocklock != 0)
1311 socket_lock(sp, 1);
1312
1313 if (sp->so_state & SS_COMP) {
1314 sp->so_state &= ~SS_COMP;
1315 sp->so_head = NULL;
1316 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1317 so->so_qlen--;
1318
1319 (void) soabort(sp);
1320 } else {
1321 panic("%s sp %p in so_comp but !SS_COMP",
1322 __func__, sp);
1323 }
1324
1325 if (persocklock)
1326 socket_unlock(sp, 1);
1327 }
1328
1329 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1330 #if (DEBUG|DEVELOPMENT)
1331 panic("%s head %p so_incomp not empty\n", __func__, so);
1332 #endif /* (DEVELOPMENT || DEBUG) */
1333
1334 goto again;
1335 }
1336
1337 if (!TAILQ_EMPTY(&so->so_comp)) {
1338 #if (DEBUG|DEVELOPMENT)
1339 panic("%s head %p so_comp not empty\n", __func__, so);
1340 #endif /* (DEVELOPMENT || DEBUG) */
1341
1342 goto again;
1343 }
1344
1345 if (persocklock) {
1346 socket_lock(so, 0);
1347 so_release_accept_list(so);
1348 }
1349 }
1350 if (so->so_pcb == NULL) {
1351 /* 3915887: mark the socket as ready for dealloc */
1352 so->so_flags |= SOF_PCBCLEARING;
1353 goto discard;
1354 }
1355 if (so->so_state & SS_ISCONNECTED) {
1356 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1357 error = sodisconnectlocked(so);
1358 if (error)
1359 goto drop;
1360 }
1361 if (so->so_options & SO_LINGER) {
1362 lck_mtx_t *mutex_held;
1363
1364 if ((so->so_state & SS_ISDISCONNECTING) &&
1365 (so->so_state & SS_NBIO))
1366 goto drop;
1367 if (so->so_proto->pr_getlock != NULL)
1368 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1369 else
1370 mutex_held = so->so_proto->pr_domain->dom_mtx;
1371 while (so->so_state & SS_ISCONNECTED) {
1372 ts.tv_sec = (so->so_linger/100);
1373 ts.tv_nsec = (so->so_linger % 100) *
1374 NSEC_PER_USEC * 1000 * 10;
1375 error = msleep((caddr_t)&so->so_timeo,
1376 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1377 if (error) {
1378 /*
1379 * It's OK when the timer fires,
1380 * don't report an error
1381 */
1382 if (error == EWOULDBLOCK)
1383 error = 0;
1384 break;
1385 }
1386 }
1387 }
1388 }
1389 drop:
1390 if (so->so_usecount == 0) {
1391 panic("soclose: usecount is zero so=%p\n", so);
1392 /* NOTREACHED */
1393 }
1394 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1395 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1396 if (error == 0)
1397 error = error2;
1398 }
1399 if (so->so_usecount <= 0) {
1400 panic("soclose: usecount is zero so=%p\n", so);
1401 /* NOTREACHED */
1402 }
1403 discard:
1404 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1405 (so->so_state & SS_NOFDREF)) {
1406 panic("soclose: NOFDREF");
1407 /* NOTREACHED */
1408 }
1409 so->so_state |= SS_NOFDREF;
1410
1411 if (so->so_flags & SOF_MP_SUBFLOW)
1412 so->so_flags &= ~SOF_MP_SUBFLOW;
1413
1414 if ((so->so_flags & SOF_KNOTE) != 0)
1415 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1416
1417 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1418 evsofree(so);
1419
1420 VERIFY(so->so_usecount > 0);
1421 so->so_usecount--;
1422 sofree(so);
1423 return (error);
1424 }
1425
1426 int
1427 soclose(struct socket *so)
1428 {
1429 int error = 0;
1430 socket_lock(so, 1);
1431
1432 if (so->so_retaincnt == 0) {
1433 error = soclose_locked(so);
1434 } else {
1435 /*
1436 * if the FD is going away, but the socket is
1437 * retained in the kernel, remove its reference
1438 */
1439 so->so_usecount--;
1440 if (so->so_usecount < 2)
1441 panic("soclose: retaincnt non null and so=%p "
1442 "usecount=%d\n", so, so->so_usecount);
1443 }
1444 socket_unlock(so, 1);
1445 return (error);
1446 }
1447
1448 /*
1449 * Must be called at splnet...
1450 */
1451 /* Should already be locked */
1452 int
1453 soabort(struct socket *so)
1454 {
1455 int error;
1456
1457 #ifdef MORE_LOCKING_DEBUG
1458 lck_mtx_t *mutex_held;
1459
1460 if (so->so_proto->pr_getlock != NULL)
1461 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1462 else
1463 mutex_held = so->so_proto->pr_domain->dom_mtx;
1464 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1465 #endif
1466
1467 if ((so->so_flags & SOF_ABORTED) == 0) {
1468 so->so_flags |= SOF_ABORTED;
1469 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1470 if (error) {
1471 sofree(so);
1472 return (error);
1473 }
1474 }
1475 return (0);
1476 }
1477
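/*
 * soacceptlock: complete the accept of a queued connection, clearing
 * SS_NOFDREF now that a file descriptor is about to reference the
 * socket, and let the protocol return the peer address via 'nam'.
 */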
1478 int
1479 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1480 {
1481 int error;
1482
1483 if (dolock)
1484 socket_lock(so, 1);
1485
1486 so_update_last_owner_locked(so, PROC_NULL);
1487 so_update_policy(so);
1488 #if NECP
1489 so_update_necp_policy(so, NULL, NULL);
1490 #endif /* NECP */
1491
1492 if ((so->so_state & SS_NOFDREF) == 0)
1493 panic("soaccept: !NOFDREF");
1494 so->so_state &= ~SS_NOFDREF;
1495 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1496
1497 if (dolock)
1498 socket_unlock(so, 1);
1499 return (error);
1500 }
1501
1502 int
1503 soaccept(struct socket *so, struct sockaddr **nam)
1504 {
1505 return (soacceptlock(so, nam, 1));
1506 }
1507
1508 int
1509 soacceptfilter(struct socket *so, struct socket *head)
1510 {
1511 struct sockaddr *local = NULL, *remote = NULL;
1512 int error = 0;
1513
1514 /*
1515 * Hold the lock even if this socket has not been made visible
1516 * to the filter(s). For sockets with global locks, this protects
1517 * against the head or peer going away
1518 */
1519 socket_lock(so, 1);
1520 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1521 sogetaddr_locked(so, &local, 0) != 0) {
1522 so->so_state &= ~SS_NOFDREF;
1523 socket_unlock(so, 1);
1524 soclose(so);
1525 /* Out of resources; try it again next time */
1526 error = ECONNABORTED;
1527 goto done;
1528 }
1529
1530 error = sflt_accept(head, so, local, remote);
1531
1532 /*
1533 * If we get EJUSTRETURN from one of the filters, mark this socket
1534 * as inactive and return it anyway. This newly accepted socket
1535 * will be disconnected later before we hand it off to the caller.
1536 */
1537 if (error == EJUSTRETURN) {
1538 error = 0;
1539 (void) sosetdefunct(current_proc(), so,
1540 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1541 }
1542
1543 if (error != 0) {
1544 /*
1545 * This may seem like a duplication of the above error
1546 * handling part when we return ECONNABORTED, except
1547 * the following is done while holding the lock since
1548 * the socket has been exposed to the filter(s) earlier.
1549 */
1550 so->so_state &= ~SS_COMP;
1551 socket_unlock(so, 1);
1552 soclose(so);
1553 /* Propagate socket filter's error code to the caller */
1554 } else {
1555 socket_unlock(so, 1);
1556 }
1557 done:
1558 /* Callee checks for NULL pointer */
1559 sock_freeaddr(remote);
1560 sock_freeaddr(local);
1561 return (error);
1562 }
1563
1564 /*
1565 * Returns: 0 Success
1566 * EOPNOTSUPP Operation not supported on socket
1567 * EISCONN Socket is connected
1568 * <pru_connect>:EADDRNOTAVAIL Address not available.
1569 * <pru_connect>:EINVAL Invalid argument
1570 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1571 * <pru_connect>:EACCES Permission denied
1572 * <pru_connect>:EADDRINUSE Address in use
1573 * <pru_connect>:EAGAIN Resource unavailable, try again
1574 * <pru_connect>:EPERM Operation not permitted
1575 * <sf_connect_out>:??? [anything a filter writer might set]
1576 */
1577 int
1578 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1579 {
1580 int error;
1581 struct proc *p = current_proc();
1582
1583 if (dolock)
1584 socket_lock(so, 1);
1585
1586 so_update_last_owner_locked(so, p);
1587 so_update_policy(so);
1588
1589 #if NECP
1590 so_update_necp_policy(so, NULL, nam);
1591 #endif /* NECP */
1592
1593 /*
1594 * If this is a listening socket or if this is a previously-accepted
1595 * socket that has been marked as inactive, reject the connect request.
1596 */
1597 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1598 error = EOPNOTSUPP;
1599 if (so->so_flags & SOF_DEFUNCT) {
1600 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1601 "(%d)\n", __func__, proc_pid(p),
1602 proc_best_name(p),
1603 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1604 SOCK_DOM(so), SOCK_TYPE(so), error);
1605 }
1606 if (dolock)
1607 socket_unlock(so, 1);
1608 return (error);
1609 }
1610
1611 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1612 if (dolock)
1613 socket_unlock(so, 1);
1614 return (EPERM);
1615 }
1616
1617 /*
1618 * If protocol is connection-based, can only connect once.
1619 * Otherwise, if connected, try to disconnect first.
1620 * This allows user to disconnect by connecting to, e.g.,
1621 * a null address.
1622 */
1623 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1624 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1625 (error = sodisconnectlocked(so)))) {
1626 error = EISCONN;
1627 } else {
1628 /*
1629 * Run connect filter before calling protocol:
1630 * - non-blocking connect returns before completion;
1631 */
1632 error = sflt_connectout(so, nam);
1633 if (error != 0) {
1634 if (error == EJUSTRETURN)
1635 error = 0;
1636 } else {
1637 error = (*so->so_proto->pr_usrreqs->pru_connect)
1638 (so, nam, p);
1639 }
1640 }
1641 if (dolock)
1642 socket_unlock(so, 1);
1643 return (error);
1644 }
1645
1646 int
1647 soconnect(struct socket *so, struct sockaddr *nam)
1648 {
1649 return (soconnectlock(so, nam, 1));
1650 }
1651
1652 /*
1653 * Returns: 0 Success
1654 * <pru_connect2>:EINVAL[AF_UNIX]
1655 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1656 * <pru_connect2>:??? [other protocol families]
1657 *
1658 * Notes: <pru_connect2> is not supported by [TCP].
1659 */
1660 int
1661 soconnect2(struct socket *so1, struct socket *so2)
1662 {
1663 int error;
1664
1665 socket_lock(so1, 1);
1666 if (so2->so_proto->pr_lock)
1667 socket_lock(so2, 1);
1668
1669 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1670
1671 socket_unlock(so1, 1);
1672 if (so2->so_proto->pr_lock)
1673 socket_unlock(so2, 1);
1674 return (error);
1675 }
1676
1677 int
1678 soconnectxlocked(struct socket *so, struct sockaddr *src,
1679 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1680 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1681 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1682 {
1683 int error;
1684
1685 so_update_last_owner_locked(so, p);
1686 so_update_policy(so);
1687
1688 /*
1689 * If this is a listening socket or if this is a previously-accepted
1690 * socket that has been marked as inactive, reject the connect request.
1691 */
1692 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1693 error = EOPNOTSUPP;
1694 if (so->so_flags & SOF_DEFUNCT) {
1695 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1696 "(%d)\n", __func__, proc_pid(p),
1697 proc_best_name(p),
1698 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1699 SOCK_DOM(so), SOCK_TYPE(so), error);
1700 }
1701 return (error);
1702 }
1703
1704 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1705 return (EPERM);
1706
1707 /*
1708 * If protocol is connection-based, can only connect once
1709 * unless PR_MULTICONN is set. Otherwise, if connected,
1710 * try to disconnect first. This allows user to disconnect
1711 * by connecting to, e.g., a null address.
1712 */
1713 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1714 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1715 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1716 (error = sodisconnectlocked(so)) != 0)) {
1717 error = EISCONN;
1718 } else {
1719 /*
1720 * Run connect filter before calling protocol:
1721 * - non-blocking connect returns before completion;
1722 */
1723 error = sflt_connectout(so, dst);
1724 if (error != 0) {
1725 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1726 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1727 if (error == EJUSTRETURN)
1728 error = 0;
1729 } else {
1730 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1731 (so, src, dst, p, ifscope, aid, pcid,
1732 flags, arg, arglen, auio, bytes_written);
1733 }
1734 }
1735
1736 return (error);
1737 }
1738
1739 int
1740 sodisconnectlocked(struct socket *so)
1741 {
1742 int error;
1743
1744 if ((so->so_state & SS_ISCONNECTED) == 0) {
1745 error = ENOTCONN;
1746 goto bad;
1747 }
1748 if (so->so_state & SS_ISDISCONNECTING) {
1749 error = EALREADY;
1750 goto bad;
1751 }
1752
1753 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1754 if (error == 0)
1755 sflt_notify(so, sock_evt_disconnected, NULL);
1756
1757 bad:
1758 return (error);
1759 }
1760
1761 /* Locking version */
1762 int
1763 sodisconnect(struct socket *so)
1764 {
1765 int error;
1766
1767 socket_lock(so, 1);
1768 error = sodisconnectlocked(so);
1769 socket_unlock(so, 1);
1770 return (error);
1771 }
1772
1773 int
1774 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1775 {
1776 int error;
1777
1778 /*
1779 * Call the protocol disconnectx handler; let it handle all
1780 * matters related to the connection state of this session.
1781 */
1782 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1783 if (error == 0) {
1784 /*
1785 * The event applies only for the session, not for
1786 * the disconnection of individual subflows.
1787 */
1788 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1789 sflt_notify(so, sock_evt_disconnected, NULL);
1790 }
1791 return (error);
1792 }
1793
1794 int
1795 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1796 {
1797 int error;
1798
1799 socket_lock(so, 1);
1800 error = sodisconnectxlocked(so, aid, cid);
1801 socket_unlock(so, 1);
1802 return (error);
1803 }
1804
1805 int
1806 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1807 {
1808 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1809 }
1810
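/* Pass SBL_WAIT to sblock() unless the caller asked for MSG_DONTWAIT */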
1811 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1812
1813 /*
1814 * sosendcheck will lock the socket buffer if it isn't locked and
1815 * verify that there is space for the data being inserted.
1816 *
1817 * Returns: 0 Success
1818 * EPIPE
1819 * sblock:EWOULDBLOCK
1820 * sblock:EINTR
1821 * sbwait:EBADF
1822 * sbwait:EINTR
1823 * [so_error]:???
1824 */
1825 int
1826 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1827 int32_t clen, int32_t atomic, int flags, int *sblocked,
1828 struct mbuf *control)
1829 {
1830 int error = 0;
1831 int32_t space;
1832 int assumelock = 0;
1833
1834 restart:
1835 if (*sblocked == 0) {
1836 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1837 so->so_send_filt_thread != 0 &&
1838 so->so_send_filt_thread == current_thread()) {
1839 /*
1840 * We're being called recursively from a filter,
1841 * allow this to continue. Radar 4150520.
1842 * Don't set sblocked because we don't want
1843 * to perform an unlock later.
1844 */
1845 assumelock = 1;
1846 } else {
1847 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1848 if (error) {
1849 if (so->so_flags & SOF_DEFUNCT)
1850 goto defunct;
1851 return (error);
1852 }
1853 *sblocked = 1;
1854 }
1855 }
1856
1857 /*
1858 * If a send attempt is made on a socket that has been marked
1859 * as inactive (disconnected), reject the request.
1860 */
1861 if (so->so_flags & SOF_DEFUNCT) {
1862 defunct:
1863 error = EPIPE;
1864 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1865 __func__, proc_selfpid(), proc_best_name(current_proc()),
1866 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1867 SOCK_DOM(so), SOCK_TYPE(so), error);
1868 return (error);
1869 }
1870
1871 if (so->so_state & SS_CANTSENDMORE) {
1872 #if CONTENT_FILTER
1873 /*
1874 * Can re-inject data of half closed connections
1875 */
1876 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1877 so->so_snd.sb_cfil_thread == current_thread() &&
1878 cfil_sock_data_pending(&so->so_snd) != 0)
1879 CFIL_LOG(LOG_INFO,
1880 "so %llx ignore SS_CANTSENDMORE",
1881 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1882 else
1883 #endif /* CONTENT_FILTER */
1884 return (EPIPE);
1885 }
1886 if (so->so_error) {
1887 error = so->so_error;
1888 so->so_error = 0;
1889 return (error);
1890 }
1891
1892 if ((so->so_state & SS_ISCONNECTED) == 0) {
1893 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1894 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1895 (resid != 0 || clen == 0) &&
1896 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1897 #if MPTCP
1898 /*
1899 * MPTCP Fast Join sends data before the
1900 * socket is truly connected.
1901 */
1902 if ((so->so_flags & (SOF_MP_SUBFLOW |
1903 SOF_MPTCP_FASTJOIN)) !=
1904 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1905 #endif /* MPTCP */
1906 return (ENOTCONN);
1907 }
1908 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1909 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1910 ENOTCONN : EDESTADDRREQ);
1911 }
1912 }
1913
1914 if (so->so_flags & SOF_ENABLE_MSGS)
1915 space = msgq_sbspace(so, control);
1916 else
1917 space = sbspace(&so->so_snd);
1918
1919 if (flags & MSG_OOB)
1920 space += 1024;
1921 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1922 clen > so->so_snd.sb_hiwat)
1923 return (EMSGSIZE);
1924
1925 if ((space < resid + clen &&
1926 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1927 space < clen)) ||
1928 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1929 /*
1930 * don't block the connectx call when there's more data
1931 * than can be copied.
1932 */
1933 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1934 if (space == 0) {
1935 return (EWOULDBLOCK);
1936 }
1937 if (space < (int32_t)so->so_snd.sb_lowat) {
1938 return (0);
1939 }
1940 }
1941 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1942 assumelock) {
1943 return (EWOULDBLOCK);
1944 }
1945 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1946 *sblocked = 0;
1947 error = sbwait(&so->so_snd);
1948 if (error) {
1949 if (so->so_flags & SOF_DEFUNCT)
1950 goto defunct;
1951 return (error);
1952 }
1953 goto restart;
1954 }
1955 return (0);
1956 }
1957
1958 /*
1959 * Send on a socket.
1960 * If send must go all at once and message is larger than
1961 * send buffering, then hard error.
1962 * Lock against other senders.
1963 * If must go all at once and not enough room now, then
1964 * inform user that this would block and do nothing.
1965 * Otherwise, if nonblocking, send as much as possible.
1966 * The data to be sent is described by "uio" if nonzero,
1967 * otherwise by the mbuf chain "top" (which must be null
1968 * if uio is not). Data provided in mbuf chain must be small
1969 * enough to send all at once.
1970 *
1971 * Returns nonzero on error, timeout or signal; callers
1972 * must check for short counts if EINTR/ERESTART are returned.
1973 * Data and control buffers are freed on return.
1974 * Experiment:
1975 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1976 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1977 * point at the mbuf chain being constructed and go from there.
1978 *
1979 * Returns: 0 Success
1980 * EOPNOTSUPP
1981 * EINVAL
1982 * ENOBUFS
1983 * uiomove:EFAULT
1984 * sosendcheck:EPIPE
1985 * sosendcheck:EWOULDBLOCK
1986 * sosendcheck:EINTR
1987 * sosendcheck:EBADF
1988 * sosendcheck:EINTR
1989 * sosendcheck:??? [value from so_error]
1990 * <pru_send>:ECONNRESET[TCP]
1991 * <pru_send>:EINVAL[TCP]
1992 * <pru_send>:ENOBUFS[TCP]
1993 * <pru_send>:EADDRINUSE[TCP]
1994 * <pru_send>:EADDRNOTAVAIL[TCP]
1995 * <pru_send>:EAFNOSUPPORT[TCP]
1996 * <pru_send>:EACCES[TCP]
1997 * <pru_send>:EAGAIN[TCP]
1998 * <pru_send>:EPERM[TCP]
1999 * <pru_send>:EMSGSIZE[TCP]
2000 * <pru_send>:EHOSTUNREACH[TCP]
2001 * <pru_send>:ENETUNREACH[TCP]
2002 * <pru_send>:ENETDOWN[TCP]
2003 * <pru_send>:ENOMEM[TCP]
2004 * <pru_send>:ENOBUFS[TCP]
2005 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2006 * <pru_send>:EINVAL[AF_UNIX]
2007 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2008 * <pru_send>:EPIPE[AF_UNIX]
2009 * <pru_send>:ENOTCONN[AF_UNIX]
2010 * <pru_send>:EISCONN[AF_UNIX]
2011 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2012 * <sf_data_out>:??? [whatever a filter author chooses]
2013 *
2014 * Notes: Other <pru_send> returns depend on the protocol family; all
2015 * <sf_data_out> returns depend on what the filter author causes
2016 * their filter to return.
2017 */
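/*
 * Illustrative calling conventions (a sketch, not an exhaustive list):
 *
 *	-- user data described by a uio, e.g. the sendto(2)/sendmsg(2)
 *	-- path; "top" must be NULL:
 *	error = sosend(so, to, uio, NULL, control, flags);
 *
 *	-- data prepackaged as an mbuf chain by a kernel caller; "top"
 *	-- must carry a valid pkthdr length and "uio" must be NULL:
 *	error = sosend(so, NULL, NULL, top, NULL, 0);
 */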
2018 int
2019 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2020 struct mbuf *top, struct mbuf *control, int flags)
2021 {
2022 struct mbuf **mp;
2023 struct mbuf *m, *freelist = NULL;
2024 user_ssize_t space, len, resid, orig_resid;
2025 int clen = 0, error, dontroute, mlen, sendflags;
2026 int atomic = sosendallatonce(so) || top;
2027 int sblocked = 0;
2028 struct proc *p = current_proc();
2029 struct mbuf *control_copy = NULL;
2030 uint16_t headroom = 0;
2031 boolean_t en_tracing = FALSE;
2032
2033 if (uio != NULL)
2034 resid = uio_resid(uio);
2035 else
2036 resid = top->m_pkthdr.len;
2037
2038 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2039 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2040
2041 socket_lock(so, 1);
2042
2043 /*
2044 	 * Trace only if tracing is enabled, for network (vs. unix)
2045 	 * sockets, and only for non-loopback traffic.
2046 */
2047 if (ENTR_SHOULDTRACE &&
2048 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2049 struct inpcb *inp = sotoinpcb(so);
2050 if (inp->inp_last_outifp != NULL &&
2051 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2052 en_tracing = TRUE;
2053 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2054 VM_KERNEL_ADDRPERM(so),
2055 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2056 (int64_t)resid);
2057 orig_resid = resid;
2058 }
2059 }
2060
2061 /*
2062 * Re-injection should not affect process accounting
2063 */
2064 if ((flags & MSG_SKIPCFIL) == 0) {
2065 so_update_last_owner_locked(so, p);
2066 so_update_policy(so);
2067
2068 #if NECP
2069 so_update_necp_policy(so, NULL, addr);
2070 #endif /* NECP */
2071 }
2072
2073 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2074 error = EOPNOTSUPP;
2075 socket_unlock(so, 1);
2076 goto out;
2077 }
2078
2079 /*
2080 * In theory resid should be unsigned.
2081 * However, space must be signed, as it might be less than 0
2082 * if we over-committed, and we must use a signed comparison
2083 * of space and resid. On the other hand, a negative resid
2084 * causes us to loop sending 0-length segments to the protocol.
2085 *
2086 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2087 * But it will be used by sockets doing message delivery.
2088 *
2089 * Note: We limit resid to be a positive int value as we use
2090 * imin() to set bytes_to_copy -- radr://14558484
2091 */
2092 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2093 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2094 error = EINVAL;
2095 socket_unlock(so, 1);
2096 goto out;
2097 }
2098
2099 dontroute = (flags & MSG_DONTROUTE) &&
2100 (so->so_options & SO_DONTROUTE) == 0 &&
2101 (so->so_proto->pr_flags & PR_ATOMIC);
2102 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2103
2104 if (control != NULL)
2105 clen = control->m_len;
2106
2107 if (soreserveheadroom != 0)
2108 headroom = so->so_pktheadroom;
2109
2110 do {
2111 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2112 &sblocked, control);
2113 if (error)
2114 goto release;
2115
2116 mp = &top;
2117 if (so->so_flags & SOF_ENABLE_MSGS)
2118 space = msgq_sbspace(so, control);
2119 else
2120 space = sbspace(&so->so_snd) - clen;
2121 space += ((flags & MSG_OOB) ? 1024 : 0);
2122
2123 do {
2124 if (uio == NULL) {
2125 /*
2126 * Data is prepackaged in "top".
2127 */
2128 resid = 0;
2129 if (flags & MSG_EOR)
2130 top->m_flags |= M_EOR;
2131 } else {
2132 int chainlength;
2133 int bytes_to_copy;
2134 boolean_t jumbocl;
2135 boolean_t bigcl;
2136 int bytes_to_alloc;
2137
2138 bytes_to_copy = imin(resid, space);
2139
2140 bytes_to_alloc = bytes_to_copy;
2141 if (top == NULL)
2142 bytes_to_alloc += headroom;
2143
2144 if (sosendminchain > 0)
2145 chainlength = 0;
2146 else
2147 chainlength = sosendmaxchain;
2148
2149 /*
2150 				 * Use big 4 KB clusters when the outgoing interface
2151 * does not prefer 2 KB clusters
2152 */
2153 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2154 sosendbigcl_ignore_capab;
2155
2156 /*
2157 * Attempt to use larger than system page-size
2158 * clusters for large writes only if there is
2159 * a jumbo cluster pool and if the socket is
2160 * marked accordingly.
2161 */
2162 jumbocl = sosendjcl && njcl > 0 &&
2163 ((so->so_flags & SOF_MULTIPAGES) ||
2164 sosendjcl_ignore_capab) &&
2165 bigcl;
2166
2167 socket_unlock(so, 0);
2168
2169 do {
2170 int num_needed;
2171 int hdrs_needed = (top == NULL) ? 1 : 0;
2172
2173 					/*
2174 					 * Try to maintain a local cache of mbuf
2175 					 * clusters needed to complete this
2176 					 * write.  The list is further limited to
2177 					 * the number that are currently needed
2178 					 * to fill the socket.  This mechanism
2179 					 * allows a large number of mbufs/
2180 					 * clusters to be grabbed under a single
2181 					 * mbuf lock... if we can't get any
2182 					 * clusters, then fall back to trying
2183 					 * for mbufs.  If we fail early (or
2184 					 * miscalculate the number needed), make
2185 					 * sure to release any clusters we
2186 					 * haven't yet consumed.
2187 					 */
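					/*
					 * Worked example (illustrative): for a
					 * 64 KB write on a socket marked
					 * SOF_MULTIPAGES with jumbo clusters
					 * available, bytes_to_alloc = 65536,
					 * num_needed = 65536 / M16KCLBYTES = 4
					 * with a zero remainder (< MINCLSIZE),
					 * so four 16 KB clusters are requested
					 * in one m_getpackets_internal() call.
					 */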
2188 if (freelist == NULL &&
2189 bytes_to_alloc > MBIGCLBYTES &&
2190 jumbocl) {
2191 num_needed =
2192 bytes_to_alloc / M16KCLBYTES;
2193
2194 if ((bytes_to_alloc -
2195 (num_needed * M16KCLBYTES))
2196 >= MINCLSIZE)
2197 num_needed++;
2198
2199 freelist =
2200 m_getpackets_internal(
2201 (unsigned int *)&num_needed,
2202 hdrs_needed, M_WAIT, 0,
2203 M16KCLBYTES);
2204 /*
2205 * Fall back to 4K cluster size
2206 * if allocation failed
2207 */
2208 }
2209
2210 if (freelist == NULL &&
2211 bytes_to_alloc > MCLBYTES &&
2212 bigcl) {
2213 num_needed =
2214 bytes_to_alloc / MBIGCLBYTES;
2215
2216 if ((bytes_to_alloc -
2217 (num_needed * MBIGCLBYTES)) >=
2218 MINCLSIZE)
2219 num_needed++;
2220
2221 freelist =
2222 m_getpackets_internal(
2223 (unsigned int *)&num_needed,
2224 hdrs_needed, M_WAIT, 0,
2225 MBIGCLBYTES);
2226 /*
2227 * Fall back to cluster size
2228 * if allocation failed
2229 */
2230 }
2231
2232 /*
2233 					 * Allocate a cluster as we want to
2234 					 * avoid splitting the data across more
2235 					 * than one segment; using MINCLSIZE
2236 					 * would lead us to allocate two mbufs.
2237 */
2238 if (soreserveheadroom != 0 &&
2239 freelist == NULL &&
2240 ((top == NULL &&
2241 bytes_to_alloc > _MHLEN) ||
2242 bytes_to_alloc > _MLEN)) {
2243 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2244 MCLBYTES;
2245 freelist =
2246 m_getpackets_internal(
2247 (unsigned int *)&num_needed,
2248 hdrs_needed, M_WAIT, 0,
2249 MCLBYTES);
2250 /*
2251 * Fall back to a single mbuf
2252 * if allocation failed
2253 */
2254 } else if (freelist == NULL &&
2255 bytes_to_alloc > MINCLSIZE) {
2256 num_needed =
2257 bytes_to_alloc / MCLBYTES;
2258
2259 if ((bytes_to_alloc -
2260 (num_needed * MCLBYTES)) >=
2261 MINCLSIZE)
2262 num_needed++;
2263
2264 freelist =
2265 m_getpackets_internal(
2266 (unsigned int *)&num_needed,
2267 hdrs_needed, M_WAIT, 0,
2268 MCLBYTES);
2269 /*
2270 * Fall back to a single mbuf
2271 * if allocation failed
2272 */
2273 }
2274 /*
2275 * For datagram protocols, leave
2276 * headroom for protocol headers
2277 * in the first cluster of the chain
2278 */
2279 if (freelist != NULL && atomic &&
2280 top == NULL && headroom > 0) {
2281 freelist->m_data += headroom;
2282 }
2283
2284 /*
2285 * Fall back to regular mbufs without
2286 * reserving the socket headroom
2287 */
2288 if (freelist == NULL) {
2289 if (top == NULL)
2290 MGETHDR(freelist,
2291 M_WAIT, MT_DATA);
2292 else
2293 MGET(freelist,
2294 M_WAIT, MT_DATA);
2295
2296 if (freelist == NULL) {
2297 error = ENOBUFS;
2298 socket_lock(so, 0);
2299 goto release;
2300 }
2301 /*
2302 * For datagram protocols,
2303 * leave room for protocol
2304 * headers in first mbuf.
2305 */
2306 if (atomic && top == NULL &&
2307 bytes_to_copy < MHLEN) {
2308 MH_ALIGN(freelist,
2309 bytes_to_copy);
2310 }
2311 }
2312 m = freelist;
2313 freelist = m->m_next;
2314 m->m_next = NULL;
2315
2316 if ((m->m_flags & M_EXT))
2317 mlen = m->m_ext.ext_size -
2318 m_leadingspace(m);
2319 else if ((m->m_flags & M_PKTHDR))
2320 mlen =
2321 MHLEN - m_leadingspace(m);
2322 else
2323 mlen = MLEN - m_leadingspace(m);
2324 len = imin(mlen, bytes_to_copy);
2325
2326 chainlength += len;
2327
2328 space -= len;
2329
2330 error = uiomove(mtod(m, caddr_t),
2331 len, uio);
2332
2333 resid = uio_resid(uio);
2334
2335 m->m_len = len;
2336 *mp = m;
2337 top->m_pkthdr.len += len;
2338 if (error)
2339 break;
2340 mp = &m->m_next;
2341 if (resid <= 0) {
2342 if (flags & MSG_EOR)
2343 top->m_flags |= M_EOR;
2344 break;
2345 }
2346 bytes_to_copy = min(resid, space);
2347
2348 } while (space > 0 &&
2349 (chainlength < sosendmaxchain || atomic ||
2350 resid < MINCLSIZE));
2351
2352 socket_lock(so, 0);
2353
2354 if (error)
2355 goto release;
2356 }
2357
2358 if (flags & (MSG_HOLD|MSG_SEND)) {
2359 /* Enqueue for later, go away if HOLD */
2360 struct mbuf *mb1;
2361 if (so->so_temp && (flags & MSG_FLUSH)) {
2362 m_freem(so->so_temp);
2363 so->so_temp = NULL;
2364 }
2365 if (so->so_temp)
2366 so->so_tail->m_next = top;
2367 else
2368 so->so_temp = top;
2369 mb1 = top;
2370 while (mb1->m_next)
2371 mb1 = mb1->m_next;
2372 so->so_tail = mb1;
2373 if (flags & MSG_HOLD) {
2374 top = NULL;
2375 goto release;
2376 }
2377 top = so->so_temp;
2378 }
2379 if (dontroute)
2380 so->so_options |= SO_DONTROUTE;
2381
2382 /*
2383 * Compute flags here, for pru_send and NKEs
2384 *
2385 			 * If the user set MSG_EOF, the protocol
2386 			 * understands this flag, and there is nothing left
2387 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2388 */
2389 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2390 ((flags & MSG_EOF) &&
2391 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2392 (resid <= 0)) ? PRUS_EOF :
2393 /* If there is more to send set PRUS_MORETOCOME */
2394 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2395
2396 if ((flags & MSG_SKIPCFIL) == 0) {
2397 /*
2398 * Socket filter processing
2399 */
2400 error = sflt_data_out(so, addr, &top,
2401 &control, (sendflags & MSG_OOB) ?
2402 sock_data_filt_flag_oob : 0);
2403 if (error) {
2404 if (error == EJUSTRETURN) {
2405 error = 0;
2406 clen = 0;
2407 control = NULL;
2408 top = NULL;
2409 }
2410 goto release;
2411 }
2412 #if CONTENT_FILTER
2413 /*
2414 * Content filter processing
2415 */
2416 error = cfil_sock_data_out(so, addr, top,
2417 control, (sendflags & MSG_OOB) ?
2418 sock_data_filt_flag_oob : 0);
2419 if (error) {
2420 if (error == EJUSTRETURN) {
2421 error = 0;
2422 clen = 0;
2423 control = NULL;
2424 top = NULL;
2425 }
2426 goto release;
2427 }
2428 #endif /* CONTENT_FILTER */
2429 }
2430 if (so->so_flags & SOF_ENABLE_MSGS) {
2431 /*
2432 * Make a copy of control mbuf,
2433 * so that msg priority can be
2434 * passed to subsequent mbufs.
2435 */
2436 control_copy = m_dup(control, M_NOWAIT);
2437 }
2438 error = (*so->so_proto->pr_usrreqs->pru_send)
2439 (so, sendflags, top, addr, control, p);
2440
2441 if (flags & MSG_SEND)
2442 so->so_temp = NULL;
2443
2444 if (dontroute)
2445 so->so_options &= ~SO_DONTROUTE;
2446
2447 clen = 0;
2448 control = control_copy;
2449 control_copy = NULL;
2450 top = NULL;
2451 mp = &top;
2452 if (error)
2453 goto release;
2454 } while (resid && space > 0);
2455 } while (resid);
2456
2457 release:
2458 if (sblocked)
2459 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2460 else
2461 socket_unlock(so, 1);
2462 out:
2463 if (top != NULL)
2464 m_freem(top);
2465 if (control != NULL)
2466 m_freem(control);
2467 if (freelist != NULL)
2468 m_freem_list(freelist);
2469 if (control_copy != NULL)
2470 m_freem(control_copy);
2471
2472 /*
2473 * One write has been done. This was enough. Get back to "normal"
2474 * behavior.
2475 */
2476 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2477 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2478
2479 if (en_tracing) {
2480 /* resid passed here is the bytes left in uio */
2481 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2482 VM_KERNEL_ADDRPERM(so),
2483 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2484 (int64_t)(orig_resid - resid));
2485 }
2486 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2487 so->so_snd.sb_cc, space, error);
2488
2489 return (error);
2490 }
2491
2492 /*
2493  * Supports only connected sockets (no address) without ancillary data
2494  * (control mbuf), for atomic protocols
2495 */
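/*
 * For example, calling this on a SOCK_STREAM socket, or with flags other
 * than MSG_DONTWAIT/MSG_NBIO, fails with EINVAL before any data is
 * touched, and a protocol without a pru_send_list handler gets
 * EPROTONOSUPPORT.
 */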
2496 int
2497 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2498 {
2499 struct mbuf *m, *freelist = NULL;
2500 user_ssize_t len, resid;
2501 int error, dontroute, mlen;
2502 int atomic = sosendallatonce(so);
2503 int sblocked = 0;
2504 struct proc *p = current_proc();
2505 u_int uiofirst = 0;
2506 u_int uiolast = 0;
2507 struct mbuf *top = NULL;
2508 uint16_t headroom = 0;
2509 boolean_t bigcl;
2510
2511 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2512 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2513
2514 if (so->so_type != SOCK_DGRAM) {
2515 error = EINVAL;
2516 goto out;
2517 }
2518 if (atomic == 0) {
2519 error = EINVAL;
2520 goto out;
2521 }
2522 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2523 error = EPROTONOSUPPORT;
2524 goto out;
2525 }
2526 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2527 error = EINVAL;
2528 goto out;
2529 }
2530 resid = uio_array_resid(uioarray, uiocnt);
2531
2532 /*
2533 * In theory resid should be unsigned.
2534 * However, space must be signed, as it might be less than 0
2535 * if we over-committed, and we must use a signed comparison
2536 * of space and resid. On the other hand, a negative resid
2537 * causes us to loop sending 0-length segments to the protocol.
2538 *
2539 * Note: We limit resid to be a positive int value as we use
2540 * imin() to set bytes_to_copy -- radr://14558484
2541 */
2542 if (resid < 0 || resid > INT_MAX) {
2543 error = EINVAL;
2544 goto out;
2545 }
2546
2547 socket_lock(so, 1);
2548 so_update_last_owner_locked(so, p);
2549 so_update_policy(so);
2550
2551 #if NECP
2552 so_update_necp_policy(so, NULL, NULL);
2553 #endif /* NECP */
2554
2555 dontroute = (flags & MSG_DONTROUTE) &&
2556 (so->so_options & SO_DONTROUTE) == 0 &&
2557 (so->so_proto->pr_flags & PR_ATOMIC);
2558 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2559
2560 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2561 &sblocked, NULL);
2562 if (error)
2563 goto release;
2564
2565 /*
2566 * Use big 4 KB clusters when the outgoing interface does not prefer
2567 * 2 KB clusters
2568 */
2569 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2570
2571 if (soreserveheadroom != 0)
2572 headroom = so->so_pktheadroom;
2573
2574 do {
2575 int i;
2576 int num_needed = 0;
2577 int chainlength;
2578 size_t maxpktlen = 0;
2579 int bytes_to_alloc;
2580
2581 if (sosendminchain > 0)
2582 chainlength = 0;
2583 else
2584 chainlength = sosendmaxchain;
2585
2586 socket_unlock(so, 0);
2587
2588 /*
2589 		 * Find a set of uios that fit in a reasonable number
2590 * of mbuf packets
2591 */
2592 for (i = uiofirst; i < uiocnt; i++) {
2593 struct uio *auio = uioarray[i];
2594
2595 len = uio_resid(auio);
2596
2597 /* Do nothing for empty messages */
2598 if (len == 0)
2599 continue;
2600
2601 num_needed += 1;
2602 uiolast += 1;
2603
2604 if (len > maxpktlen)
2605 maxpktlen = len;
2606
2607 chainlength += len;
2608 if (chainlength > sosendmaxchain)
2609 break;
2610 }
2611 /*
2612 * Nothing left to send
2613 */
2614 if (num_needed == 0) {
2615 socket_lock(so, 0);
2616 break;
2617 }
2618 /*
2619 		 * Allocate a buffer large enough to include headroom space for
2620 		 * the network and link headers
2621 *
2622 */
2623 bytes_to_alloc = maxpktlen + headroom;
2624
2625 /*
2626 * Allocate a single contiguous buffer of the smallest available
2627 * size when possible
2628 */
2629 if (bytes_to_alloc > MCLBYTES &&
2630 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2631 freelist = m_getpackets_internal(
2632 (unsigned int *)&num_needed,
2633 num_needed, M_WAIT, 1,
2634 MBIGCLBYTES);
2635 } else if (bytes_to_alloc > _MHLEN &&
2636 bytes_to_alloc <= MCLBYTES) {
2637 freelist = m_getpackets_internal(
2638 (unsigned int *)&num_needed,
2639 num_needed, M_WAIT, 1,
2640 MCLBYTES);
2641 } else {
2642 freelist = m_allocpacket_internal(
2643 (unsigned int *)&num_needed,
2644 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2645 }
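		/*
		 * Worked example (illustrative): with so_pktheadroom = 16 and
		 * a largest datagram of 1400 bytes, bytes_to_alloc = 1416,
		 * which is larger than _MHLEN but no larger than MCLBYTES, so
		 * each packet in the batch gets a single 2 KB cluster.
		 */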
2646
2647 if (freelist == NULL) {
2648 socket_lock(so, 0);
2649 error = ENOMEM;
2650 goto release;
2651 }
2652 /*
2653 * Copy each uio of the set into its own mbuf packet
2654 */
2655 for (i = uiofirst, m = freelist;
2656 i < uiolast && m != NULL;
2657 i++) {
2658 int bytes_to_copy;
2659 struct mbuf *n;
2660 struct uio *auio = uioarray[i];
2661
2662 bytes_to_copy = uio_resid(auio);
2663
2664 /* Do nothing for empty messages */
2665 if (bytes_to_copy == 0)
2666 continue;
2667 /*
2668 * Leave headroom for protocol headers
2669 * in the first mbuf of the chain
2670 */
2671 m->m_data += headroom;
2672
2673 for (n = m; n != NULL; n = n->m_next) {
2674 				if ((n->m_flags & M_EXT))
2675 					mlen = n->m_ext.ext_size -
2676 					    m_leadingspace(n);
2677 				else if ((n->m_flags & M_PKTHDR))
2678 					mlen =
2679 					    MHLEN - m_leadingspace(n);
2680 				else
2681 					mlen = MLEN - m_leadingspace(n);
2682 len = imin(mlen, bytes_to_copy);
2683
2684 /*
2685 * Note: uiomove() decrements the iovec
2686 * length
2687 */
2688 error = uiomove(mtod(n, caddr_t),
2689 len, auio);
2690 if (error != 0)
2691 break;
2692 n->m_len = len;
2693 m->m_pkthdr.len += len;
2694
2695 VERIFY(m->m_pkthdr.len <= maxpktlen);
2696
2697 bytes_to_copy -= len;
2698 resid -= len;
2699 }
2700 if (m->m_pkthdr.len == 0) {
2701 printf(
2702 "%s:%d so %llx pkt %llx type %u len null\n",
2703 __func__, __LINE__,
2704 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2705 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2706 m->m_type);
2707 }
2708 if (error != 0)
2709 break;
2710 m = m->m_nextpkt;
2711 }
2712
2713 socket_lock(so, 0);
2714
2715 if (error)
2716 goto release;
2717 top = freelist;
2718 freelist = NULL;
2719
2720 if (dontroute)
2721 so->so_options |= SO_DONTROUTE;
2722
2723 if ((flags & MSG_SKIPCFIL) == 0) {
2724 struct mbuf **prevnextp = NULL;
2725
2726 for (i = uiofirst, m = top;
2727 i < uiolast && m != NULL;
2728 i++) {
2729 struct mbuf *nextpkt = m->m_nextpkt;
2730
2731 /*
2732 * Socket filter processing
2733 */
2734 error = sflt_data_out(so, NULL, &m,
2735 NULL, 0);
2736 if (error != 0 && error != EJUSTRETURN)
2737 goto release;
2738
2739 #if CONTENT_FILTER
2740 if (error == 0) {
2741 /*
2742 * Content filter processing
2743 */
2744 error = cfil_sock_data_out(so, NULL, m,
2745 NULL, 0);
2746 if (error != 0 && error != EJUSTRETURN)
2747 goto release;
2748 }
2749 #endif /* CONTENT_FILTER */
2750 /*
2751 * Remove packet from the list when
2752 * swallowed by a filter
2753 */
2754 if (error == EJUSTRETURN) {
2755 error = 0;
2756 if (prevnextp != NULL)
2757 *prevnextp = nextpkt;
2758 else
2759 top = nextpkt;
2760 }
2761
2762 m = nextpkt;
2763 if (m != NULL)
2764 prevnextp = &m->m_nextpkt;
2765 }
2766 }
2767 if (top != NULL)
2768 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2769 (so, 0, top, NULL, NULL, p);
2770
2771 if (dontroute)
2772 so->so_options &= ~SO_DONTROUTE;
2773
2774 top = NULL;
2775 uiofirst = uiolast;
2776 } while (resid > 0 && error == 0);
2777 release:
2778 if (sblocked)
2779 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2780 else
2781 socket_unlock(so, 1);
2782 out:
2783 if (top != NULL)
2784 m_freem(top);
2785 if (freelist != NULL)
2786 m_freem_list(freelist);
2787
2788 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2789 so->so_snd.sb_cc, 0, error);
2790
2791 return (error);
2792 }
2793
2794 /*
2795 * May return ERESTART when packet is dropped by MAC policy check
2796 */
2797 static int
2798 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2799 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2800 {
2801 int error = 0;
2802 struct mbuf *m = *mp;
2803 struct mbuf *nextrecord = *nextrecordp;
2804
2805 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2806 #if CONFIG_MACF_SOCKET_SUBSET
2807 /*
2808 * Call the MAC framework for policy checking if we're in
2809 * the user process context and the socket isn't connected.
2810 */
2811 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2812 struct mbuf *m0 = m;
2813 /*
2814 * Dequeue this record (temporarily) from the receive
2815 * list since we're about to drop the socket's lock
2816 * where a new record may arrive and be appended to
2817 * the list. Upon MAC policy failure, the record
2818 * will be freed. Otherwise, we'll add it back to
2819 * the head of the list. We cannot rely on SB_LOCK
2820 * because append operation uses the socket's lock.
2821 */
2822 do {
2823 m->m_nextpkt = NULL;
2824 sbfree(&so->so_rcv, m);
2825 m = m->m_next;
2826 } while (m != NULL);
2827 m = m0;
2828 so->so_rcv.sb_mb = nextrecord;
2829 SB_EMPTY_FIXUP(&so->so_rcv);
2830 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2831 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2832 socket_unlock(so, 0);
2833
2834 if (mac_socket_check_received(proc_ucred(p), so,
2835 mtod(m, struct sockaddr *)) != 0) {
2836 /*
2837 * MAC policy failure; free this record and
2838 * process the next record (or block until
2839 * one is available). We have adjusted sb_cc
2840 * and sb_mbcnt above so there is no need to
2841 * call sbfree() again.
2842 */
2843 m_freem(m);
2844 /*
2845 * Clear SB_LOCK but don't unlock the socket.
2846 * Process the next record or wait for one.
2847 */
2848 socket_lock(so, 0);
2849 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2850 error = ERESTART;
2851 goto done;
2852 }
2853 socket_lock(so, 0);
2854 /*
2855 * If the socket has been defunct'd, drop it.
2856 */
2857 if (so->so_flags & SOF_DEFUNCT) {
2858 m_freem(m);
2859 error = ENOTCONN;
2860 goto done;
2861 }
2862 /*
2863 * Re-adjust the socket receive list and re-enqueue
2864 * the record in front of any packets which may have
2865 * been appended while we dropped the lock.
2866 */
2867 for (m = m0; m->m_next != NULL; m = m->m_next)
2868 sballoc(&so->so_rcv, m);
2869 sballoc(&so->so_rcv, m);
2870 if (so->so_rcv.sb_mb == NULL) {
2871 so->so_rcv.sb_lastrecord = m0;
2872 so->so_rcv.sb_mbtail = m;
2873 }
2874 m = m0;
2875 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2876 so->so_rcv.sb_mb = m;
2877 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2878 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2879 }
2880 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2881 if (psa != NULL) {
2882 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2883 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2884 error = EWOULDBLOCK;
2885 goto done;
2886 }
2887 }
2888 if (flags & MSG_PEEK) {
2889 m = m->m_next;
2890 } else {
2891 sbfree(&so->so_rcv, m);
2892 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2893 panic("%s: about to create invalid socketbuf",
2894 __func__);
2895 /* NOTREACHED */
2896 }
2897 MFREE(m, so->so_rcv.sb_mb);
2898 m = so->so_rcv.sb_mb;
2899 if (m != NULL) {
2900 m->m_nextpkt = nextrecord;
2901 } else {
2902 so->so_rcv.sb_mb = nextrecord;
2903 SB_EMPTY_FIXUP(&so->so_rcv);
2904 }
2905 }
2906 done:
2907 *mp = m;
2908 *nextrecordp = nextrecord;
2909
2910 return (error);
2911 }
2912
2913 /*
2914 * Process one or more MT_CONTROL mbufs present before any data mbufs
2915 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2916 * just copy the data; if !MSG_PEEK, we call into the protocol to
2917 * perform externalization.
2918 */
2919 static int
2920 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2921 struct mbuf **mp, struct mbuf **nextrecordp)
2922 {
2923 int error = 0;
2924 struct mbuf *cm = NULL, *cmn;
2925 struct mbuf **cme = &cm;
2926 struct sockbuf *sb_rcv = &so->so_rcv;
2927 struct mbuf **msgpcm = NULL;
2928 struct mbuf *m = *mp;
2929 struct mbuf *nextrecord = *nextrecordp;
2930 struct protosw *pr = so->so_proto;
2931
2932 /*
2933 * Externalizing the control messages would require us to
2934 * drop the socket's lock below. Once we re-acquire the
2935 * lock, the mbuf chain might change. In order to preserve
2936 * consistency, we unlink all control messages from the
2937 * first mbuf chain in one shot and link them separately
2938 * onto a different chain.
2939 */
2940 do {
2941 if (flags & MSG_PEEK) {
2942 if (controlp != NULL) {
2943 if (*controlp == NULL) {
2944 msgpcm = controlp;
2945 }
2946 *controlp = m_copy(m, 0, m->m_len);
2947
2948 /*
2949 * If we failed to allocate an mbuf,
2950 * release any previously allocated
2951 * mbufs for control data. Return
2952 * an error. Keep the mbufs in the
2953 * socket as this is using
2954 * MSG_PEEK flag.
2955 */
2956 if (*controlp == NULL) {
2957 m_freem(*msgpcm);
2958 error = ENOBUFS;
2959 goto done;
2960 }
2961 controlp = &(*controlp)->m_next;
2962 }
2963 m = m->m_next;
2964 } else {
2965 m->m_nextpkt = NULL;
2966 sbfree(sb_rcv, m);
2967 sb_rcv->sb_mb = m->m_next;
2968 m->m_next = NULL;
2969 *cme = m;
2970 cme = &(*cme)->m_next;
2971 m = sb_rcv->sb_mb;
2972 }
2973 } while (m != NULL && m->m_type == MT_CONTROL);
2974
2975 if (!(flags & MSG_PEEK)) {
2976 if (sb_rcv->sb_mb != NULL) {
2977 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2978 } else {
2979 sb_rcv->sb_mb = nextrecord;
2980 SB_EMPTY_FIXUP(sb_rcv);
2981 }
2982 if (nextrecord == NULL)
2983 sb_rcv->sb_lastrecord = m;
2984 }
2985
2986 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2987 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2988
2989 while (cm != NULL) {
2990 int cmsg_type;
2991
2992 cmn = cm->m_next;
2993 cm->m_next = NULL;
2994 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2995
2996 /*
2997 * Call the protocol to externalize SCM_RIGHTS message
2998 * and return the modified message to the caller upon
2999 * success. Otherwise, all other control messages are
3000 * returned unmodified to the caller. Note that we
3001 * only get into this loop if MSG_PEEK is not set.
3002 */
3003 if (pr->pr_domain->dom_externalize != NULL &&
3004 cmsg_type == SCM_RIGHTS) {
3005 /*
3006 * Release socket lock: see 3903171. This
3007 * would also allow more records to be appended
3008 * to the socket buffer. We still have SB_LOCK
3009 * set on it, so we can be sure that the head
3010 * of the mbuf chain won't change.
3011 */
3012 socket_unlock(so, 0);
3013 error = (*pr->pr_domain->dom_externalize)(cm);
3014 socket_lock(so, 0);
3015 } else {
3016 error = 0;
3017 }
3018
3019 if (controlp != NULL && error == 0) {
3020 *controlp = cm;
3021 controlp = &(*controlp)->m_next;
3022 } else {
3023 (void) m_free(cm);
3024 }
3025 cm = cmn;
3026 }
3027 /*
3028 * Update the value of nextrecord in case we received new
3029 * records when the socket was unlocked above for
3030 * externalizing SCM_RIGHTS.
3031 */
3032 if (m != NULL)
3033 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3034 else
3035 nextrecord = sb_rcv->sb_mb;
3036
3037 done:
3038 *mp = m;
3039 *nextrecordp = nextrecord;
3040
3041 return (error);
3042 }
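/*
 * From user space, an externalized SCM_RIGHTS message shows up as received
 * file descriptors in the control buffer of recvmsg(2).  A minimal sketch
 * (assuming a single descriptor was sent):
 *
 *	struct msghdr msg;
 *	struct cmsghdr *cmh;
 *	char cbuf[CMSG_SPACE(sizeof (int))];
 *	int fd;
 *
 *	-- fill in msg_iov/msg_iovlen (and zero the rest), then:
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0) {
 *		cmh = CMSG_FIRSTHDR(&msg);
 *		if (cmh != NULL && cmh->cmsg_level == SOL_SOCKET &&
 *		    cmh->cmsg_type == SCM_RIGHTS)
 *			memcpy(&fd, CMSG_DATA(cmh), sizeof (fd));
 *	}
 */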
3043
3044 /*
3045 * Implement receive operations on a socket.
3046 * We depend on the way that records are added to the sockbuf
3047 * by sbappend*. In particular, each record (mbufs linked through m_next)
3048 * must begin with an address if the protocol so specifies,
3049 * followed by an optional mbuf or mbufs containing ancillary data,
3050 * and then zero or more mbufs of data.
3051 * In order to avoid blocking network interrupts for the entire time here,
3052 * we splx() while doing the actual copy to user space.
3053 * Although the sockbuf is locked, new data may still be appended,
3054 * and thus we must maintain consistency of the sockbuf during that time.
3055 *
3056 * The caller may receive the data as a single mbuf chain by supplying
3057 * an mbuf **mp0 for use in returning the chain. The uio is then used
3058 * only for the count in uio_resid.
3059 *
3060 * Returns: 0 Success
3061 * ENOBUFS
3062 * ENOTCONN
3063 * EWOULDBLOCK
3064 * uiomove:EFAULT
3065 * sblock:EWOULDBLOCK
3066 * sblock:EINTR
3067 * sbwait:EBADF
3068 * sbwait:EINTR
3069 * sodelayed_copy:EFAULT
3070 * <pru_rcvoob>:EINVAL[TCP]
3071 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3072 * <pru_rcvoob>:???
3073 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3074 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3075 * <pr_domain->dom_externalize>:???
3076 *
3077 * Notes: Additional return values from calls through <pru_rcvoob> and
3078 * <pr_domain->dom_externalize> depend on protocols other than
3079 * TCP or AF_UNIX, which are documented above.
3080  * TCP or AF_UNIX, which are documented above.
 */
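/*
 * Illustrative in-kernel caller (a sketch; assumes "auio" was set up
 * elsewhere so that uio_resid(auio) is the number of bytes wanted):
 *
 *	struct mbuf *m = NULL;
 *	int flags = MSG_DONTWAIT;
 *
 *	error = soreceive(so, NULL, auio, &m, NULL, &flags);
 *	-- on success the data comes back as the mbuf chain "m"; only the
 *	-- residual count of "auio" is consulted, per the note above.
 */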
3081 int
3082 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3083 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3084 {
3085 struct mbuf *m, **mp, *ml = NULL;
3086 struct mbuf *nextrecord, *free_list;
3087 int flags, error, offset;
3088 user_ssize_t len;
3089 struct protosw *pr = so->so_proto;
3090 int moff, type = 0;
3091 user_ssize_t orig_resid = uio_resid(uio);
3092 user_ssize_t delayed_copy_len;
3093 int can_delay;
3094 int need_event;
3095 struct proc *p = current_proc();
3096 boolean_t en_tracing = FALSE;
3097
3098 /*
3099 * Sanity check on the length passed by caller as we are making 'int'
3100 * comparisons
3101 */
3102 if (orig_resid < 0 || orig_resid > INT_MAX)
3103 return (EINVAL);
3104
3105 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3106 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3107 so->so_rcv.sb_hiwat);
3108
3109 socket_lock(so, 1);
3110 so_update_last_owner_locked(so, p);
3111 so_update_policy(so);
3112
3113 #ifdef MORE_LOCKING_DEBUG
3114 if (so->so_usecount == 1) {
3115 panic("%s: so=%x no other reference on socket\n", __func__, so);
3116 /* NOTREACHED */
3117 }
3118 #endif
3119 mp = mp0;
3120 if (psa != NULL)
3121 *psa = NULL;
3122 if (controlp != NULL)
3123 *controlp = NULL;
3124 if (flagsp != NULL)
3125 flags = *flagsp &~ MSG_EOR;
3126 else
3127 flags = 0;
3128
3129 /*
3130 * If a recv attempt is made on a previously-accepted socket
3131 * that has been marked as inactive (disconnected), reject
3132 * the request.
3133 */
3134 if (so->so_flags & SOF_DEFUNCT) {
3135 struct sockbuf *sb = &so->so_rcv;
3136
3137 error = ENOTCONN;
3138 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3139 __func__, proc_pid(p), proc_best_name(p),
3140 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3141 SOCK_DOM(so), SOCK_TYPE(so), error);
3142 /*
3143 * This socket should have been disconnected and flushed
3144 * prior to being returned from sodefunct(); there should
3145 * be no data on its receive list, so panic otherwise.
3146 */
3147 if (so->so_state & SS_DEFUNCT)
3148 sb_empty_assert(sb, __func__);
3149 socket_unlock(so, 1);
3150 return (error);
3151 }
3152
3153 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3154 pr->pr_usrreqs->pru_preconnect) {
3155 /*
3156 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3157 		 * call write() right after this. *If* the app then calls read()
3158 		 * we do not want to block that read indefinitely. Thus,
3159 		 * we trigger a connect so that the session gets initiated.
3160 */
3161 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3162
3163 if (error) {
3164 socket_unlock(so, 1);
3165 return (error);
3166 }
3167 }
3168
3169 if (ENTR_SHOULDTRACE &&
3170 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3171 /*
3172 * enable energy tracing for inet sockets that go over
3173 * non-loopback interfaces only.
3174 */
3175 struct inpcb *inp = sotoinpcb(so);
3176 if (inp->inp_last_outifp != NULL &&
3177 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3178 en_tracing = TRUE;
3179 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3180 VM_KERNEL_ADDRPERM(so),
3181 ((so->so_state & SS_NBIO) ?
3182 kEnTrFlagNonBlocking : 0),
3183 (int64_t)orig_resid);
3184 }
3185 }
3186
3187 /*
3188 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3189 	 * regardless of the flags argument. Here is the case where
3190 * out-of-band data is not inline.
3191 */
3192 if ((flags & MSG_OOB) ||
3193 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3194 (so->so_options & SO_OOBINLINE) == 0 &&
3195 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3196 m = m_get(M_WAIT, MT_DATA);
3197 if (m == NULL) {
3198 socket_unlock(so, 1);
3199 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3200 ENOBUFS, 0, 0, 0, 0);
3201 return (ENOBUFS);
3202 }
3203 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3204 if (error)
3205 goto bad;
3206 socket_unlock(so, 0);
3207 do {
3208 error = uiomove(mtod(m, caddr_t),
3209 imin(uio_resid(uio), m->m_len), uio);
3210 m = m_free(m);
3211 } while (uio_resid(uio) && error == 0 && m != NULL);
3212 socket_lock(so, 0);
3213 bad:
3214 if (m != NULL)
3215 m_freem(m);
3216
3217 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3218 if (error == EWOULDBLOCK || error == EINVAL) {
3219 /*
3220 * Let's try to get normal data:
3221 				 * EWOULDBLOCK: out-of-band data not
3222 				 * received yet. EINVAL: out-of-band data
3223 * already read.
3224 */
3225 error = 0;
3226 goto nooob;
3227 } else if (error == 0 && flagsp != NULL) {
3228 *flagsp |= MSG_OOB;
3229 }
3230 }
3231 socket_unlock(so, 1);
3232 if (en_tracing) {
3233 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3234 VM_KERNEL_ADDRPERM(so), 0,
3235 (int64_t)(orig_resid - uio_resid(uio)));
3236 }
3237 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3238 0, 0, 0, 0);
3239
3240 return (error);
3241 }
3242 nooob:
3243 if (mp != NULL)
3244 *mp = NULL;
3245
3246 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3247 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3248 }
3249
3250 free_list = NULL;
3251 delayed_copy_len = 0;
3252 restart:
3253 #ifdef MORE_LOCKING_DEBUG
3254 if (so->so_usecount <= 1)
3255 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3256 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3257 #endif
3258 /*
3259 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3260 * and if so just return to the caller. This could happen when
3261 * soreceive() is called by a socket upcall function during the
3262 * time the socket is freed. The socket buffer would have been
3263 * locked across the upcall, therefore we cannot put this thread
3264 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3265 * we may livelock), because the lock on the socket buffer will
3266 * only be released when the upcall routine returns to its caller.
3267 * Because the socket has been officially closed, there can be
3268 * no further read on it.
3269 *
3270 * A multipath subflow socket would have its SS_NOFDREF set by
3271 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3272 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3273 */
3274 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3275 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3276 socket_unlock(so, 1);
3277 return (0);
3278 }
3279
3280 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3281 if (error) {
3282 socket_unlock(so, 1);
3283 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3284 0, 0, 0, 0);
3285 if (en_tracing) {
3286 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3287 VM_KERNEL_ADDRPERM(so), 0,
3288 (int64_t)(orig_resid - uio_resid(uio)));
3289 }
3290 return (error);
3291 }
3292
3293 m = so->so_rcv.sb_mb;
3294 /*
3295 * If we have less data than requested, block awaiting more
3296 * (subject to any timeout) if:
3297 * 1. the current count is less than the low water mark, or
3298 * 2. MSG_WAITALL is set, and it is possible to do the entire
3299 * receive operation at once if we block (resid <= hiwat).
3300 * 3. MSG_DONTWAIT is not set
3301 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3302 * we have to do the receive in sections, and thus risk returning
3303 * a short count if a timeout or signal occurs after we start.
3304 */
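	/*
	 * For example, with the default low water mark (sb_lowat == 1), a
	 * blocking 4096-byte read on a stream socket that already has 100
	 * bytes queued returns those 100 bytes rather than sleeping, unless
	 * MSG_WAITALL was passed and 4096 <= sb_hiwat, in which case we wait
	 * for the full amount.
	 */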
3305 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3306 so->so_rcv.sb_cc < uio_resid(uio)) &&
3307 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3308 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3309 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3310 /*
3311 * Panic if we notice inconsistencies in the socket's
3312 * receive list; both sb_mb and sb_cc should correctly
3313 * reflect the contents of the list, otherwise we may
3314 * end up with false positives during select() or poll()
3315 * which could put the application in a bad state.
3316 */
3317 SB_MB_CHECK(&so->so_rcv);
3318
3319 if (so->so_error) {
3320 if (m != NULL)
3321 goto dontblock;
3322 error = so->so_error;
3323 if ((flags & MSG_PEEK) == 0)
3324 so->so_error = 0;
3325 goto release;
3326 }
3327 if (so->so_state & SS_CANTRCVMORE) {
3328 #if CONTENT_FILTER
3329 /*
3330 * Deal with half closed connections
3331 */
3332 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3333 cfil_sock_data_pending(&so->so_rcv) != 0)
3334 CFIL_LOG(LOG_INFO,
3335 "so %llx ignore SS_CANTRCVMORE",
3336 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3337 else
3338 #endif /* CONTENT_FILTER */
3339 if (m != NULL)
3340 goto dontblock;
3341 else
3342 goto release;
3343 }
3344 for (; m != NULL; m = m->m_next)
3345 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3346 m = so->so_rcv.sb_mb;
3347 goto dontblock;
3348 }
3349 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3350 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3351 error = ENOTCONN;
3352 goto release;
3353 }
3354 if (uio_resid(uio) == 0)
3355 goto release;
3356
3357 if ((so->so_state & SS_NBIO) ||
3358 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3359 error = EWOULDBLOCK;
3360 goto release;
3361 }
3362 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3363 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3364 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3365 #if EVEN_MORE_LOCKING_DEBUG
3366 if (socket_debug)
3367 printf("Waiting for socket data\n");
3368 #endif
3369
3370 error = sbwait(&so->so_rcv);
3371 #if EVEN_MORE_LOCKING_DEBUG
3372 if (socket_debug)
3373 printf("SORECEIVE - sbwait returned %d\n", error);
3374 #endif
3375 if (so->so_usecount < 1) {
3376 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3377 __func__, so, so->so_usecount);
3378 /* NOTREACHED */
3379 }
3380 if (error) {
3381 socket_unlock(so, 1);
3382 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3383 0, 0, 0, 0);
3384 if (en_tracing) {
3385 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3386 VM_KERNEL_ADDRPERM(so), 0,
3387 (int64_t)(orig_resid - uio_resid(uio)));
3388 }
3389 return (error);
3390 }
3391 goto restart;
3392 }
3393 dontblock:
3394 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3395 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3396 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3397 nextrecord = m->m_nextpkt;
3398
3399 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3400 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3401 mp0 == NULL);
3402 if (error == ERESTART)
3403 goto restart;
3404 else if (error != 0)
3405 goto release;
3406 orig_resid = 0;
3407 }
3408
3409 /*
3410 * Process one or more MT_CONTROL mbufs present before any data mbufs
3411 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3412 * just copy the data; if !MSG_PEEK, we call into the protocol to
3413 * perform externalization.
3414 */
3415 if (m != NULL && m->m_type == MT_CONTROL) {
3416 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3417 if (error != 0)
3418 goto release;
3419 orig_resid = 0;
3420 }
3421
3422 /*
3423 * If the socket is a TCP socket with message delivery
3424 * enabled, then create a control msg to deliver the
3425 * relative TCP sequence number for this data. Waiting
3426 * until this point will protect against failures to
3427 * allocate an mbuf for control msgs.
3428 */
3429 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3430 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3431 struct mbuf *seq_cm;
3432
3433 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3434 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3435 if (seq_cm == NULL) {
3436 /* unable to allocate a control mbuf */
3437 error = ENOBUFS;
3438 goto release;
3439 }
3440 *controlp = seq_cm;
3441 controlp = &seq_cm->m_next;
3442 }
3443
3444 if (m != NULL) {
3445 if (!(flags & MSG_PEEK)) {
3446 /*
3447 * We get here because m points to an mbuf following
3448 * any MT_SONAME or MT_CONTROL mbufs which have been
3449 * processed above. In any case, m should be pointing
3450 * to the head of the mbuf chain, and the nextrecord
3451 * should be either NULL or equal to m->m_nextpkt.
3452 * See comments above about SB_LOCK.
3453 */
3454 if (m != so->so_rcv.sb_mb ||
3455 m->m_nextpkt != nextrecord) {
3456 panic("%s: post-control !sync so=%p m=%p "
3457 "nextrecord=%p\n", __func__, so, m,
3458 nextrecord);
3459 /* NOTREACHED */
3460 }
3461 if (nextrecord == NULL)
3462 so->so_rcv.sb_lastrecord = m;
3463 }
3464 type = m->m_type;
3465 if (type == MT_OOBDATA)
3466 flags |= MSG_OOB;
3467 } else {
3468 if (!(flags & MSG_PEEK)) {
3469 SB_EMPTY_FIXUP(&so->so_rcv);
3470 }
3471 }
3472 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3473 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3474
3475 moff = 0;
3476 offset = 0;
3477
3478 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3479 can_delay = 1;
3480 else
3481 can_delay = 0;
3482
3483 need_event = 0;
3484
3485 while (m != NULL &&
3486 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3487 if (m->m_type == MT_OOBDATA) {
3488 if (type != MT_OOBDATA)
3489 break;
3490 } else if (type == MT_OOBDATA) {
3491 break;
3492 }
3493 /*
3494 		 * Make sure to always set the MSG_OOB flag when getting
3495 		 * out-of-band data inline.
3496 */
3497 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3498 (so->so_options & SO_OOBINLINE) != 0 &&
3499 (so->so_state & SS_RCVATMARK) != 0) {
3500 flags |= MSG_OOB;
3501 }
3502 so->so_state &= ~SS_RCVATMARK;
3503 len = uio_resid(uio) - delayed_copy_len;
3504 if (so->so_oobmark && len > so->so_oobmark - offset)
3505 len = so->so_oobmark - offset;
3506 if (len > m->m_len - moff)
3507 len = m->m_len - moff;
3508 /*
3509 * If mp is set, just pass back the mbufs.
3510 * Otherwise copy them out via the uio, then free.
3511 * Sockbuf must be consistent here (points to current mbuf,
3512 * it points to next record) when we drop priority;
3513 * we must note any additions to the sockbuf when we
3514 * block interrupts again.
3515 */
3516 if (mp == NULL) {
3517 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3518 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3519 if (can_delay && len == m->m_len) {
3520 /*
3521 				 * Only delay the copy if we're consuming the
3522 				 * mbuf, we're NOT in MSG_PEEK mode, and we
3523 				 * have enough data to make it worthwhile to
3524 				 * drop and retake the lock... can_delay
3525 				 * reflects the state of the two latter
3526 				 * constraints; moff should always be zero
3527 				 * in these cases.
3528 */
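				/*
				 * For example, a large read serviced from a
				 * queue of full cluster mbufs consumes each
				 * mbuf whole (len == m_len), so the copies
				 * are batched onto free_list and pushed to
				 * user space later in batched
				 * sodelayed_copy() passes instead of one
				 * uiomove() per mbuf.
				 */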
3529 delayed_copy_len += len;
3530 } else {
3531 if (delayed_copy_len) {
3532 error = sodelayed_copy(so, uio,
3533 &free_list, &delayed_copy_len);
3534
3535 if (error) {
3536 goto release;
3537 }
3538 /*
3539 					 * We can only get here if MSG_PEEK is
3540 					 * not set; therefore, m should point at
3541 					 * the head of the rcv queue. If it
3542 					 * doesn't, something drastic changed
3543 					 * while we were out from behind the
3544 					 * lock in sodelayed_copy, perhaps a RST
3545 					 * on the stream. In any event, the
3546 					 * stream has been interrupted. It's
3547 					 * probably best just to return whatever
3548 					 * data we've moved and let the caller
3549 					 * sort it out...
3550 */
3551 if (m != so->so_rcv.sb_mb) {
3552 break;
3553 }
3554 }
3555 socket_unlock(so, 0);
3556 error = uiomove(mtod(m, caddr_t) + moff,
3557 (int)len, uio);
3558 socket_lock(so, 0);
3559
3560 if (error)
3561 goto release;
3562 }
3563 } else {
3564 uio_setresid(uio, (uio_resid(uio) - len));
3565 }
3566 if (len == m->m_len - moff) {
3567 if (m->m_flags & M_EOR)
3568 flags |= MSG_EOR;
3569 if (flags & MSG_PEEK) {
3570 m = m->m_next;
3571 moff = 0;
3572 } else {
3573 nextrecord = m->m_nextpkt;
3574 sbfree(&so->so_rcv, m);
3575 m->m_nextpkt = NULL;
3576
3577 /*
3578 * If this packet is an unordered packet
3579 * (indicated by M_UNORDERED_DATA flag), remove
3580 * the additional bytes added to the
3581 * receive socket buffer size.
3582 */
3583 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3584 m->m_len &&
3585 (m->m_flags & M_UNORDERED_DATA) &&
3586 sbreserve(&so->so_rcv,
3587 so->so_rcv.sb_hiwat - m->m_len)) {
3588 if (so->so_msg_state->msg_uno_bytes >
3589 m->m_len) {
3590 so->so_msg_state->
3591 msg_uno_bytes -= m->m_len;
3592 } else {
3593 so->so_msg_state->
3594 msg_uno_bytes = 0;
3595 }
3596 m->m_flags &= ~M_UNORDERED_DATA;
3597 }
3598
3599 if (mp != NULL) {
3600 *mp = m;
3601 mp = &m->m_next;
3602 so->so_rcv.sb_mb = m = m->m_next;
3603 *mp = NULL;
3604 } else {
3605 if (free_list == NULL)
3606 free_list = m;
3607 else
3608 ml->m_next = m;
3609 ml = m;
3610 so->so_rcv.sb_mb = m = m->m_next;
3611 ml->m_next = NULL;
3612 }
3613 if (m != NULL) {
3614 m->m_nextpkt = nextrecord;
3615 if (nextrecord == NULL)
3616 so->so_rcv.sb_lastrecord = m;
3617 } else {
3618 so->so_rcv.sb_mb = nextrecord;
3619 SB_EMPTY_FIXUP(&so->so_rcv);
3620 }
3621 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3622 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3623 }
3624 } else {
3625 if (flags & MSG_PEEK) {
3626 moff += len;
3627 } else {
3628 if (mp != NULL) {
3629 int copy_flag;
3630
3631 if (flags & MSG_DONTWAIT)
3632 copy_flag = M_DONTWAIT;
3633 else
3634 copy_flag = M_WAIT;
3635 *mp = m_copym(m, 0, len, copy_flag);
3636 /*
3637 * Failed to allocate an mbuf?
3638 * Adjust uio_resid back, it was
3639 * adjusted down by len bytes which
3640 * we didn't copy over.
3641 */
3642 if (*mp == NULL) {
3643 uio_setresid(uio,
3644 (uio_resid(uio) + len));
3645 break;
3646 }
3647 }
3648 m->m_data += len;
3649 m->m_len -= len;
3650 so->so_rcv.sb_cc -= len;
3651 }
3652 }
3653 if (so->so_oobmark) {
3654 if ((flags & MSG_PEEK) == 0) {
3655 so->so_oobmark -= len;
3656 if (so->so_oobmark == 0) {
3657 so->so_state |= SS_RCVATMARK;
3658 /*
3659 * delay posting the actual event until
3660 * after any delayed copy processing
3661 * has finished
3662 */
3663 need_event = 1;
3664 break;
3665 }
3666 } else {
3667 offset += len;
3668 if (offset == so->so_oobmark)
3669 break;
3670 }
3671 }
3672 if (flags & MSG_EOR)
3673 break;
3674 /*
3675 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3676 * (for non-atomic socket), we must not quit until
3677 * "uio->uio_resid == 0" or an error termination.
3678 * If a signal/timeout occurs, return with a short
3679 * count but without error. Keep sockbuf locked
3680 * against other readers.
3681 */
3682 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3683 (uio_resid(uio) - delayed_copy_len) > 0 &&
3684 !sosendallatonce(so) && !nextrecord) {
3685 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3686 #if CONTENT_FILTER
3687 && cfil_sock_data_pending(&so->so_rcv) == 0
3688 #endif /* CONTENT_FILTER */
3689 ))
3690 goto release;
3691
3692 /*
3693 * Depending on the protocol (e.g. TCP), the following
3694 * might cause the socket lock to be dropped and later
3695 * be reacquired, and more data could have arrived and
3696 * have been appended to the receive socket buffer by
3697 * the time it returns. Therefore, we only sleep in
3698 * sbwait() below if and only if the socket buffer is
3699 * empty, in order to avoid a false sleep.
3700 */
3701 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3702 (((struct inpcb *)so->so_pcb)->inp_state !=
3703 INPCB_STATE_DEAD))
3704 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3705
3706 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3707 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3708
3709 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3710 error = 0;
3711 goto release;
3712 }
3713 /*
3714 			 * We have to wait until after we get back from the
3715 			 * sbwait to do the copy, because we will drop the
3716 			 * lock if we have enough data that has been delayed...
3717 			 * By dropping the lock we open up a window allowing
3718 			 * the netisr thread to process the incoming packets
3719 			 * and to change the state of this socket... We're
3720 			 * issuing the sbwait because the socket is empty and
3721 			 * we're expecting the netisr thread to wake us up
3722 			 * when more packets arrive; if we allow that
3723 			 * processing to happen and then sbwait, we could
3724 			 * stall forever with packets sitting in the socket
3725 			 * if no further packets arrive from the remote side.
3726 			 *
3727 			 * We want to copy before we've collected all the
3728 			 * data to satisfy this request, to allow the copy to
3729 			 * overlap the incoming packet processing on an MP system.
3730 */
3731 if (delayed_copy_len > sorecvmincopy &&
3732 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3733 error = sodelayed_copy(so, uio,
3734 &free_list, &delayed_copy_len);
3735
3736 if (error)
3737 goto release;
3738 }
3739 m = so->so_rcv.sb_mb;
3740 if (m != NULL) {
3741 nextrecord = m->m_nextpkt;
3742 }
3743 SB_MB_CHECK(&so->so_rcv);
3744 }
3745 }
3746 #ifdef MORE_LOCKING_DEBUG
3747 if (so->so_usecount <= 1) {
3748 panic("%s: after big while so=%p ref=%d on socket\n",
3749 __func__, so, so->so_usecount);
3750 /* NOTREACHED */
3751 }
3752 #endif
3753
3754 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3755 if (so->so_options & SO_DONTTRUNC) {
3756 flags |= MSG_RCVMORE;
3757 } else {
3758 flags |= MSG_TRUNC;
3759 if ((flags & MSG_PEEK) == 0)
3760 (void) sbdroprecord(&so->so_rcv);
3761 }
3762 }
3763
3764 /*
3765 * pru_rcvd below (for TCP) may cause more data to be received
3766 * if the socket lock is dropped prior to sending the ACK; some
3767 * legacy OpenTransport applications don't handle this well
3768 * (if it receives less data than requested while MSG_HAVEMORE
3769 * is set), and so we set the flag now based on what we know
3770 * prior to calling pru_rcvd.
3771 */
3772 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3773 flags |= MSG_HAVEMORE;
3774
3775 if ((flags & MSG_PEEK) == 0) {
3776 if (m == NULL) {
3777 so->so_rcv.sb_mb = nextrecord;
3778 /*
3779 * First part is an inline SB_EMPTY_FIXUP(). Second
3780 * part makes sure sb_lastrecord is up-to-date if
3781 * there is still data in the socket buffer.
3782 */
3783 if (so->so_rcv.sb_mb == NULL) {
3784 so->so_rcv.sb_mbtail = NULL;
3785 so->so_rcv.sb_lastrecord = NULL;
3786 } else if (nextrecord->m_nextpkt == NULL) {
3787 so->so_rcv.sb_lastrecord = nextrecord;
3788 }
3789 SB_MB_CHECK(&so->so_rcv);
3790 }
3791 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3792 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3793 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3794 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3795 }
3796
3797 if (delayed_copy_len) {
3798 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3799 if (error)
3800 goto release;
3801 }
3802 if (free_list != NULL) {
3803 m_freem_list(free_list);
3804 free_list = NULL;
3805 }
3806 if (need_event)
3807 postevent(so, 0, EV_OOB);
3808
3809 if (orig_resid == uio_resid(uio) && orig_resid &&
3810 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3811 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3812 goto restart;
3813 }
3814
3815 if (flagsp != NULL)
3816 *flagsp |= flags;
3817 release:
3818 #ifdef MORE_LOCKING_DEBUG
3819 if (so->so_usecount <= 1) {
3820 panic("%s: release so=%p ref=%d on socket\n", __func__,
3821 so, so->so_usecount);
3822 /* NOTREACHED */
3823 }
3824 #endif
3825 if (delayed_copy_len)
3826 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3827
3828 if (free_list != NULL)
3829 m_freem_list(free_list);
3830
3831 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3832
3833 if (en_tracing) {
3834 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3835 VM_KERNEL_ADDRPERM(so),
3836 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3837 (int64_t)(orig_resid - uio_resid(uio)));
3838 }
3839 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3840 so->so_rcv.sb_cc, 0, error);
3841
3842 return (error);
3843 }
3844
3845 /*
3846 * Returns: 0 Success
3847 * uiomove:EFAULT
3848 */
3849 static int
3850 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3851 user_ssize_t *resid)
3852 {
3853 int error = 0;
3854 struct mbuf *m;
3855
3856 m = *free_list;
3857
3858 socket_unlock(so, 0);
3859
3860 while (m != NULL && error == 0) {
3861 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3862 m = m->m_next;
3863 }
3864 m_freem_list(*free_list);
3865
3866 *free_list = NULL;
3867 *resid = 0;
3868
3869 socket_lock(so, 0);
3870
3871 return (error);
3872 }
3873
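/*
 * Copy the packet chains accumulated on free_list out to the uio of the
 * corresponding msgarray slot, then free the whole list.  The socket itself
 * is not referenced here (callers drop the socket lock before invoking it),
 * hence the #pragma unused(so) below.
 */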
3874 static int
3875 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3876 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3877 {
3878 #pragma unused(so)
3879 int error = 0;
3880 struct mbuf *ml, *m;
3881 int i = 0;
3882 struct uio *auio;
3883
3884 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3885 ml = ml->m_nextpkt, i++) {
3886 auio = msgarray[i].uio;
3887 for (m = ml; m != NULL; m = m->m_next) {
3888 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3889 if (error != 0)
3890 goto out;
3891 }
3892 }
3893 out:
3894 m_freem_list(*free_list);
3895
3896 *free_list = NULL;
3897 *resid = 0;
3898
3899 return (error);
3900 }
3901
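/*
 * Batch receive: fill up to uiocnt entries of msgarray with one datagram
 * each.  Only SOCK_DGRAM sockets are accepted, and the routine prefers to
 * return what it already has over blocking once at least one packet has
 * been consumed (see the "Do not block if we got some data" check below).
 */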
3902 int
3903 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3904 int *flagsp)
3905 {
3906 struct mbuf *m;
3907 struct mbuf *nextrecord;
3908 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3909 int error;
3910 user_ssize_t len, pktlen, delayed_copy_len = 0;
3911 struct protosw *pr = so->so_proto;
3912 user_ssize_t resid;
3913 struct proc *p = current_proc();
3914 struct uio *auio = NULL;
3915 int npkts = 0;
3916 int sblocked = 0;
3917 struct sockaddr **psa = NULL;
3918 struct mbuf **controlp = NULL;
3919 int can_delay;
3920 int flags;
3921 struct mbuf *free_others = NULL;
3922
3923 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3924 so, uiocnt,
3925 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3926
3927 /*
3928 * Sanity checks:
3929 * - Only supports don't-wait style flags
3930 * - Only supports datagram sockets (could be extended to raw)
3931 * - Must be atomic
3932 * - Protocol must support packet chains
3933 * - The uio array must not be NULL (should we panic?)
3934 */
3935 if (flagsp != NULL)
3936 flags = *flagsp;
3937 else
3938 flags = 0;
3939 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3940 MSG_NBIO)) {
3941 printf("%s invalid flags 0x%x\n", __func__, flags);
3942 error = EINVAL;
3943 goto out;
3944 }
3945 if (so->so_type != SOCK_DGRAM) {
3946 error = EINVAL;
3947 goto out;
3948 }
3949 if (sosendallatonce(so) == 0) {
3950 error = EINVAL;
3951 goto out;
3952 }
3953 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3954 error = EPROTONOSUPPORT;
3955 goto out;
3956 }
3957 if (msgarray == NULL) {
3958 printf("%s msgarray is NULL\n", __func__);
3959 error = EINVAL;
3960 goto out;
3961 }
3962 if (uiocnt == 0) {
3963 printf("%s uiocnt is 0\n", __func__);
3964 error = EINVAL;
3965 goto out;
3966 }
3967 /*
3968 * Sanity check on the length passed by caller as we are making 'int'
3969 * comparisons
3970 */
3971 resid = recv_msg_array_resid(msgarray, uiocnt);
3972 if (resid < 0 || resid > INT_MAX) {
3973 error = EINVAL;
3974 goto out;
3975 }
3976
3977 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3978 can_delay = 1;
3979 else
3980 can_delay = 0;
3981
3982 socket_lock(so, 1);
3983 so_update_last_owner_locked(so, p);
3984 so_update_policy(so);
3985
3986 #if NECP
3987 so_update_necp_policy(so, NULL, NULL);
3988 #endif /* NECP */
3989
3990 /*
3991 * If a recv attempt is made on a previously-accepted socket
3992 * that has been marked as inactive (disconnected), reject
3993 * the request.
3994 */
3995 if (so->so_flags & SOF_DEFUNCT) {
3996 struct sockbuf *sb = &so->so_rcv;
3997
3998 error = ENOTCONN;
3999 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4000 __func__, proc_pid(p), proc_best_name(p),
4001 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4002 SOCK_DOM(so), SOCK_TYPE(so), error);
4003 /*
4004 * This socket should have been disconnected and flushed
4005 * prior to being returned from sodefunct(); there should
4006 * be no data on its receive list, so panic otherwise.
4007 */
4008 if (so->so_state & SS_DEFUNCT)
4009 sb_empty_assert(sb, __func__);
4010 goto release;
4011 }
4012
4013 next:
4014 /*
4015 * Stop once the uio array has been exhausted
4016 */
4017 if (npkts >= uiocnt) {
4018 error = 0;
4019 goto release;
4020 }
4021 restart:
4022 /*
4023 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4024 * and if so just return to the caller. This could happen when
4025 * soreceive() is called by a socket upcall function during the
4026 * time the socket is freed. The socket buffer would have been
4027 * locked across the upcall, therefore we cannot put this thread
4028 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4029 * we may livelock), because the lock on the socket buffer will
4030 * only be released when the upcall routine returns to its caller.
4031 * Because the socket has been officially closed, there can be
4032 * no further read on it.
4033 */
4034 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4035 (SS_NOFDREF | SS_CANTRCVMORE)) {
4036 error = 0;
4037 goto release;
4038 }
4039
4040 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4041 if (error) {
4042 goto release;
4043 }
4044 sblocked = 1;
4045
4046 m = so->so_rcv.sb_mb;
4047 /*
4048 * Block awaiting more datagrams if needed
4049 */
4050 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4051 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4052 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4053 /*
4054 * Panic if we notice inconsistencies in the socket's
4055 * receive list; both sb_mb and sb_cc should correctly
4056 * reflect the contents of the list, otherwise we may
4057 * end up with false positives during select() or poll()
4058 * which could put the application in a bad state.
4059 */
4060 SB_MB_CHECK(&so->so_rcv);
4061
4062 if (so->so_error) {
4063 error = so->so_error;
4064 if ((flags & MSG_PEEK) == 0)
4065 so->so_error = 0;
4066 goto release;
4067 }
4068 if (so->so_state & SS_CANTRCVMORE) {
4069 goto release;
4070 }
4071 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4072 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4073 error = ENOTCONN;
4074 goto release;
4075 }
4076 if ((so->so_state & SS_NBIO) ||
4077 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4078 error = EWOULDBLOCK;
4079 goto release;
4080 }
4081 /*
4082 * Do not block if we got some data
4083 */
4084 if (free_list != NULL) {
4085 error = 0;
4086 goto release;
4087 }
4088
4089 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4090 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4091
4092 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4093 sblocked = 0;
4094
4095 error = sbwait(&so->so_rcv);
4096 if (error) {
4097 goto release;
4098 }
4099 goto restart;
4100 }
4101
4102 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4103 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4104 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4105
4106 /*
4107 * Consume the current uio index as we have a datagram
4108 */
4109 auio = msgarray[npkts].uio;
4110 resid = uio_resid(auio);
4111 msgarray[npkts].which |= SOCK_MSG_DATA;
4112 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4113 &msgarray[npkts].psa : NULL;
4114 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4115 &msgarray[npkts].controlp : NULL;
4116 npkts += 1;
4117 nextrecord = m->m_nextpkt;
4118
4119 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4120 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4121 if (error == ERESTART)
4122 goto restart;
4123 else if (error != 0)
4124 goto release;
4125 }
4126
4127 if (m != NULL && m->m_type == MT_CONTROL) {
4128 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4129 if (error != 0)
4130 goto release;
4131 }
4132
4133 if (m->m_pkthdr.len == 0) {
4134 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4135 __func__, __LINE__,
4136 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4137 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4138 m->m_type);
4139 }
4140
4141 /*
4142 * Loop to copy the mbufs of the current record
4143 * Support zero length packets
4144 */
4145 ml = NULL;
4146 pktlen = 0;
4147 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4148 if (m->m_len == 0)
4149 panic("%p m_len zero", m);
4150 if (m->m_type == 0)
4151 panic("%p m_type zero", m);
4152 /*
4153 * Clip to the residual length
4154 */
4155 if (len > m->m_len)
4156 len = m->m_len;
4157 pktlen += len;
4158 /*
4159 * Copy the mbufs via the uio or delay the copy
4160 * Sockbuf must be consistent here (m points to the current mbuf,
4161 * nextrecord points to the next record) when we drop priority;
4162 * we must note any additions to the sockbuf when we
4163 * block interrupts again.
4164 */
4165 if (len > 0 && can_delay == 0) {
4166 socket_unlock(so, 0);
4167 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4168 socket_lock(so, 0);
4169 if (error)
4170 goto release;
4171 } else {
4172 delayed_copy_len += len;
4173 }
4174
4175 if (len == m->m_len) {
4176 /*
4177 * m was entirely copied
4178 */
4179 sbfree(&so->so_rcv, m);
4180 nextrecord = m->m_nextpkt;
4181 m->m_nextpkt = NULL;
4182
4183 /*
4184 * Set the first packet to the head of the free list
4185 */
4186 if (free_list == NULL)
4187 free_list = m;
4188 /*
4189 * Link current packet to tail of free list
4190 */
4191 if (ml == NULL) {
4192 if (free_tail != NULL)
4193 free_tail->m_nextpkt = m;
4194 free_tail = m;
4195 }
4196 /*
4197 * Link current mbuf to last mbuf of current packet
4198 */
4199 if (ml != NULL)
4200 ml->m_next = m;
4201 ml = m;
4202
4203 /*
4204 * Move next buf to head of socket buffer
4205 */
4206 so->so_rcv.sb_mb = m = ml->m_next;
4207 ml->m_next = NULL;
4208
4209 if (m != NULL) {
4210 m->m_nextpkt = nextrecord;
4211 if (nextrecord == NULL)
4212 so->so_rcv.sb_lastrecord = m;
4213 } else {
4214 so->so_rcv.sb_mb = nextrecord;
4215 SB_EMPTY_FIXUP(&so->so_rcv);
4216 }
4217 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4218 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4219 } else {
4220 /*
4221 * Stop the loop on partial copy
4222 */
4223 break;
4224 }
4225 }
4226 #ifdef MORE_LOCKING_DEBUG
4227 if (so->so_usecount <= 1) {
4228 panic("%s: after big while so=%llx ref=%d on socket\n",
4229 __func__,
4230 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4231 /* NOTREACHED */
4232 }
4233 #endif
4234 /*
4235 * Tell the caller we made a partial copy
4236 */
4237 if (m != NULL) {
4238 if (so->so_options & SO_DONTTRUNC) {
4239 /*
4240 * Copy out the free list first, then the partial mbuf
4241 */
4242 socket_unlock(so, 0);
4243 if (delayed_copy_len)
4244 error = sodelayed_copy_list(so, msgarray,
4245 uiocnt, &free_list, &delayed_copy_len);
4246
4247 if (error == 0) {
4248 error = uiomove(mtod(m, caddr_t), (int)len,
4249 auio);
4250 }
4251 socket_lock(so, 0);
4252 if (error)
4253 goto release;
4254
4255 m->m_data += len;
4256 m->m_len -= len;
4257 so->so_rcv.sb_cc -= len;
4258 flags |= MSG_RCVMORE;
4259 } else {
4260 (void) sbdroprecord(&so->so_rcv);
4261 nextrecord = so->so_rcv.sb_mb;
4262 m = NULL;
4263 flags |= MSG_TRUNC;
4264 }
4265 }
4266
4267 if (m == NULL) {
4268 so->so_rcv.sb_mb = nextrecord;
4269 /*
4270 * First part is an inline SB_EMPTY_FIXUP(). Second
4271 * part makes sure sb_lastrecord is up-to-date if
4272 * there is still data in the socket buffer.
4273 */
4274 if (so->so_rcv.sb_mb == NULL) {
4275 so->so_rcv.sb_mbtail = NULL;
4276 so->so_rcv.sb_lastrecord = NULL;
4277 } else if (nextrecord->m_nextpkt == NULL) {
4278 so->so_rcv.sb_lastrecord = nextrecord;
4279 }
4280 SB_MB_CHECK(&so->so_rcv);
4281 }
4282 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4283 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4284
4285 /*
4286 * We can continue to the next packet as long as:
4287 * - We haven't exhausted the uio array
4288 * - There was no error
4289 * - A packet was not truncated
4290 * - We can still receive more data
4291 */
4292 if (npkts < uiocnt && error == 0 &&
4293 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4294 (so->so_state & SS_CANTRCVMORE) == 0) {
4295 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4296 sblocked = 0;
4297
4298 goto next;
4299 }
4300 if (flagsp != NULL)
4301 *flagsp |= flags;
4302
4303 release:
4304 /*
4305 * pru_rcvd may cause more data to be received if the socket lock
4306 * is dropped so we set MSG_HAVEMORE now based on what we know.
4307 * That way the caller won't be surprised if it receives less data
4308 * than requested.
4309 */
4310 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4311 flags |= MSG_HAVEMORE;
4312
4313 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4314 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4315
4316 if (sblocked)
4317 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4318 else
4319 socket_unlock(so, 1);
4320
4321 if (delayed_copy_len)
4322 error = sodelayed_copy_list(so, msgarray, uiocnt,
4323 &free_list, &delayed_copy_len);
4324 out:
4325 /*
4326 * Amortize the cost of freeing the mbufs
4327 */
4328 if (free_list != NULL)
4329 m_freem_list(free_list);
4330 if (free_others != NULL)
4331 m_freem_list(free_others);
4332
4333 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4334 0, 0, 0, 0);
4335 return (error);
4336 }
4337
4338 /*
4339 * Returns: 0 Success
4340 * EINVAL
4341 * ENOTCONN
4342 * <pru_shutdown>:EINVAL
4343 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4344 * <pru_shutdown>:ENOBUFS[TCP]
4345 * <pru_shutdown>:EMSGSIZE[TCP]
4346 * <pru_shutdown>:EHOSTUNREACH[TCP]
4347 * <pru_shutdown>:ENETUNREACH[TCP]
4348 * <pru_shutdown>:ENETDOWN[TCP]
4349 * <pru_shutdown>:ENOMEM[TCP]
4350 * <pru_shutdown>:EACCES[TCP]
4351 * <pru_shutdown>:EMSGSIZE[TCP]
4352 * <pru_shutdown>:ENOBUFS[TCP]
4353 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4354 * <pru_shutdown>:??? [other protocol families]
4355 */
4356 int
4357 soshutdown(struct socket *so, int how)
4358 {
4359 int error;
4360
4361 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4362
4363 switch (how) {
4364 case SHUT_RD:
4365 case SHUT_WR:
4366 case SHUT_RDWR:
4367 socket_lock(so, 1);
4368 if ((so->so_state &
4369 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4370 error = ENOTCONN;
4371 } else {
4372 error = soshutdownlock(so, how);
4373 }
4374 socket_unlock(so, 1);
4375 break;
4376 default:
4377 error = EINVAL;
4378 break;
4379 }
4380
4381 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4382
4383 return (error);
4384 }
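/*
 * Illustrative only (not part of the kernel build): a user process reaches
 * the path above via the shutdown(2) system call, e.g.
 *
 *	shutdown(sockfd, SHUT_WR);
 *
 * which stops further sends so the peer then reads EOF.  An unconnected
 * socket is rejected with ENOTCONN and an invalid "how" with EINVAL, as
 * handled above.
 */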
4385
4386 int
4387 soshutdownlock_final(struct socket *so, int how)
4388 {
4389 struct protosw *pr = so->so_proto;
4390 int error = 0;
4391
4392 sflt_notify(so, sock_evt_shutdown, &how);
4393
4394 if (how != SHUT_WR) {
4395 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4396 /* read already shut down */
4397 error = ENOTCONN;
4398 goto done;
4399 }
4400 sorflush(so);
4401 postevent(so, 0, EV_RCLOSED);
4402 }
4403 if (how != SHUT_RD) {
4404 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4405 /* write already shut down */
4406 error = ENOTCONN;
4407 goto done;
4408 }
4409 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4410 postevent(so, 0, EV_WCLOSED);
4411 }
4412 done:
4413 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4414 return (error);
4415 }
4416
4417 int
4418 soshutdownlock(struct socket *so, int how)
4419 {
4420 int error = 0;
4421
4422 #if CONTENT_FILTER
4423 /*
4424 * A content filter may delay the actual shutdown until it
4425 * has processed the pending data
4426 */
4427 if (so->so_flags & SOF_CONTENT_FILTER) {
4428 error = cfil_sock_shutdown(so, &how);
4429 if (error == EJUSTRETURN) {
4430 error = 0;
4431 goto done;
4432 } else if (error != 0) {
4433 goto done;
4434 }
4435 }
4436 #endif /* CONTENT_FILTER */
4437
4438 error = soshutdownlock_final(so, how);
4439
4440 done:
4441 return (error);
4442 }
4443
4444 void
4445 sowflush(struct socket *so)
4446 {
4447 struct sockbuf *sb = &so->so_snd;
4448
4449 /*
4450 * Obtain lock on the socket buffer (SB_LOCK). This is required
4451 * to prevent the socket buffer from being unexpectedly altered
4452 * while it is used by another thread in socket send/receive.
4453 *
4454 * sblock() must not fail here, hence the assertion.
4455 */
4456 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4457 VERIFY(sb->sb_flags & SB_LOCK);
4458
4459 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4460 sb->sb_flags |= SB_DROP;
4461 sb->sb_upcall = NULL;
4462 sb->sb_upcallarg = NULL;
4463
4464 sbunlock(sb, TRUE); /* keep socket locked */
4465
4466 selthreadclear(&sb->sb_sel);
4467 sbrelease(sb);
4468 }
4469
4470 void
4471 sorflush(struct socket *so)
4472 {
4473 struct sockbuf *sb = &so->so_rcv;
4474 struct protosw *pr = so->so_proto;
4475 struct sockbuf asb;
4476 #ifdef notyet
4477 lck_mtx_t *mutex_held;
4478 /*
4479 * XXX: This code is currently commented out, because we may get here
4480 * as part of sofreelastref(), and at that time, pr_getlock() may no
4481 * longer be able to return us the lock; this will be fixed in future.
4482 */
4483 if (so->so_proto->pr_getlock != NULL)
4484 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4485 else
4486 mutex_held = so->so_proto->pr_domain->dom_mtx;
4487
4488 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4489 #endif /* notyet */
4490
4491 sflt_notify(so, sock_evt_flush_read, NULL);
4492
4493 socantrcvmore(so);
4494
4495 /*
4496 * Obtain lock on the socket buffer (SB_LOCK). This is required
4497 * to prevent the socket buffer from being unexpectedly altered
4498 * while it is used by another thread in socket send/receive.
4499 *
4500 * sblock() must not fail here, hence the assertion.
4501 */
4502 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4503 VERIFY(sb->sb_flags & SB_LOCK);
4504
4505 /*
4506 * Copy only the relevant fields from "sb" to "asb" which we
4507 * need for sbrelease() to function. In particular, skip
4508 * sb_sel as it contains the wait queue linkage, which would
4509 * wreak havoc if we were to issue selthreadclear() on "asb".
4510 * Make sure to not carry over SB_LOCK in "asb", as we need
4511 * to acquire it later as part of sbrelease().
4512 */
4513 bzero(&asb, sizeof (asb));
4514 asb.sb_cc = sb->sb_cc;
4515 asb.sb_hiwat = sb->sb_hiwat;
4516 asb.sb_mbcnt = sb->sb_mbcnt;
4517 asb.sb_mbmax = sb->sb_mbmax;
4518 asb.sb_ctl = sb->sb_ctl;
4519 asb.sb_lowat = sb->sb_lowat;
4520 asb.sb_mb = sb->sb_mb;
4521 asb.sb_mbtail = sb->sb_mbtail;
4522 asb.sb_lastrecord = sb->sb_lastrecord;
4523 asb.sb_so = sb->sb_so;
4524 asb.sb_flags = sb->sb_flags;
4525 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4526 asb.sb_flags |= SB_DROP;
4527
4528 /*
4529 * Ideally we'd bzero() these and preserve the ones we need;
4530 * but to do that we'd need to shuffle things around in the
4531 * sockbuf, and we can't do it now because there are KEXTS
4532 * that are directly referring to the socket structure.
4533 *
4534 * Setting SB_DROP acts as a barrier to prevent further appends.
4535 * Clearing SB_SEL is done for selthreadclear() below.
4536 */
4537 sb->sb_cc = 0;
4538 sb->sb_hiwat = 0;
4539 sb->sb_mbcnt = 0;
4540 sb->sb_mbmax = 0;
4541 sb->sb_ctl = 0;
4542 sb->sb_lowat = 0;
4543 sb->sb_mb = NULL;
4544 sb->sb_mbtail = NULL;
4545 sb->sb_lastrecord = NULL;
4546 sb->sb_timeo.tv_sec = 0;
4547 sb->sb_timeo.tv_usec = 0;
4548 sb->sb_upcall = NULL;
4549 sb->sb_upcallarg = NULL;
4550 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4551 sb->sb_flags |= SB_DROP;
4552
4553 sbunlock(sb, TRUE); /* keep socket locked */
4554
4555 /*
4556 * Note that selthreadclear() is called on the original "sb" and
4557 * not the local "asb" because of the way wait queue linkage is
4558 * implemented. Given that selwakeup() may be triggered, SB_SEL
4559 * should no longer be set (cleared above.)
4560 */
4561 selthreadclear(&sb->sb_sel);
4562
4563 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4564 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4565
4566 sbrelease(&asb);
4567 }
4568
4569 /*
4570 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4571 * an additional variant to handle the case where the option value needs
4572 * to be some kind of integer, but not a specific size.
4573 * In addition to their use here, these functions are also called by the
4574 * protocol-level pr_ctloutput() routines.
4575 *
4576 * Returns: 0 Success
4577 * EINVAL
4578 * copyin:EFAULT
4579 */
4580 int
4581 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4582 {
4583 size_t valsize;
4584
4585 /*
4586 * If the user gives us more than we wanted, we ignore it,
4587 * but if we don't get the minimum length the caller
4588 * wants, we return EINVAL. On success, sopt->sopt_valsize
4589 * is set to however much we actually retrieved.
4590 */
4591 if ((valsize = sopt->sopt_valsize) < minlen)
4592 return (EINVAL);
4593 if (valsize > len)
4594 sopt->sopt_valsize = valsize = len;
4595
4596 if (sopt->sopt_p != kernproc)
4597 return (copyin(sopt->sopt_val, buf, valsize));
4598
4599 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4600 return (0);
4601 }
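/*
 * Typical call pattern (sketch): protocol handlers copy a fixed-size option
 * value in with
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *
 * which fails with EINVAL if the caller supplied fewer than sizeof (optval)
 * bytes and silently ignores any excess.
 */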
4602
4603 /*
4604 * sooptcopyin_timeval
4605 * Copy a timeval value into tv_p, taking into account whether the
4606 * calling process is 64-bit or 32-bit. Moved the sanity checking
4607 * code here so that we can verify the 64-bit tv_sec value before we lose
4608 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4609 */
4610 static int
4611 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4612 {
4613 int error;
4614
4615 if (proc_is64bit(sopt->sopt_p)) {
4616 struct user64_timeval tv64;
4617
4618 if (sopt->sopt_valsize < sizeof (tv64))
4619 return (EINVAL);
4620
4621 sopt->sopt_valsize = sizeof (tv64);
4622 if (sopt->sopt_p != kernproc) {
4623 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4624 if (error != 0)
4625 return (error);
4626 } else {
4627 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4628 sizeof (tv64));
4629 }
4630 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4631 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4632 return (EDOM);
4633
4634 tv_p->tv_sec = tv64.tv_sec;
4635 tv_p->tv_usec = tv64.tv_usec;
4636 } else {
4637 struct user32_timeval tv32;
4638
4639 if (sopt->sopt_valsize < sizeof (tv32))
4640 return (EINVAL);
4641
4642 sopt->sopt_valsize = sizeof (tv32);
4643 if (sopt->sopt_p != kernproc) {
4644 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4645 if (error != 0) {
4646 return (error);
4647 }
4648 } else {
4649 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4650 sizeof (tv32));
4651 }
4652 #ifndef __LP64__
4653 /*
4654 * K64todo "comparison is always false due to
4655 * limited range of data type"
4656 */
4657 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4658 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4659 return (EDOM);
4660 #endif
4661 tv_p->tv_sec = tv32.tv_sec;
4662 tv_p->tv_usec = tv32.tv_usec;
4663 }
4664 return (0);
4665 }
4666
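/*
 * Check a privilege against the credential that effectively owns the socket:
 * for delegated sockets the effective process's credential is used,
 * otherwise the socket's own credential (so_cred).
 */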
4667 static int
4668 soopt_cred_check(struct socket *so, int priv)
4669 {
4670 kauth_cred_t cred = NULL;
4671 proc_t ep = PROC_NULL;
4672 int error;
4673
4674 if (so->so_flags & SOF_DELEGATED) {
4675 ep = proc_find(so->e_pid);
4676 if (ep)
4677 cred = kauth_cred_proc_ref(ep);
4678 }
4679 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4680 if (cred)
4681 kauth_cred_unref(&cred);
4682 if (ep != PROC_NULL)
4683 proc_rele(ep);
4684
4685 return (error);
4686 }
4687
4688 /*
4689 * Returns: 0 Success
4690 * EINVAL
4691 * ENOPROTOOPT
4692 * ENOBUFS
4693 * EDOM
4694 * sooptcopyin:EINVAL
4695 * sooptcopyin:EFAULT
4696 * sooptcopyin_timeval:EINVAL
4697 * sooptcopyin_timeval:EFAULT
4698 * sooptcopyin_timeval:EDOM
4699 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4700 * <pr_ctloutput>:???
4701 * sflt_attach_private:??? [whatever a filter author chooses]
4702 * <sf_setoption>:??? [whatever a filter author chooses]
4703 *
4704 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4705 * <sf_setoption> returns depend on what the filter author causes
4706 * their filter to return.
4707 */
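/*
 * Illustrative userland usage (assumption, not part of this file): the
 * socket-level options handled below arrive via setsockopt(2), e.g.
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &l, sizeof (l));
 *
 * which lands here with sopt_level == SOL_SOCKET and sopt_name == SO_LINGER.
 */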
4708 int
4709 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4710 {
4711 int error, optval;
4712 struct linger l;
4713 struct timeval tv;
4714 #if CONFIG_MACF_SOCKET
4715 struct mac extmac;
4716 #endif /* MAC_SOCKET */
4717
4718 if (sopt->sopt_dir != SOPT_SET)
4719 sopt->sopt_dir = SOPT_SET;
4720
4721 if (dolock)
4722 socket_lock(so, 1);
4723
4724 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4725 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4726 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4727 /* the socket has been shutdown, no more sockopt's */
4728 error = EINVAL;
4729 goto out;
4730 }
4731
4732 error = sflt_setsockopt(so, sopt);
4733 if (error != 0) {
4734 if (error == EJUSTRETURN)
4735 error = 0;
4736 goto out;
4737 }
4738
4739 if (sopt->sopt_level != SOL_SOCKET) {
4740 if (so->so_proto != NULL &&
4741 so->so_proto->pr_ctloutput != NULL) {
4742 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4743 goto out;
4744 }
4745 error = ENOPROTOOPT;
4746 } else {
4747 /*
4748 * Allow socket-level (SOL_SOCKET) options to be filtered by
4749 * the protocol layer, if needed. A zero value returned from
4750 * the handler means use default socket-level processing as
4751 * done by the rest of this routine. Otherwise, any other
4752 * return value indicates that the option is unsupported.
4753 */
4754 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4755 pru_socheckopt(so, sopt)) != 0)
4756 goto out;
4757
4758 error = 0;
4759 switch (sopt->sopt_name) {
4760 case SO_LINGER:
4761 case SO_LINGER_SEC:
4762 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4763 if (error != 0)
4764 goto out;
4765
4766 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4767 l.l_linger : l.l_linger * hz;
4768 if (l.l_onoff != 0)
4769 so->so_options |= SO_LINGER;
4770 else
4771 so->so_options &= ~SO_LINGER;
4772 break;
4773
4774 case SO_DEBUG:
4775 case SO_KEEPALIVE:
4776 case SO_DONTROUTE:
4777 case SO_USELOOPBACK:
4778 case SO_BROADCAST:
4779 case SO_REUSEADDR:
4780 case SO_REUSEPORT:
4781 case SO_OOBINLINE:
4782 case SO_TIMESTAMP:
4783 case SO_TIMESTAMP_MONOTONIC:
4784 case SO_DONTTRUNC:
4785 case SO_WANTMORE:
4786 case SO_WANTOOBFLAG:
4787 case SO_NOWAKEFROMSLEEP:
4788 case SO_NOAPNFALLBK:
4789 error = sooptcopyin(sopt, &optval, sizeof (optval),
4790 sizeof (optval));
4791 if (error != 0)
4792 goto out;
4793 if (optval)
4794 so->so_options |= sopt->sopt_name;
4795 else
4796 so->so_options &= ~sopt->sopt_name;
4797 break;
4798
4799 case SO_SNDBUF:
4800 case SO_RCVBUF:
4801 case SO_SNDLOWAT:
4802 case SO_RCVLOWAT:
4803 error = sooptcopyin(sopt, &optval, sizeof (optval),
4804 sizeof (optval));
4805 if (error != 0)
4806 goto out;
4807
4808 /*
4809 * Values < 1 make no sense for any of these
4810 * options, so disallow them.
4811 */
4812 if (optval < 1) {
4813 error = EINVAL;
4814 goto out;
4815 }
4816
4817 switch (sopt->sopt_name) {
4818 case SO_SNDBUF:
4819 case SO_RCVBUF: {
4820 struct sockbuf *sb =
4821 (sopt->sopt_name == SO_SNDBUF) ?
4822 &so->so_snd : &so->so_rcv;
4823 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4824 error = ENOBUFS;
4825 goto out;
4826 }
4827 sb->sb_flags |= SB_USRSIZE;
4828 sb->sb_flags &= ~SB_AUTOSIZE;
4829 sb->sb_idealsize = (u_int32_t)optval;
4830 break;
4831 }
4832 /*
4833 * Make sure the low-water is never greater than
4834 * the high-water.
4835 */
4836 case SO_SNDLOWAT: {
4837 int space = sbspace(&so->so_snd);
4838 u_int32_t hiwat = so->so_snd.sb_hiwat;
4839
4840 if (so->so_snd.sb_flags & SB_UNIX) {
4841 struct unpcb *unp =
4842 (struct unpcb *)(so->so_pcb);
4843 if (unp != NULL &&
4844 unp->unp_conn != NULL) {
4845 hiwat += unp->unp_conn->unp_cc;
4846 }
4847 }
4848
4849 so->so_snd.sb_lowat =
4850 (optval > hiwat) ?
4851 hiwat : optval;
4852
4853 if (space >= so->so_snd.sb_lowat) {
4854 sowwakeup(so);
4855 }
4856 break;
4857 }
4858 case SO_RCVLOWAT: {
4859 int64_t data_len;
4860 so->so_rcv.sb_lowat =
4861 (optval > so->so_rcv.sb_hiwat) ?
4862 so->so_rcv.sb_hiwat : optval;
4863 data_len = so->so_rcv.sb_cc
4864 - so->so_rcv.sb_ctl;
4865 if (data_len >= so->so_rcv.sb_lowat)
4866 sorwakeup(so);
4867 break;
4868 }
4869 }
4870 break;
4871
4872 case SO_SNDTIMEO:
4873 case SO_RCVTIMEO:
4874 error = sooptcopyin_timeval(sopt, &tv);
4875 if (error != 0)
4876 goto out;
4877
4878 switch (sopt->sopt_name) {
4879 case SO_SNDTIMEO:
4880 so->so_snd.sb_timeo = tv;
4881 break;
4882 case SO_RCVTIMEO:
4883 so->so_rcv.sb_timeo = tv;
4884 break;
4885 }
4886 break;
4887
4888 case SO_NKE: {
4889 struct so_nke nke;
4890
4891 error = sooptcopyin(sopt, &nke, sizeof (nke),
4892 sizeof (nke));
4893 if (error != 0)
4894 goto out;
4895
4896 error = sflt_attach_internal(so, nke.nke_handle);
4897 break;
4898 }
4899
4900 case SO_NOSIGPIPE:
4901 error = sooptcopyin(sopt, &optval, sizeof (optval),
4902 sizeof (optval));
4903 if (error != 0)
4904 goto out;
4905 if (optval != 0)
4906 so->so_flags |= SOF_NOSIGPIPE;
4907 else
4908 so->so_flags &= ~SOF_NOSIGPIPE;
4909 break;
4910
4911 case SO_NOADDRERR:
4912 error = sooptcopyin(sopt, &optval, sizeof (optval),
4913 sizeof (optval));
4914 if (error != 0)
4915 goto out;
4916 if (optval != 0)
4917 so->so_flags |= SOF_NOADDRAVAIL;
4918 else
4919 so->so_flags &= ~SOF_NOADDRAVAIL;
4920 break;
4921
4922 case SO_REUSESHAREUID:
4923 error = sooptcopyin(sopt, &optval, sizeof (optval),
4924 sizeof (optval));
4925 if (error != 0)
4926 goto out;
4927 if (optval != 0)
4928 so->so_flags |= SOF_REUSESHAREUID;
4929 else
4930 so->so_flags &= ~SOF_REUSESHAREUID;
4931 break;
4932
4933 case SO_NOTIFYCONFLICT:
4934 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4935 error = EPERM;
4936 goto out;
4937 }
4938 error = sooptcopyin(sopt, &optval, sizeof (optval),
4939 sizeof (optval));
4940 if (error != 0)
4941 goto out;
4942 if (optval != 0)
4943 so->so_flags |= SOF_NOTIFYCONFLICT;
4944 else
4945 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4946 break;
4947
4948 case SO_RESTRICTIONS:
4949 error = sooptcopyin(sopt, &optval, sizeof (optval),
4950 sizeof (optval));
4951 if (error != 0)
4952 goto out;
4953
4954 error = so_set_restrictions(so, optval);
4955 break;
4956
4957 case SO_AWDL_UNRESTRICTED:
4958 if (SOCK_DOM(so) != PF_INET &&
4959 SOCK_DOM(so) != PF_INET6) {
4960 error = EOPNOTSUPP;
4961 goto out;
4962 }
4963 error = sooptcopyin(sopt, &optval, sizeof(optval),
4964 sizeof(optval));
4965 if (error != 0)
4966 goto out;
4967 if (optval != 0) {
4968 error = soopt_cred_check(so,
4969 PRIV_NET_RESTRICTED_AWDL);
4970 if (error == 0)
4971 inp_set_awdl_unrestricted(
4972 sotoinpcb(so));
4973 } else
4974 inp_clear_awdl_unrestricted(sotoinpcb(so));
4975 break;
4976 case SO_INTCOPROC_ALLOW:
4977 if (SOCK_DOM(so) != PF_INET6) {
4978 error = EOPNOTSUPP;
4979 goto out;
4980 }
4981 error = sooptcopyin(sopt, &optval, sizeof(optval),
4982 sizeof(optval));
4983 if (error != 0)
4984 goto out;
4985 if (optval != 0 &&
4986 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
4987 error = soopt_cred_check(so,
4988 PRIV_NET_RESTRICTED_INTCOPROC);
4989 if (error == 0)
4990 inp_set_intcoproc_allowed(
4991 sotoinpcb(so));
4992 } else if (optval == 0)
4993 inp_clear_intcoproc_allowed(sotoinpcb(so));
4994 break;
4995
4996 case SO_LABEL:
4997 #if CONFIG_MACF_SOCKET
4998 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4999 sizeof (extmac))) != 0)
5000 goto out;
5001
5002 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5003 so, &extmac);
5004 #else
5005 error = EOPNOTSUPP;
5006 #endif /* MAC_SOCKET */
5007 break;
5008
5009 case SO_UPCALLCLOSEWAIT:
5010 error = sooptcopyin(sopt, &optval, sizeof (optval),
5011 sizeof (optval));
5012 if (error != 0)
5013 goto out;
5014 if (optval != 0)
5015 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5016 else
5017 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5018 break;
5019
5020 case SO_RANDOMPORT:
5021 error = sooptcopyin(sopt, &optval, sizeof (optval),
5022 sizeof (optval));
5023 if (error != 0)
5024 goto out;
5025 if (optval != 0)
5026 so->so_flags |= SOF_BINDRANDOMPORT;
5027 else
5028 so->so_flags &= ~SOF_BINDRANDOMPORT;
5029 break;
5030
5031 case SO_NP_EXTENSIONS: {
5032 struct so_np_extensions sonpx;
5033
5034 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5035 sizeof (sonpx));
5036 if (error != 0)
5037 goto out;
5038 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5039 error = EINVAL;
5040 goto out;
5041 }
5042 /*
5043 * Only one bit defined for now
5044 */
5045 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5046 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5047 so->so_flags |= SOF_NPX_SETOPTSHUT;
5048 else
5049 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5050 }
5051 break;
5052 }
5053
5054 case SO_TRAFFIC_CLASS: {
5055 error = sooptcopyin(sopt, &optval, sizeof (optval),
5056 sizeof (optval));
5057 if (error != 0)
5058 goto out;
5059 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5060 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5061 error = so_set_net_service_type(so, netsvc);
5062 goto out;
5063 }
5064 error = so_set_traffic_class(so, optval);
5065 if (error != 0)
5066 goto out;
5067 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5068 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5069 break;
5070 }
5071
5072 case SO_RECV_TRAFFIC_CLASS: {
5073 error = sooptcopyin(sopt, &optval, sizeof (optval),
5074 sizeof (optval));
5075 if (error != 0)
5076 goto out;
5077 if (optval == 0)
5078 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5079 else
5080 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5081 break;
5082 }
5083
5084 #if (DEVELOPMENT || DEBUG)
5085 case SO_TRAFFIC_CLASS_DBG: {
5086 struct so_tcdbg so_tcdbg;
5087
5088 error = sooptcopyin(sopt, &so_tcdbg,
5089 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5090 if (error != 0)
5091 goto out;
5092 error = so_set_tcdbg(so, &so_tcdbg);
5093 if (error != 0)
5094 goto out;
5095 break;
5096 }
5097 #endif /* (DEVELOPMENT || DEBUG) */
5098
5099 case SO_PRIVILEGED_TRAFFIC_CLASS:
5100 error = priv_check_cred(kauth_cred_get(),
5101 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5102 if (error != 0)
5103 goto out;
5104 error = sooptcopyin(sopt, &optval, sizeof (optval),
5105 sizeof (optval));
5106 if (error != 0)
5107 goto out;
5108 if (optval == 0)
5109 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5110 else
5111 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5112 break;
5113
5114 case SO_DEFUNCTOK:
5115 error = sooptcopyin(sopt, &optval, sizeof (optval),
5116 sizeof (optval));
5117 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5118 if (error == 0)
5119 error = EBADF;
5120 goto out;
5121 }
5122 /*
5123 * Any process can set SO_DEFUNCTOK (clear
5124 * SOF_NODEFUNCT), but only root can clear
5125 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5126 */
5127 if (optval == 0 &&
5128 kauth_cred_issuser(kauth_cred_get()) == 0) {
5129 error = EPERM;
5130 goto out;
5131 }
5132 if (optval)
5133 so->so_flags &= ~SOF_NODEFUNCT;
5134 else
5135 so->so_flags |= SOF_NODEFUNCT;
5136
5137 if (SOCK_DOM(so) == PF_INET ||
5138 SOCK_DOM(so) == PF_INET6) {
5139 char s[MAX_IPv6_STR_LEN];
5140 char d[MAX_IPv6_STR_LEN];
5141 struct inpcb *inp = sotoinpcb(so);
5142
5143 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5144 "[%s %s:%d -> %s:%d] is now marked "
5145 "as %seligible for "
5146 "defunct\n", __func__, proc_selfpid(),
5147 proc_best_name(current_proc()),
5148 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5149 (SOCK_TYPE(so) == SOCK_STREAM) ?
5150 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5151 ((SOCK_DOM(so) == PF_INET) ?
5152 (void *)&inp->inp_laddr.s_addr :
5153 (void *)&inp->in6p_laddr), s, sizeof (s)),
5154 ntohs(inp->in6p_lport),
5155 inet_ntop(SOCK_DOM(so),
5156 (SOCK_DOM(so) == PF_INET) ?
5157 (void *)&inp->inp_faddr.s_addr :
5158 (void *)&inp->in6p_faddr, d, sizeof (d)),
5159 ntohs(inp->in6p_fport),
5160 (so->so_flags & SOF_NODEFUNCT) ?
5161 "not " : "");
5162 } else {
5163 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5164 "is now marked as %seligible for "
5165 "defunct\n",
5166 __func__, proc_selfpid(),
5167 proc_best_name(current_proc()),
5168 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5169 SOCK_DOM(so), SOCK_TYPE(so),
5170 (so->so_flags & SOF_NODEFUNCT) ?
5171 "not " : "");
5172 }
5173 break;
5174
5175 case SO_ISDEFUNCT:
5176 /* This option is not settable */
5177 error = EINVAL;
5178 break;
5179
5180 case SO_OPPORTUNISTIC:
5181 error = sooptcopyin(sopt, &optval, sizeof (optval),
5182 sizeof (optval));
5183 if (error == 0)
5184 error = so_set_opportunistic(so, optval);
5185 break;
5186
5187 case SO_FLUSH:
5188 /* This option is handled by lower layer(s) */
5189 error = 0;
5190 break;
5191
5192 case SO_RECV_ANYIF:
5193 error = sooptcopyin(sopt, &optval, sizeof (optval),
5194 sizeof (optval));
5195 if (error == 0)
5196 error = so_set_recv_anyif(so, optval);
5197 break;
5198
5199 case SO_TRAFFIC_MGT_BACKGROUND: {
5200 /* This option is handled by lower layer(s) */
5201 error = 0;
5202 break;
5203 }
5204
5205 #if FLOW_DIVERT
5206 case SO_FLOW_DIVERT_TOKEN:
5207 error = flow_divert_token_set(so, sopt);
5208 break;
5209 #endif /* FLOW_DIVERT */
5210
5211
5212 case SO_DELEGATED:
5213 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5214 sizeof (optval))) != 0)
5215 break;
5216
5217 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5218 break;
5219
5220 case SO_DELEGATED_UUID: {
5221 uuid_t euuid;
5222
5223 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5224 sizeof (euuid))) != 0)
5225 break;
5226
5227 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5228 break;
5229 }
5230
5231 #if NECP
5232 case SO_NECP_ATTRIBUTES:
5233 error = necp_set_socket_attributes(so, sopt);
5234 break;
5235 #endif /* NECP */
5236
5237 #if MPTCP
5238 case SO_MPTCP_FASTJOIN:
5239 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5240 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5241 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5242 error = ENOPROTOOPT;
5243 break;
5244 }
5245
5246 error = sooptcopyin(sopt, &optval, sizeof (optval),
5247 sizeof (optval));
5248 if (error != 0)
5249 goto out;
5250 if (optval == 0)
5251 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5252 else
5253 so->so_flags |= SOF_MPTCP_FASTJOIN;
5254 break;
5255 #endif /* MPTCP */
5256
5257 case SO_EXTENDED_BK_IDLE:
5258 error = sooptcopyin(sopt, &optval, sizeof (optval),
5259 sizeof (optval));
5260 if (error == 0)
5261 error = so_set_extended_bk_idle(so, optval);
5262 break;
5263
5264 case SO_MARK_CELLFALLBACK:
5265 error = sooptcopyin(sopt, &optval, sizeof(optval),
5266 sizeof(optval));
5267 if (error != 0)
5268 goto out;
5269 if (optval < 0) {
5270 error = EINVAL;
5271 goto out;
5272 }
5273 if (optval == 0)
5274 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5275 else
5276 so->so_flags1 |= SOF1_CELLFALLBACK;
5277 break;
5278
5279 case SO_NET_SERVICE_TYPE: {
5280 error = sooptcopyin(sopt, &optval, sizeof(optval),
5281 sizeof(optval));
5282 if (error != 0)
5283 goto out;
5284 error = so_set_net_service_type(so, optval);
5285 break;
5286 }
5287
5288 case SO_QOSMARKING_POLICY_OVERRIDE:
5289 error = priv_check_cred(kauth_cred_get(),
5290 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5291 if (error != 0)
5292 goto out;
5293 error = sooptcopyin(sopt, &optval, sizeof(optval),
5294 sizeof(optval));
5295 if (error != 0)
5296 goto out;
5297 if (optval == 0)
5298 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5299 else
5300 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5301 break;
5302
5303 default:
5304 error = ENOPROTOOPT;
5305 break;
5306 }
5307 if (error == 0 && so->so_proto != NULL &&
5308 so->so_proto->pr_ctloutput != NULL) {
5309 (void) so->so_proto->pr_ctloutput(so, sopt);
5310 }
5311 }
5312 out:
5313 if (dolock)
5314 socket_unlock(so, 1);
5315 return (error);
5316 }
5317
5318 /* Helper routines for getsockopt */
5319 int
5320 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5321 {
5322 int error;
5323 size_t valsize;
5324
5325 error = 0;
5326
5327 /*
5328 * Documented get behavior is that we always return a value,
5329 * possibly truncated to fit in the user's buffer.
5330 * Traditional behavior is that we always tell the user
5331 * precisely how much we copied, rather than something useful
5332 * like the total amount we had available for her.
5333 * Note that this interface is not idempotent; the entire answer must
5334 * be generated ahead of time.
5335 */
5336 valsize = min(len, sopt->sopt_valsize);
5337 sopt->sopt_valsize = valsize;
5338 if (sopt->sopt_val != USER_ADDR_NULL) {
5339 if (sopt->sopt_p != kernproc)
5340 error = copyout(buf, sopt->sopt_val, valsize);
5341 else
5342 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5343 }
5344 return (error);
5345 }
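/*
 * Sketch of the matching getsockopt(2) side: a handler below typically
 * replies with
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 *
 * truncating to the user's buffer if necessary and recording the number of
 * bytes copied in sopt->sopt_valsize.
 */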
5346
5347 static int
5348 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5349 {
5350 int error;
5351 size_t len;
5352 struct user64_timeval tv64;
5353 struct user32_timeval tv32;
5354 const void * val;
5355 size_t valsize;
5356
5357 error = 0;
5358 if (proc_is64bit(sopt->sopt_p)) {
5359 len = sizeof (tv64);
5360 tv64.tv_sec = tv_p->tv_sec;
5361 tv64.tv_usec = tv_p->tv_usec;
5362 val = &tv64;
5363 } else {
5364 len = sizeof (tv32);
5365 tv32.tv_sec = tv_p->tv_sec;
5366 tv32.tv_usec = tv_p->tv_usec;
5367 val = &tv32;
5368 }
5369 valsize = min(len, sopt->sopt_valsize);
5370 sopt->sopt_valsize = valsize;
5371 if (sopt->sopt_val != USER_ADDR_NULL) {
5372 if (sopt->sopt_p != kernproc)
5373 error = copyout(val, sopt->sopt_val, valsize);
5374 else
5375 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5376 }
5377 return (error);
5378 }
5379
5380 /*
5381 * Return: 0 Success
5382 * ENOPROTOOPT
5383 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5384 * <pr_ctloutput>:???
5385 * <sf_getoption>:???
5386 */
5387 int
5388 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5389 {
5390 int error, optval;
5391 struct linger l;
5392 struct timeval tv;
5393 #if CONFIG_MACF_SOCKET
5394 struct mac extmac;
5395 #endif /* MAC_SOCKET */
5396
5397 if (sopt->sopt_dir != SOPT_GET)
5398 sopt->sopt_dir = SOPT_GET;
5399
5400 if (dolock)
5401 socket_lock(so, 1);
5402
5403 error = sflt_getsockopt(so, sopt);
5404 if (error != 0) {
5405 if (error == EJUSTRETURN)
5406 error = 0;
5407 goto out;
5408 }
5409
5410 if (sopt->sopt_level != SOL_SOCKET) {
5411 if (so->so_proto != NULL &&
5412 so->so_proto->pr_ctloutput != NULL) {
5413 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5414 goto out;
5415 }
5416 error = ENOPROTOOPT;
5417 } else {
5418 /*
5419 * Allow socket-level (SOL_SOCKET) options to be filtered by
5420 * the protocol layer, if needed. A zero value returned from
5421 * the handler means use default socket-level processing as
5422 * done by the rest of this routine. Otherwise, any other
5423 * return value indicates that the option is unsupported.
5424 */
5425 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5426 pru_socheckopt(so, sopt)) != 0)
5427 goto out;
5428
5429 error = 0;
5430 switch (sopt->sopt_name) {
5431 case SO_LINGER:
5432 case SO_LINGER_SEC:
5433 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5434 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5435 so->so_linger : so->so_linger / hz;
5436 error = sooptcopyout(sopt, &l, sizeof (l));
5437 break;
5438
5439 case SO_USELOOPBACK:
5440 case SO_DONTROUTE:
5441 case SO_DEBUG:
5442 case SO_KEEPALIVE:
5443 case SO_REUSEADDR:
5444 case SO_REUSEPORT:
5445 case SO_BROADCAST:
5446 case SO_OOBINLINE:
5447 case SO_TIMESTAMP:
5448 case SO_TIMESTAMP_MONOTONIC:
5449 case SO_DONTTRUNC:
5450 case SO_WANTMORE:
5451 case SO_WANTOOBFLAG:
5452 case SO_NOWAKEFROMSLEEP:
5453 case SO_NOAPNFALLBK:
5454 optval = so->so_options & sopt->sopt_name;
5455 integer:
5456 error = sooptcopyout(sopt, &optval, sizeof (optval));
5457 break;
5458
5459 case SO_TYPE:
5460 optval = so->so_type;
5461 goto integer;
5462
5463 case SO_NREAD:
5464 if (so->so_proto->pr_flags & PR_ATOMIC) {
5465 int pkt_total;
5466 struct mbuf *m1;
5467
5468 pkt_total = 0;
5469 m1 = so->so_rcv.sb_mb;
5470 while (m1 != NULL) {
5471 if (m1->m_type == MT_DATA ||
5472 m1->m_type == MT_HEADER ||
5473 m1->m_type == MT_OOBDATA)
5474 pkt_total += m1->m_len;
5475 m1 = m1->m_next;
5476 }
5477 optval = pkt_total;
5478 } else {
5479 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5480 }
5481 goto integer;
5482
5483 case SO_NUMRCVPKT:
5484 if (so->so_proto->pr_flags & PR_ATOMIC) {
5485 int cnt = 0;
5486 struct mbuf *m1;
5487
5488 m1 = so->so_rcv.sb_mb;
5489 while (m1 != NULL) {
5490 if (m1->m_type == MT_DATA ||
5491 m1->m_type == MT_HEADER ||
5492 m1->m_type == MT_OOBDATA)
5493 cnt += 1;
5494 m1 = m1->m_nextpkt;
5495 }
5496 optval = cnt;
5497 goto integer;
5498 } else {
5499 error = EINVAL;
5500 break;
5501 }
5502
5503 case SO_NWRITE:
5504 optval = so->so_snd.sb_cc;
5505 goto integer;
5506
5507 case SO_ERROR:
5508 optval = so->so_error;
5509 so->so_error = 0;
5510 goto integer;
5511
5512 case SO_SNDBUF: {
5513 u_int32_t hiwat = so->so_snd.sb_hiwat;
5514
5515 if (so->so_snd.sb_flags & SB_UNIX) {
5516 struct unpcb *unp =
5517 (struct unpcb *)(so->so_pcb);
5518 if (unp != NULL && unp->unp_conn != NULL) {
5519 hiwat += unp->unp_conn->unp_cc;
5520 }
5521 }
5522
5523 optval = hiwat;
5524 goto integer;
5525 }
5526 case SO_RCVBUF:
5527 optval = so->so_rcv.sb_hiwat;
5528 goto integer;
5529
5530 case SO_SNDLOWAT:
5531 optval = so->so_snd.sb_lowat;
5532 goto integer;
5533
5534 case SO_RCVLOWAT:
5535 optval = so->so_rcv.sb_lowat;
5536 goto integer;
5537
5538 case SO_SNDTIMEO:
5539 case SO_RCVTIMEO:
5540 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5541 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5542
5543 error = sooptcopyout_timeval(sopt, &tv);
5544 break;
5545
5546 case SO_NOSIGPIPE:
5547 optval = (so->so_flags & SOF_NOSIGPIPE);
5548 goto integer;
5549
5550 case SO_NOADDRERR:
5551 optval = (so->so_flags & SOF_NOADDRAVAIL);
5552 goto integer;
5553
5554 case SO_REUSESHAREUID:
5555 optval = (so->so_flags & SOF_REUSESHAREUID);
5556 goto integer;
5557
5558
5559 case SO_NOTIFYCONFLICT:
5560 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5561 goto integer;
5562
5563 case SO_RESTRICTIONS:
5564 optval = so_get_restrictions(so);
5565 goto integer;
5566
5567 case SO_AWDL_UNRESTRICTED:
5568 if (SOCK_DOM(so) == PF_INET ||
5569 SOCK_DOM(so) == PF_INET6) {
5570 optval = inp_get_awdl_unrestricted(
5571 sotoinpcb(so));
5572 goto integer;
5573 } else
5574 error = EOPNOTSUPP;
5575 break;
5576
5577 case SO_INTCOPROC_ALLOW:
5578 if (SOCK_DOM(so) == PF_INET6) {
5579 optval = inp_get_intcoproc_allowed(
5580 sotoinpcb(so));
5581 goto integer;
5582 } else
5583 error = EOPNOTSUPP;
5584 break;
5585
5586 case SO_LABEL:
5587 #if CONFIG_MACF_SOCKET
5588 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5589 sizeof (extmac))) != 0 ||
5590 (error = mac_socket_label_get(proc_ucred(
5591 sopt->sopt_p), so, &extmac)) != 0)
5592 break;
5593
5594 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5595 #else
5596 error = EOPNOTSUPP;
5597 #endif /* MAC_SOCKET */
5598 break;
5599
5600 case SO_PEERLABEL:
5601 #if CONFIG_MACF_SOCKET
5602 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5603 sizeof (extmac))) != 0 ||
5604 (error = mac_socketpeer_label_get(proc_ucred(
5605 sopt->sopt_p), so, &extmac)) != 0)
5606 break;
5607
5608 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5609 #else
5610 error = EOPNOTSUPP;
5611 #endif /* MAC_SOCKET */
5612 break;
5613
5614 #ifdef __APPLE_API_PRIVATE
5615 case SO_UPCALLCLOSEWAIT:
5616 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5617 goto integer;
5618 #endif
5619 case SO_RANDOMPORT:
5620 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5621 goto integer;
5622
5623 case SO_NP_EXTENSIONS: {
5624 struct so_np_extensions sonpx;
5625
5626 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5627 SONPX_SETOPTSHUT : 0;
5628 sonpx.npx_mask = SONPX_MASK_VALID;
5629
5630 error = sooptcopyout(sopt, &sonpx,
5631 sizeof (struct so_np_extensions));
5632 break;
5633 }
5634
5635 case SO_TRAFFIC_CLASS:
5636 optval = so->so_traffic_class;
5637 goto integer;
5638
5639 case SO_RECV_TRAFFIC_CLASS:
5640 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5641 goto integer;
5642
5643 case SO_TRAFFIC_CLASS_STATS:
5644 error = sooptcopyout(sopt, &so->so_tc_stats,
5645 sizeof (so->so_tc_stats));
5646 break;
5647
5648 #if (DEVELOPMENT || DEBUG)
5649 case SO_TRAFFIC_CLASS_DBG:
5650 error = sogetopt_tcdbg(so, sopt);
5651 break;
5652 #endif /* (DEVELOPMENT || DEBUG) */
5653
5654 case SO_PRIVILEGED_TRAFFIC_CLASS:
5655 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5656 goto integer;
5657
5658 case SO_DEFUNCTOK:
5659 optval = !(so->so_flags & SOF_NODEFUNCT);
5660 goto integer;
5661
5662 case SO_ISDEFUNCT:
5663 optval = (so->so_flags & SOF_DEFUNCT);
5664 goto integer;
5665
5666 case SO_OPPORTUNISTIC:
5667 optval = so_get_opportunistic(so);
5668 goto integer;
5669
5670 case SO_FLUSH:
5671 /* This option is not gettable */
5672 error = EINVAL;
5673 break;
5674
5675 case SO_RECV_ANYIF:
5676 optval = so_get_recv_anyif(so);
5677 goto integer;
5678
5679 case SO_TRAFFIC_MGT_BACKGROUND:
5680 /* This option is handled by lower layer(s) */
5681 if (so->so_proto != NULL &&
5682 so->so_proto->pr_ctloutput != NULL) {
5683 (void) so->so_proto->pr_ctloutput(so, sopt);
5684 }
5685 break;
5686
5687 #if FLOW_DIVERT
5688 case SO_FLOW_DIVERT_TOKEN:
5689 error = flow_divert_token_get(so, sopt);
5690 break;
5691 #endif /* FLOW_DIVERT */
5692
5693 #if NECP
5694 case SO_NECP_ATTRIBUTES:
5695 error = necp_get_socket_attributes(so, sopt);
5696 break;
5697 #endif /* NECP */
5698
5699 #if CONTENT_FILTER
5700 case SO_CFIL_SOCK_ID: {
5701 cfil_sock_id_t sock_id;
5702
5703 sock_id = cfil_sock_id_from_socket(so);
5704
5705 error = sooptcopyout(sopt, &sock_id,
5706 sizeof(cfil_sock_id_t));
5707 break;
5708 }
5709 #endif /* CONTENT_FILTER */
5710
5711 #if MPTCP
5712 case SO_MPTCP_FASTJOIN:
5713 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5714 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5715 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5716 error = ENOPROTOOPT;
5717 break;
5718 }
5719 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5720 /* Fixed along with rdar://19391339 */
5721 goto integer;
5722 #endif /* MPTCP */
5723
5724 case SO_EXTENDED_BK_IDLE:
5725 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5726 goto integer;
5727 case SO_MARK_CELLFALLBACK:
5728 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5729 ? 1 : 0;
5730 goto integer;
5731 case SO_NET_SERVICE_TYPE: {
5732 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5733 optval = so->so_netsvctype;
5734 else
5735 optval = NET_SERVICE_TYPE_BE;
5736 goto integer;
5737 }
5738 case SO_NETSVC_MARKING_LEVEL:
5739 optval = so_get_netsvc_marking_level(so);
5740 goto integer;
5741
5742 default:
5743 error = ENOPROTOOPT;
5744 break;
5745 }
5746 }
5747 out:
5748 if (dolock)
5749 socket_unlock(so, 1);
5750 return (error);
5751 }
5752
5753 /*
5754 * The size limits on our soopt_getm are different from those on FreeBSD.
5755 * We limit the size of options to MCLBYTES. This will have to change
5756 * if we need to define options that need more space than MCLBYTES.
5757 */
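/*
 * Intended usage, per the ip6_sooptmcopyin() reference below: soopt_getm()
 * sizes an mbuf chain to sopt_valsize, soopt_mcopyin() fills that chain from
 * the caller's buffer, and soopt_mcopyout() copies a result chain back out,
 * updating sopt_valsize to the amount actually returned.
 */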
5758 int
5759 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5760 {
5761 struct mbuf *m, *m_prev;
5762 int sopt_size = sopt->sopt_valsize;
5763 int how;
5764
5765 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5766 return (EMSGSIZE);
5767
5768 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5769 MGET(m, how, MT_DATA);
5770 if (m == NULL)
5771 return (ENOBUFS);
5772 if (sopt_size > MLEN) {
5773 MCLGET(m, how);
5774 if ((m->m_flags & M_EXT) == 0) {
5775 m_free(m);
5776 return (ENOBUFS);
5777 }
5778 m->m_len = min(MCLBYTES, sopt_size);
5779 } else {
5780 m->m_len = min(MLEN, sopt_size);
5781 }
5782 sopt_size -= m->m_len;
5783 *mp = m;
5784 m_prev = m;
5785
5786 while (sopt_size > 0) {
5787 MGET(m, how, MT_DATA);
5788 if (m == NULL) {
5789 m_freem(*mp);
5790 return (ENOBUFS);
5791 }
5792 if (sopt_size > MLEN) {
5793 MCLGET(m, how);
5794 if ((m->m_flags & M_EXT) == 0) {
5795 m_freem(*mp);
5796 m_freem(m);
5797 return (ENOBUFS);
5798 }
5799 m->m_len = min(MCLBYTES, sopt_size);
5800 } else {
5801 m->m_len = min(MLEN, sopt_size);
5802 }
5803 sopt_size -= m->m_len;
5804 m_prev->m_next = m;
5805 m_prev = m;
5806 }
5807 return (0);
5808 }
5809
5810 /* copyin sopt data into mbuf chain */
5811 int
5812 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5813 {
5814 struct mbuf *m0 = m;
5815
5816 if (sopt->sopt_val == USER_ADDR_NULL)
5817 return (0);
5818 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5819 if (sopt->sopt_p != kernproc) {
5820 int error;
5821
5822 error = copyin(sopt->sopt_val, mtod(m, char *),
5823 m->m_len);
5824 if (error != 0) {
5825 m_freem(m0);
5826 return (error);
5827 }
5828 } else {
5829 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5830 mtod(m, char *), m->m_len);
5831 }
5832 sopt->sopt_valsize -= m->m_len;
5833 sopt->sopt_val += m->m_len;
5834 m = m->m_next;
5835 }
5836 /* enough space should have been allocated by the caller (see ip6_sooptmcopyin()) */
5837 if (m != NULL) {
5838 panic("soopt_mcopyin");
5839 /* NOTREACHED */
5840 }
5841 return (0);
5842 }
5843
5844 /* copyout mbuf chain data into soopt */
5845 int
5846 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5847 {
5848 struct mbuf *m0 = m;
5849 size_t valsize = 0;
5850
5851 if (sopt->sopt_val == USER_ADDR_NULL)
5852 return (0);
5853 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5854 if (sopt->sopt_p != kernproc) {
5855 int error;
5856
5857 error = copyout(mtod(m, char *), sopt->sopt_val,
5858 m->m_len);
5859 if (error != 0) {
5860 m_freem(m0);
5861 return (error);
5862 }
5863 } else {
5864 bcopy(mtod(m, char *),
5865 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5866 }
5867 sopt->sopt_valsize -= m->m_len;
5868 sopt->sopt_val += m->m_len;
5869 valsize += m->m_len;
5870 m = m->m_next;
5871 }
5872 if (m != NULL) {
5873 /* the user-supplied sockopt buffer should have been large enough */
5874 m_freem(m0);
5875 return (EINVAL);
5876 }
5877 sopt->sopt_valsize = valsize;
5878 return (0);
5879 }
5880
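/*
 * Out-of-band data has arrived: signal the owning process or process group
 * with SIGURG, wake any select()/poll() waiters on the receive buffer, and
 * post NOTE_OOB to attached knotes.
 */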
5881 void
5882 sohasoutofband(struct socket *so)
5883 {
5884 if (so->so_pgid < 0)
5885 gsignal(-so->so_pgid, SIGURG);
5886 else if (so->so_pgid > 0)
5887 proc_signal(so->so_pgid, SIGURG);
5888 selwakeup(&so->so_rcv.sb_sel);
5889 if (so->so_rcv.sb_flags & SB_KNOTE) {
5890 KNOTE(&so->so_rcv.sb_sel.si_note,
5891 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5892 }
5893 }
5894
5895 int
5896 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5897 {
5898 #pragma unused(cred)
5899 struct proc *p = current_proc();
5900 int revents = 0;
5901
5902 socket_lock(so, 1);
5903 so_update_last_owner_locked(so, PROC_NULL);
5904 so_update_policy(so);
5905
5906 if (events & (POLLIN | POLLRDNORM))
5907 if (soreadable(so))
5908 revents |= events & (POLLIN | POLLRDNORM);
5909
5910 if (events & (POLLOUT | POLLWRNORM))
5911 if (sowriteable(so))
5912 revents |= events & (POLLOUT | POLLWRNORM);
5913
5914 if (events & (POLLPRI | POLLRDBAND))
5915 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5916 revents |= events & (POLLPRI | POLLRDBAND);
5917
5918 if (revents == 0) {
5919 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5920 /*
5921 * Darwin sets the flag first,
5922 * BSD calls selrecord first
5923 */
5924 so->so_rcv.sb_flags |= SB_SEL;
5925 selrecord(p, &so->so_rcv.sb_sel, wql);
5926 }
5927
5928 if (events & (POLLOUT | POLLWRNORM)) {
5929 /*
5930 * Darwin sets the flag first,
5931 * BSD calls selrecord first
5932 */
5933 so->so_snd.sb_flags |= SB_SEL;
5934 selrecord(p, &so->so_snd.sb_sel, wql);
5935 }
5936 }
5937
5938 socket_unlock(so, 1);
5939 return (revents);
5940 }
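/*
 * Illustrative only (assumption): a user thread reaches sopoll() via
 * poll(2), e.g.
 *
 *	struct pollfd pfd = { .fd = sockfd, .events = POLLIN | POLLPRI };
 *	(void) poll(&pfd, 1, 1000);
 *
 * POLLPRI is reported while urgent (out-of-band) data is pending, per the
 * so_oobmark / SS_RCVATMARK check above.
 */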
5941
5942 int
5943 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5944 {
5945 #pragma unused(fp)
5946 #if !CONFIG_MACF_SOCKET
5947 #pragma unused(ctx)
5948 #endif /* MAC_SOCKET */
5949 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5950 int result;
5951
5952 socket_lock(so, 1);
5953 so_update_last_owner_locked(so, PROC_NULL);
5954 so_update_policy(so);
5955
5956 #if CONFIG_MACF_SOCKET
5957 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5958 kn, so) != 0) {
5959 socket_unlock(so, 1);
5960 kn->kn_flags = EV_ERROR;
5961 kn->kn_data = EPERM;
5962 return 0;
5963 }
5964 #endif /* MAC_SOCKET */
5965
5966 switch (kn->kn_filter) {
5967 case EVFILT_READ:
5968 kn->kn_filtid = EVFILTID_SOREAD;
5969 break;
5970 case EVFILT_WRITE:
5971 kn->kn_filtid = EVFILTID_SOWRITE;
5972 break;
5973 case EVFILT_SOCK:
5974 kn->kn_filtid = EVFILTID_SCK;
5975 break;
5976 case EVFILT_EXCEPT:
5977 kn->kn_filtid = EVFILTID_SOEXCEPT;
5978 break;
5979 default:
5980 socket_unlock(so, 1);
5981 kn->kn_flags = EV_ERROR;
5982 kn->kn_data = EINVAL;
5983 return 0;
5984 }
5985
5986 /*
5987 * call the appropriate sub-filter attach
5988 * with the socket still locked
5989 */
5990 result = knote_fops(kn)->f_attach(kn);
5991
5992 socket_unlock(so, 1);
5993
5994 return result;
5995 }
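/*
 * Illustrative only (assumption): this attach path runs when a process
 * registers a socket with kqueue, e.g.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * EVFILT_READ is routed to the EVFILTID_SOREAD sub-filter selected above.
 */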
5996
5997 static int
5998 filt_soread_common(struct knote *kn, struct socket *so)
5999 {
6000 if (so->so_options & SO_ACCEPTCONN) {
6001 int is_not_empty;
6002
6003 /*
6004 * Radar 6615193: handle the listen case dynamically for the
6005 * kqueue read filter. This allows listen() to be called after
6006 * the EVFILT_READ knote has been registered.
6007 */
6008
6009 kn->kn_data = so->so_qlen;
6010 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6011
6012 return (is_not_empty);
6013 }
6014
6015 /* socket isn't a listener */
6016 /*
6017 * NOTE_LOWAT specifies new low water mark in data, i.e.
6018 * the bytes of protocol data. We therefore exclude any
6019 * control bytes.
6020 */
6021 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6022
6023 if (kn->kn_sfflags & NOTE_OOB) {
6024 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6025 kn->kn_fflags |= NOTE_OOB;
6026 kn->kn_data -= so->so_oobmark;
6027 return (1);
6028 }
6029 }
6030
6031 if ((so->so_state & SS_CANTRCVMORE)
6032 #if CONTENT_FILTER
6033 && cfil_sock_data_pending(&so->so_rcv) == 0
6034 #endif /* CONTENT_FILTER */
6035 ) {
6036 kn->kn_flags |= EV_EOF;
6037 kn->kn_fflags = so->so_error;
6038 return (1);
6039 }
6040
6041 if (so->so_error) { /* temporary udp error */
6042 return (1);
6043 }
6044
6045 int64_t lowwat = so->so_rcv.sb_lowat;
6046 /*
6047 * Ensure that when NOTE_LOWAT is used, the derived
6048 * low water mark is bounded by the receive buffer's
6049 * high and low water marks.
6050 */
6051 if (kn->kn_sfflags & NOTE_LOWAT) {
6052 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6053 lowwat = so->so_rcv.sb_hiwat;
6054 else if (kn->kn_sdata > lowwat)
6055 lowwat = kn->kn_sdata;
6056 }
6057
6058 /*
6059 * The order below is important. Since NOTE_LOWAT
6060 * overrides sb_lowat, check for NOTE_LOWAT case
6061 * first.
6062 */
6063 if (kn->kn_sfflags & NOTE_LOWAT)
6064 return (kn->kn_data >= lowwat);
6065
6066 return (so->so_rcv.sb_cc >= lowwat);
6067 }
6068
6069 static int
6070 filt_sorattach(struct knote *kn)
6071 {
6072 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6073
6074 /* socket locked */
6075
6076 /*
6077 * If the caller explicitly asked for OOB results (e.g. poll())
6078 * from EVFILT_READ, then save that off in the hookid field
6079 * and reserve the kn_flags EV_OOBAND bit for output only.
6080 */
6081 if (kn->kn_filter == EVFILT_READ &&
6082 kn->kn_flags & EV_OOBAND) {
6083 kn->kn_flags &= ~EV_OOBAND;
6084 kn->kn_hookid = EV_OOBAND;
6085 } else {
6086 kn->kn_hookid = 0;
6087 }
6088 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6089 so->so_rcv.sb_flags |= SB_KNOTE;
6090
6091 /* indicate if event is already fired */
6092 return filt_soread_common(kn, so);
6093 }
6094
6095 static void
6096 filt_sordetach(struct knote *kn)
6097 {
6098 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6099
6100 socket_lock(so, 1);
6101 if (so->so_rcv.sb_flags & SB_KNOTE)
6102 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6103 so->so_rcv.sb_flags &= ~SB_KNOTE;
6104 socket_unlock(so, 1);
6105 }
6106
6107 /*ARGSUSED*/
6108 static int
6109 filt_soread(struct knote *kn, long hint)
6110 {
6111 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6112 int retval;
6113
6114 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6115 socket_lock(so, 1);
6116
6117 retval = filt_soread_common(kn, so);
6118
6119 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6120 socket_unlock(so, 1);
6121
6122 return retval;
6123 }
6124
6125 static int
6126 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6127 {
6128 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6129 int retval;
6130
6131 socket_lock(so, 1);
6132
6133 /* save off the new input fflags and data */
6134 kn->kn_sfflags = kev->fflags;
6135 kn->kn_sdata = kev->data;
6136 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6137 kn->kn_udata = kev->udata;
6138
6139 /* determine if changes result in fired events */
6140 retval = filt_soread_common(kn, so);
6141
6142 socket_unlock(so, 1);
6143
6144 return retval;
6145 }
6146
6147 static int
6148 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6149 {
6150 #pragma unused(data)
6151 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6152 int retval;
6153
6154 socket_lock(so, 1);
6155 retval = filt_soread_common(kn, so);
6156 if (retval) {
6157 *kev = kn->kn_kevent;
6158 if (kn->kn_flags & EV_CLEAR) {
6159 kn->kn_fflags = 0;
6160 kn->kn_data = 0;
6161 }
6162 }
6163 socket_unlock(so, 1);
6164
6165 return retval;
6166 }
6167
6168 int
6169 so_wait_for_if_feedback(struct socket *so)
6170 {
6171 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6172 (so->so_state & SS_ISCONNECTED)) {
6173 struct inpcb *inp = sotoinpcb(so);
6174 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6175 return (1);
6176 }
6177 return (0);
6178 }
6179
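/*
 * Common trigger test for the write filter.  Fires when the socket
 * can no longer send (EOF/error), when the socket accepts data before
 * the connection completes (SOF1_PRECONNECT_DATA), or when send
 * buffer space reaches the effective low water mark; with
 * SOF_NOTSENT_LOWAT set, TCP/MPTCP sockets defer to the protocol's
 * not-sent low-water check.  The event is suppressed while waiting
 * for interface feedback.
 */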
6180 static int
6181 filt_sowrite_common(struct knote *kn, struct socket *so)
6182 {
6183 int ret = 0;
6184
6185 kn->kn_data = sbspace(&so->so_snd);
6186 if (so->so_state & SS_CANTSENDMORE) {
6187 kn->kn_flags |= EV_EOF;
6188 kn->kn_fflags = so->so_error;
6189 return 1;
6190 }
6191 if (so->so_error) { /* temporary udp error */
6192 return 1;
6193 }
6194 if (!socanwrite(so)) {
6195 return 0;
6196 }
6197 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6198 return 1;
6199 }
6200 int64_t lowwat = so->so_snd.sb_lowat;
6201 if (kn->kn_sfflags & NOTE_LOWAT) {
6202 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6203 lowwat = so->so_snd.sb_hiwat;
6204 else if (kn->kn_sdata > lowwat)
6205 lowwat = kn->kn_sdata;
6206 }
6207 if (kn->kn_data >= lowwat) {
6208 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6209 #if (DEBUG || DEVELOPMENT)
6210 && so_notsent_lowat_check == 1
6211 #endif /* DEBUG || DEVELOPMENT */
6212 ) {
6213 if ((SOCK_DOM(so) == PF_INET ||
6214 SOCK_DOM(so) == PF_INET6) &&
6215 so->so_type == SOCK_STREAM) {
6216 ret = tcp_notsent_lowat_check(so);
6217 }
6218 #if MPTCP
6219 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6220 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6221 ret = mptcp_notsent_lowat_check(so);
6222 }
6223 #endif
6224 else {
6225 return 1;
6226 }
6227 } else {
6228 ret = 1;
6229 }
6230 }
6231 if (so_wait_for_if_feedback(so))
6232 ret = 0;
6233 return (ret);
6234 }
6235
6236 static int
6237 filt_sowattach(struct knote *kn)
6238 {
6239 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6240
6241 /* socket locked */
6242 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6243 so->so_snd.sb_flags |= SB_KNOTE;
6244
6245 /* determine if it's already fired */
6246 return filt_sowrite_common(kn, so);
6247 }
6248
6249 static void
6250 filt_sowdetach(struct knote *kn)
6251 {
6252 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6253 socket_lock(so, 1);
6254
6255 if (so->so_snd.sb_flags & SB_KNOTE)
6256 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6257 so->so_snd.sb_flags &= ~SB_KNOTE;
6258 socket_unlock(so, 1);
6259 }
6260
6261 /*ARGSUSED*/
6262 static int
6263 filt_sowrite(struct knote *kn, long hint)
6264 {
6265 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6266 int ret;
6267
6268 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6269 socket_lock(so, 1);
6270
6271 ret = filt_sowrite_common(kn, so);
6272
6273 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6274 socket_unlock(so, 1);
6275
6276 return ret;
6277 }
6278
6279 static int
6280 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6281 {
6282 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6283 int ret;
6284
6285 socket_lock(so, 1);
6286
6287 /* save off the new input fflags and data */
6288 kn->kn_sfflags = kev->fflags;
6289 kn->kn_sdata = kev->data;
6290 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6291 kn->kn_udata = kev->udata;
6292
6293 /* determine if these changes result in a triggered event */
6294 ret = filt_sowrite_common(kn, so);
6295
6296 socket_unlock(so, 1);
6297
6298 return ret;
6299 }
6300
6301 static int
6302 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6303 {
6304 #pragma unused(data)
6305 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6306 int ret;
6307
6308 socket_lock(so, 1);
6309 ret = filt_sowrite_common(kn, so);
6310 if (ret) {
6311 *kev = kn->kn_kevent;
6312 if (kn->kn_flags & EV_CLEAR) {
6313 kn->kn_fflags = 0;
6314 kn->kn_data = 0;
6315 }
6316 }
6317 socket_unlock(so, 1);
6318 return ret;
6319 }
6320
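/*
 * Common trigger test for EVFILT_SOCK.  Translate SO_FILT_HINT_*
 * hints and the current socket state into NOTE_* fflags.  Events that
 * reflect a persistent state (connected/disconnected, read/write
 * closed, suspend/resume) are level-triggered: kn_hookid remembers
 * which of them have already been delivered so they are not reported
 * again until the state changes.
 */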
6321 static int
6322 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6323 {
6324 int ret = 0;
6325 uint32_t level_trigger = 0;
6326
6327 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6328 kn->kn_fflags |= NOTE_CONNRESET;
6329 }
6330 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6331 kn->kn_fflags |= NOTE_TIMEOUT;
6332 }
6333 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6334 kn->kn_fflags |= NOTE_NOSRCADDR;
6335 }
6336 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6337 kn->kn_fflags |= NOTE_IFDENIED;
6338 }
6339 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6340 kn->kn_fflags |= NOTE_KEEPALIVE;
6341 }
6342 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6343 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6344 }
6345 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6346 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6347 }
6348 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6349 (so->so_state & SS_ISCONNECTED)) {
6350 kn->kn_fflags |= NOTE_CONNECTED;
6351 level_trigger |= NOTE_CONNECTED;
6352 }
6353 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6354 (so->so_state & SS_ISDISCONNECTED)) {
6355 kn->kn_fflags |= NOTE_DISCONNECTED;
6356 level_trigger |= NOTE_DISCONNECTED;
6357 }
6358 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6359 if (so->so_proto != NULL &&
6360 (so->so_proto->pr_flags & PR_EVCONNINFO))
6361 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6362 }
6363
6364 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6365 tcp_notify_ack_active(so)) {
6366 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6367 }
6368
6369 if ((so->so_state & SS_CANTRCVMORE)
6370 #if CONTENT_FILTER
6371 && cfil_sock_data_pending(&so->so_rcv) == 0
6372 #endif /* CONTENT_FILTER */
6373 ) {
6374 kn->kn_fflags |= NOTE_READCLOSED;
6375 level_trigger |= NOTE_READCLOSED;
6376 }
6377
6378 if (so->so_state & SS_CANTSENDMORE) {
6379 kn->kn_fflags |= NOTE_WRITECLOSED;
6380 level_trigger |= NOTE_WRITECLOSED;
6381 }
6382
6383 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6384 (so->so_flags & SOF_SUSPENDED)) {
6385 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6386
6387 /* If resume event was delivered before, reset it */
6388 kn->kn_hookid &= ~NOTE_RESUME;
6389
6390 kn->kn_fflags |= NOTE_SUSPEND;
6391 level_trigger |= NOTE_SUSPEND;
6392 }
6393
6394 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6395 (so->so_flags & SOF_SUSPENDED) == 0) {
6396 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6397
6398 /* If suspend event was delivered before, reset it */
6399 kn->kn_hookid &= ~NOTE_SUSPEND;
6400
6401 kn->kn_fflags |= NOTE_RESUME;
6402 level_trigger |= NOTE_RESUME;
6403 }
6404
6405 if (so->so_error != 0) {
6406 ret = 1;
6407 kn->kn_data = so->so_error;
6408 kn->kn_flags |= EV_EOF;
6409 } else {
6410 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6411 }
6412
6413 /* Reset any events that are not requested on this knote */
6414 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6415 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6416
6417 /* Find the level-triggered events that are already delivered */
6418 level_trigger &= kn->kn_hookid;
6419 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6420
6421 /* Do not deliver level-triggered events more than once */
6422 if ((kn->kn_fflags & ~level_trigger) != 0)
6423 ret = 1;
6424
6425 return (ret);
6426 }
6427
6428 static int
6429 filt_sockattach(struct knote *kn)
6430 {
6431 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6432
6433 /* socket locked */
6434 kn->kn_hookid = 0;
6435 if (KNOTE_ATTACH(&so->so_klist, kn))
6436 so->so_flags |= SOF_KNOTE;
6437
6438 /* determine if event already fired */
6439 return filt_sockev_common(kn, so, 0);
6440 }
6441
6442 static void
6443 filt_sockdetach(struct knote *kn)
6444 {
6445 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6446 socket_lock(so, 1);
6447
6448 if ((so->so_flags & SOF_KNOTE) != 0)
6449 if (KNOTE_DETACH(&so->so_klist, kn))
6450 so->so_flags &= ~SOF_KNOTE;
6451 socket_unlock(so, 1);
6452 }
6453
6454 static int
6455 filt_sockev(struct knote *kn, long hint)
6456 {
6457 int ret = 0, locked = 0;
6458 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6459 long ev_hint = (hint & SO_FILT_HINT_EV);
6460
6461 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6462 socket_lock(so, 1);
6463 locked = 1;
6464 }
6465
6466 ret = filt_sockev_common(kn, so, ev_hint);
6467
6468 if (locked)
6469 socket_unlock(so, 1);
6470
6471 return ret;
6472 }
6473
6474
6475
6476 /*
6477 * filt_socktouch - update event state
6478 */
6479 static int
6480 filt_socktouch(
6481 struct knote *kn,
6482 struct kevent_internal_s *kev)
6483 {
6484 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6485 uint32_t changed_flags;
6486 int ret;
6487
6488 socket_lock(so, 1);
6489
6490 /* compare the old interest set against the delivered-event state */
6491 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6492
6493 /* save off the new input fflags and data */
6494 kn->kn_sfflags = kev->fflags;
6495 kn->kn_sdata = kev->data;
6496 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6497 kn->kn_udata = kev->udata;
6498
6499 /* restrict the current results to the (smaller?) set of new interest */
6500 /*
6501 * For compatibility with previous implementations, we leave kn_fflags
6502 * as they were before.
6503 */
6504 //kn->kn_fflags &= kev->fflags;
6505
6506 /*
6507 * Since we keep track of events that are already
6508 * delivered, if any of those events are not requested
6509 * anymore the state related to them can be reset
6510 */
6511 kn->kn_hookid &=
6512 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6513
6514 /* determine if we have events to deliver */
6515 ret = filt_sockev_common(kn, so, 0);
6516
6517 socket_unlock(so, 1);
6518
6519 return ret;
6520 }
6521
6522 /*
6523 * filt_sockprocess - query event fired state and return data
6524 */
6525 static int
6526 filt_sockprocess(
6527 struct knote *kn,
6528 struct filt_process_s *data,
6529 struct kevent_internal_s *kev)
6530 {
6531 #pragma unused(data)
6532
6533 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6534 int ret = 0;
6535
6536 socket_lock(so, 1);
6537
6538 ret = filt_sockev_common(kn, so, 0);
6539 if (ret) {
6540 *kev = kn->kn_kevent;
6541
6542 /*
6543 * Store the state of the events being delivered. This
6544 * state can be used to deliver level triggered events
6545 * at least once and still avoid waking up the application
6546 * multiple times as long as the event is active.
6547 */
6548 if (kn->kn_fflags != 0)
6549 kn->kn_hookid |= (kn->kn_fflags &
6550 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6551
6552 /*
6553 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6554 * only one of them, and remember which one was delivered
6555 * last.
6556 */
6557 if (kn->kn_fflags & NOTE_SUSPEND)
6558 kn->kn_hookid &= ~NOTE_RESUME;
6559 if (kn->kn_fflags & NOTE_RESUME)
6560 kn->kn_hookid &= ~NOTE_SUSPEND;
6561
6562 if (kn->kn_flags & EV_CLEAR) {
6563 kn->kn_data = 0;
6564 kn->kn_fflags = 0;
6565 }
6566 }
6567
6568 socket_unlock(so, 1);
6569
6570 return ret;
6571 }
6572
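/*
 * Encode the socket's connection state (SOCKEV_CONNECTED /
 * SOCKEV_DISCONNECTED) into the kevent data field, unless the field
 * already carries a value (e.g. a pending error).
 */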
6573 void
6574 get_sockev_state(struct socket *so, u_int32_t *statep)
6575 {
6576 u_int32_t state = *(statep);
6577
6578 /*
6579 * If the state variable has already been set by a previous
6580 * event, leave it as is.
6581 */
6582 if (state != 0)
6583 return;
6584
6585 if (so->so_state & SS_ISCONNECTED)
6586 state |= SOCKEV_CONNECTED;
6587 else
6588 state &= ~(SOCKEV_CONNECTED);
6589 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6590 *(statep) = state;
6591 }
6592
6593 #define SO_LOCK_HISTORY_STR_LEN \
6594 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6595
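/*
 * Format the recorded lock/unlock return addresses into a static
 * buffer, most recent first, for use in panic messages.  Not
 * reentrant, which is acceptable for its diagnostic purpose.
 */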
6596 __private_extern__ const char *
6597 solockhistory_nr(struct socket *so)
6598 {
6599 size_t n = 0;
6600 int i;
6601 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6602
6603 bzero(lock_history_str, sizeof (lock_history_str));
6604 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6605 n += snprintf(lock_history_str + n,
6606 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6607 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6608 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6609 }
6610 return (lock_history_str);
6611 }
6612
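/*
 * Lock a socket, taking a reference when refcount is non-zero.  If
 * the protocol provides its own pr_lock it is used; otherwise the
 * domain mutex is taken and so_usecount is bumped directly, with the
 * caller's return address recorded for lock debugging.
 *
 * A minimal usage sketch (not taken from a specific caller):
 *
 *	socket_lock(so, 1);	/* lock and hold a use count */
 *	/* ...operate on so... */
 *	socket_unlock(so, 1);	/* drop the use count; frees last ref */
 */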
6613 int
6614 socket_lock(struct socket *so, int refcount)
6615 {
6616 int error = 0;
6617 void *lr_saved;
6618
6619 lr_saved = __builtin_return_address(0);
6620
6621 if (so->so_proto->pr_lock) {
6622 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6623 } else {
6624 #ifdef MORE_LOCKING_DEBUG
6625 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6626 LCK_MTX_ASSERT_NOTOWNED);
6627 #endif
6628 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6629 if (refcount)
6630 so->so_usecount++;
6631 so->lock_lr[so->next_lock_lr] = lr_saved;
6632 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6633 }
6634
6635 return (error);
6636 }
6637
6638 int
6639 socket_unlock(struct socket *so, int refcount)
6640 {
6641 int error = 0;
6642 void *lr_saved;
6643 lck_mtx_t *mutex_held;
6644
6645 lr_saved = __builtin_return_address(0);
6646
6647 if (so->so_proto == NULL) {
6648 panic("%s: null so_proto so=%p\n", __func__, so);
6649 /* NOTREACHED */
6650 }
6651
6652 if (so && so->so_proto->pr_unlock) {
6653 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6654 } else {
6655 mutex_held = so->so_proto->pr_domain->dom_mtx;
6656 #ifdef MORE_LOCKING_DEBUG
6657 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6658 #endif
6659 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6660 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6661
6662 if (refcount) {
6663 if (so->so_usecount <= 0) {
6664 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6665 "lrh=%s", __func__, so->so_usecount, so,
6666 SOCK_DOM(so), so->so_type,
6667 SOCK_PROTO(so), solockhistory_nr(so));
6668 /* NOTREACHED */
6669 }
6670
6671 so->so_usecount--;
6672 if (so->so_usecount == 0)
6673 sofreelastref(so, 1);
6674 }
6675 lck_mtx_unlock(mutex_held);
6676 }
6677
6678 return (error);
6679 }
6680
6681 /* Called with socket locked, will unlock socket */
6682 void
6683 sofree(struct socket *so)
6684 {
6685 lck_mtx_t *mutex_held;
6686
6687 if (so->so_proto->pr_getlock != NULL)
6688 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6689 else
6690 mutex_held = so->so_proto->pr_domain->dom_mtx;
6691 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6692
6693 sofreelastref(so, 0);
6694 }
6695
6696 void
6697 soreference(struct socket *so)
6698 {
6699 socket_lock(so, 1); /* lock and take one reference on the socket */
6700 socket_unlock(so, 0); /* unlock only */
6701 }
6702
6703 void
6704 sodereference(struct socket *so)
6705 {
6706 socket_lock(so, 0);
6707 socket_unlock(so, 1);
6708 }
6709
6710 /*
6711 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6712 * possibility of using jumbo clusters. Caller must ensure to hold
6713 * the socket lock.
6714 */
6715 void
6716 somultipages(struct socket *so, boolean_t set)
6717 {
6718 if (set)
6719 so->so_flags |= SOF_MULTIPAGES;
6720 else
6721 so->so_flags &= ~SOF_MULTIPAGES;
6722 }
6723
6724 void
6725 soif2kcl(struct socket *so, boolean_t set)
6726 {
6727 if (set)
6728 so->so_flags1 |= SOF1_IF_2KCL;
6729 else
6730 so->so_flags1 &= ~SOF1_IF_2KCL;
6731 }
6732
6733 int
6734 so_isdstlocal(struct socket *so) {
6735
6736 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6737
6738 if (SOCK_DOM(so) == PF_INET)
6739 return (inaddr_local(inp->inp_faddr));
6740 else if (SOCK_DOM(so) == PF_INET6)
6741 return (in6addr_local(&inp->in6p_faddr));
6742
6743 return (0);
6744 }
6745
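/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT and set
 * SB_DROP on both socket buffers so no further data is queued, then
 * flush whatever is already buffered.  Sockets marked SOF_NODEFUNCT,
 * or eligible for extended background idle time, are skipped unless
 * the caller forces the issue.  sodefunct() completes the teardown.
 */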
6746 int
6747 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6748 {
6749 struct sockbuf *rcv, *snd;
6750 int err = 0, defunct;
6751
6752 rcv = &so->so_rcv;
6753 snd = &so->so_snd;
6754
6755 defunct = (so->so_flags & SOF_DEFUNCT);
6756 if (defunct) {
6757 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6758 panic("%s: SB_DROP not set", __func__);
6759 /* NOTREACHED */
6760 }
6761 goto done;
6762 }
6763
6764 if (so->so_flags & SOF_NODEFUNCT) {
6765 if (noforce) {
6766 err = EOPNOTSUPP;
6767 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6768 "name %s level %d) so 0x%llx [%d,%d] "
6769 "is not eligible for defunct "
6770 "(%d)\n", __func__, proc_selfpid(),
6771 proc_best_name(current_proc()), proc_pid(p),
6772 proc_best_name(p), level,
6773 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6774 SOCK_DOM(so), SOCK_TYPE(so), err);
6775 return (err);
6776 }
6777 so->so_flags &= ~SOF_NODEFUNCT;
6778 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6779 "so 0x%llx [%d,%d] defunct by force\n", __func__,
6780 proc_selfpid(), proc_best_name(current_proc()),
6781 proc_pid(p), proc_best_name(p), level,
6782 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6783 SOCK_DOM(so), SOCK_TYPE(so));
6784 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6785 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6786 struct ifnet *ifp = inp->inp_last_outifp;
6787
6788 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6789 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6790 } else if (so->so_flags & SOF_DELEGATED) {
6791 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6792 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6793 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6794 } else if (noforce) {
6795 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6796
6797 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6798 so->so_extended_bk_start = net_uptime();
6799 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6800
6801 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6802
6803 err = EOPNOTSUPP;
6804 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6805 "level %d) extend bk idle so 0x%llx rcv hw %d "
6806 "cc %d\n",
6807 __func__, proc_selfpid(),
6808 proc_best_name(current_proc()), proc_pid(p),
6809 proc_best_name(p), level,
6810 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6811 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6812 return (err);
6813 } else {
6814 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6815 }
6816 }
6817
6818 so->so_flags |= SOF_DEFUNCT;
6819
6820 /* Prevent further data from being appended to the socket buffers */
6821 snd->sb_flags |= SB_DROP;
6822 rcv->sb_flags |= SB_DROP;
6823
6824 /* Flush any existing data in the socket buffers */
6825 if (rcv->sb_cc != 0) {
6826 rcv->sb_flags &= ~SB_SEL;
6827 selthreadclear(&rcv->sb_sel);
6828 sbrelease(rcv);
6829 }
6830 if (snd->sb_cc != 0) {
6831 snd->sb_flags &= ~SB_SEL;
6832 selthreadclear(&snd->sb_sel);
6833 sbrelease(snd);
6834 }
6835
6836 done:
6837 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6838 "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6839 proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6840 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6841 SOCK_TYPE(so), defunct ? "is already" : "marked as",
6842 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6843
6844 return (err);
6845 }
6846
6847 int
6848 sodefunct(struct proc *p, struct socket *so, int level)
6849 {
6850 struct sockbuf *rcv, *snd;
6851
6852 if (!(so->so_flags & SOF_DEFUNCT)) {
6853 panic("%s improperly called", __func__);
6854 /* NOTREACHED */
6855 }
6856 if (so->so_state & SS_DEFUNCT)
6857 goto done;
6858
6859 rcv = &so->so_rcv;
6860 snd = &so->so_snd;
6861
6862 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6863 char s[MAX_IPv6_STR_LEN];
6864 char d[MAX_IPv6_STR_LEN];
6865 struct inpcb *inp = sotoinpcb(so);
6866
6867 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6868 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6869 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6870 __func__, proc_selfpid(), proc_best_name(current_proc()),
6871 proc_pid(p), proc_best_name(p), level,
6872 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6873 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6874 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6875 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6876 s, sizeof (s)), ntohs(inp->in6p_lport),
6877 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6878 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6879 d, sizeof (d)), ntohs(inp->in6p_fport),
6880 (uint32_t)rcv->sb_sel.si_flags,
6881 (uint32_t)snd->sb_sel.si_flags,
6882 rcv->sb_flags, snd->sb_flags);
6883 } else {
6884 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6885 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6886 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6887 proc_selfpid(), proc_best_name(current_proc()),
6888 proc_pid(p), proc_best_name(p), level,
6889 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6890 SOCK_DOM(so), SOCK_TYPE(so),
6891 (uint32_t)rcv->sb_sel.si_flags,
6892 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6893 snd->sb_flags);
6894 }
6895
6896 /*
6897 * Unwedge threads blocked on sbwait() and sb_lock().
6898 */
6899 sbwakeup(rcv);
6900 sbwakeup(snd);
6901
6902 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6903 if (rcv->sb_flags & SB_LOCK)
6904 sbunlock(rcv, TRUE); /* keep socket locked */
6905 if (snd->sb_flags & SB_LOCK)
6906 sbunlock(snd, TRUE); /* keep socket locked */
6907
6908 /*
6909 * Flush the buffers and disconnect. We explicitly call shutdown
6910 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6911 * states are set for the socket. This would also flush out data
6912 * hanging off the receive list of this socket.
6913 */
6914 (void) soshutdownlock_final(so, SHUT_RD);
6915 (void) soshutdownlock_final(so, SHUT_WR);
6916 (void) sodisconnectlocked(so);
6917
6918 /*
6919 * Explicitly handle connectionless-protocol disconnection
6920 * and release any remaining data in the socket buffers.
6921 */
6922 if (!(so->so_state & SS_ISDISCONNECTED))
6923 (void) soisdisconnected(so);
6924
6925 if (so->so_error == 0)
6926 so->so_error = EBADF;
6927
6928 if (rcv->sb_cc != 0) {
6929 rcv->sb_flags &= ~SB_SEL;
6930 selthreadclear(&rcv->sb_sel);
6931 sbrelease(rcv);
6932 }
6933 if (snd->sb_cc != 0) {
6934 snd->sb_flags &= ~SB_SEL;
6935 selthreadclear(&snd->sb_sel);
6936 sbrelease(snd);
6937 }
6938 so->so_state |= SS_DEFUNCT;
6939 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6940
6941 done:
6942 return (0);
6943 }
6944
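/*
 * Clear the extended-background-idle-in-progress state when the
 * owning process becomes active again, and update the corresponding
 * statistics.  Safe to call whether or not the socket was idling.
 */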
6945 int
6946 soresume(struct proc *p, struct socket *so, int locked)
6947 {
6948 if (locked == 0)
6949 socket_lock(so, 1);
6950
6951 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6952 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6953 "[%d,%d] resumed from bk idle\n",
6954 __func__, proc_selfpid(), proc_best_name(current_proc()),
6955 proc_pid(p), proc_best_name(p),
6956 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6957 SOCK_DOM(so), SOCK_TYPE(so));
6958
6959 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6960 so->so_extended_bk_start = 0;
6961 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6962
6963 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6964 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6965 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6966 }
6967 if (locked == 0)
6968 socket_unlock(so, 1);
6969
6970 return (0);
6971 }
6972
6973 /*
6974 * Does not attempt to account for sockets that are delegated from
6975 * the current process
6976 */
6977 int
6978 so_set_extended_bk_idle(struct socket *so, int optval)
6979 {
6980 int error = 0;
6981
6982 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6983 SOCK_PROTO(so) != IPPROTO_TCP) {
6984 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6985 error = EOPNOTSUPP;
6986 } else if (optval == 0) {
6987 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6988
6989 soresume(current_proc(), so, 1);
6990 } else {
6991 struct proc *p = current_proc();
6992 int i;
6993 struct filedesc *fdp;
6994 int count = 0;
6995
6996 proc_fdlock(p);
6997
6998 fdp = p->p_fd;
6999 for (i = 0; i < fdp->fd_nfiles; i++) {
7000 struct fileproc *fp = fdp->fd_ofiles[i];
7001 struct socket *so2;
7002
7003 if (fp == NULL ||
7004 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7005 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7006 continue;
7007
7008 so2 = (struct socket *)fp->f_fglob->fg_data;
7009 if (so != so2 &&
7010 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7011 count++;
7012 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7013 break;
7014 }
7015 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7016 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7017 error = EBUSY;
7018 } else if (so->so_flags & SOF_DELEGATED) {
7019 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7020 error = EBUSY;
7021 } else {
7022 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7023 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7024 }
7025 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7026 "%s marked for extended bk idle\n",
7027 __func__, proc_selfpid(), proc_best_name(current_proc()),
7028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7029 SOCK_DOM(so), SOCK_TYPE(so),
7030 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7031 "is" : "not");
7032
7033 proc_fdunlock(p);
7034 }
7035
7036 return (error);
7037 }
7038
7039 static void
7040 so_stop_extended_bk_idle(struct socket *so)
7041 {
7042 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7043 so->so_extended_bk_start = 0;
7044
7045 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7046 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7047 /*
7048 * Force defunct
7049 */
7050 sosetdefunct(current_proc(), so,
7051 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7052 if (so->so_flags & SOF_DEFUNCT) {
7053 sodefunct(current_proc(), so,
7054 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7055 }
7056 }
7057
7058 void
7059 so_drain_extended_bk_idle(struct socket *so)
7060 {
7061 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7062 /*
7063 * Only penalize sockets that have outstanding data
7064 */
7065 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7066 so_stop_extended_bk_idle(so);
7067
7068 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7069 }
7070 }
7071 }
7072
7073 /*
7074 * Return value tells whether the socket is still in extended background idle
7075 */
7076 int
7077 so_check_extended_bk_idle_time(struct socket *so)
7078 {
7079 int ret = 1;
7080
7081 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7082 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7083 __func__, proc_selfpid(), proc_best_name(current_proc()),
7084 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7085 SOCK_DOM(so), SOCK_TYPE(so));
7086 if (net_uptime() - so->so_extended_bk_start >
7087 soextbkidlestat.so_xbkidle_time) {
7088 so_stop_extended_bk_idle(so);
7089
7090 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7091
7092 ret = 0;
7093 } else {
7094 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7095
7096 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7097 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7098 }
7099 }
7100
7101 return (ret);
7102 }
7103
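/*
 * Walk the process's open files and resume every socket that was
 * placed in extended background idle, then clear P_LXBKIDLEINPROG.
 */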
7104 void
7105 resume_proc_sockets(proc_t p)
7106 {
7107 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7108 struct filedesc *fdp;
7109 int i;
7110
7111 proc_fdlock(p);
7112 fdp = p->p_fd;
7113 for (i = 0; i < fdp->fd_nfiles; i++) {
7114 struct fileproc *fp;
7115 struct socket *so;
7116
7117 fp = fdp->fd_ofiles[i];
7118 if (fp == NULL ||
7119 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7120 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7121 continue;
7122
7123 so = (struct socket *)fp->f_fglob->fg_data;
7124 (void) soresume(p, so, 0);
7125 }
7126 proc_fdunlock(p);
7127
7128 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7129 }
7130 }
7131
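/*
 * Set or clear INP_RECV_ANYIF on the attached inpcb for IPv4/IPv6
 * sockets, controlling whether inbound packets are accepted
 * regardless of the interface they arrive on (SO_RECV_ANYIF).
 */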
7132 __private_extern__ int
7133 so_set_recv_anyif(struct socket *so, int optval)
7134 {
7135 int ret = 0;
7136
7137 #if INET6
7138 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7139 #else
7140 if (SOCK_DOM(so) == PF_INET) {
7141 #endif /* !INET6 */
7142 if (optval)
7143 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7144 else
7145 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7146 }
7147
7148 return (ret);
7149 }
7150
7151 __private_extern__ int
7152 so_get_recv_anyif(struct socket *so)
7153 {
7154 int ret = 0;
7155
7156 #if INET6
7157 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7158 #else
7159 if (SOCK_DOM(so) == PF_INET) {
7160 #endif /* !INET6 */
7161 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7162 }
7163
7164 return (ret);
7165 }
7166
7167 int
7168 so_set_restrictions(struct socket *so, uint32_t vals)
7169 {
7170 int nocell_old, nocell_new;
7171 int noexpensive_old, noexpensive_new;
7172
7173 /*
7174 * Deny-type restrictions are trapdoors; once set they cannot be
7175 * unset for the lifetime of the socket. This allows them to be
7176 * issued by a framework on behalf of the application without
7177 * having to worry that they can be undone.
7178 *
7179 * Note here that socket-level restrictions override any protocol-
7180 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7181 * restriction issued on the socket has a higher precedence
7182 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7183 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7184 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7185 */
7186 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7187 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7188 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7189 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7190 SO_RESTRICT_DENY_EXPENSIVE));
7191 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7192 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7193
7194 /* we can only set, not clear restrictions */
7195 if ((nocell_new - nocell_old) == 0 &&
7196 (noexpensive_new - noexpensive_old) == 0)
7197 return (0);
7198 #if INET6
7199 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7200 #else
7201 if (SOCK_DOM(so) == PF_INET) {
7202 #endif /* !INET6 */
7203 if (nocell_new - nocell_old != 0) {
7204 /*
7205 * if deny cellular is now set, do what's needed
7206 * for INPCB
7207 */
7208 inp_set_nocellular(sotoinpcb(so));
7209 }
7210 if (noexpensive_new - noexpensive_old != 0) {
7211 inp_set_noexpensive(sotoinpcb(so));
7212 }
7213 }
7214
7215 return (0);
7216 }
7217
7218 uint32_t
7219 so_get_restrictions(struct socket *so)
7220 {
7221 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7222 SO_RESTRICT_DENY_OUT |
7223 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7224 }
7225
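/*
 * Associate an effective (delegate) pid with the socket on behalf of
 * another process, subject to PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 * Delegating the socket to oneself clears the association.  The
 * socket's policy is re-evaluated on success.
 */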
7226 int
7227 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7228 {
7229 struct proc *ep = PROC_NULL;
7230 int error = 0;
7231
7232 /* pid 0 is reserved for kernel */
7233 if (epid == 0) {
7234 error = EINVAL;
7235 goto done;
7236 }
7237
7238 /*
7239 * If this is an in-kernel socket, prevent its delegate
7240 * association from changing unless the socket option is
7241 * coming from within the kernel itself.
7242 */
7243 if (so->last_pid == 0 && p != kernproc) {
7244 error = EACCES;
7245 goto done;
7246 }
7247
7248 /*
7249 * If this is issued by a process that's recorded as the
7250 * real owner of the socket, or if the pid is the same as
7251 * the process's own pid, then proceed. Otherwise ensure
7252 * that the issuing process has the necessary privileges.
7253 */
7254 if (epid != so->last_pid || epid != proc_pid(p)) {
7255 if ((error = priv_check_cred(kauth_cred_get(),
7256 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7257 error = EACCES;
7258 goto done;
7259 }
7260 }
7261
7262 /* Find the process that corresponds to the effective pid */
7263 if ((ep = proc_find(epid)) == PROC_NULL) {
7264 error = ESRCH;
7265 goto done;
7266 }
7267
7268 /*
7269 * If a process tries to delegate the socket to itself, then
7270 * there's really nothing to do; treat it as a way for the
7271 * delegate association to be cleared. Note that we check
7272 * the passed-in proc rather than calling proc_selfpid(),
7273 * as we need to check the process issuing the socket option
7274 * which could be kernproc. Given that we don't allow 0 for
7275 * effective pid, it means that a delegated in-kernel socket
7276 * stays delegated during its lifetime (which is probably OK.)
7277 */
7278 if (epid == proc_pid(p)) {
7279 so->so_flags &= ~SOF_DELEGATED;
7280 so->e_upid = 0;
7281 so->e_pid = 0;
7282 uuid_clear(so->e_uuid);
7283 } else {
7284 so->so_flags |= SOF_DELEGATED;
7285 so->e_upid = proc_uniqueid(ep);
7286 so->e_pid = proc_pid(ep);
7287 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7288 }
7289 done:
7290 if (error == 0 && net_io_policy_log) {
7291 uuid_string_t buf;
7292
7293 uuid_unparse(so->e_uuid, buf);
7294 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7295 "euuid %s%s\n", __func__, proc_name_address(p),
7296 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7297 SOCK_DOM(so), SOCK_TYPE(so),
7298 so->e_pid, proc_name_address(ep), buf,
7299 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7300 } else if (error != 0 && net_io_policy_log) {
7301 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7302 "ERROR (%d)\n", __func__, proc_name_address(p),
7303 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7304 SOCK_DOM(so), SOCK_TYPE(so),
7305 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7306 proc_name_address(ep), error);
7307 }
7308
7309 /* Update this socket's policy upon success */
7310 if (error == 0) {
7311 so->so_policy_gencnt *= -1;
7312 so_update_policy(so);
7313 #if NECP
7314 so_update_necp_policy(so, NULL, NULL);
7315 #endif /* NECP */
7316 }
7317
7318 if (ep != PROC_NULL)
7319 proc_rele(ep);
7320
7321 return (error);
7322 }
7323
7324 int
7325 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7326 {
7327 uuid_string_t buf;
7328 uuid_t uuid;
7329 int error = 0;
7330
7331 /* UUID must not be all-zeroes (reserved for kernel) */
7332 if (uuid_is_null(euuid)) {
7333 error = EINVAL;
7334 goto done;
7335 }
7336
7337 /*
7338 * If this is an in-kernel socket, prevent its delegate
7339 * association from changing unless the socket option is
7340 * coming from within the kernel itself.
7341 */
7342 if (so->last_pid == 0 && p != kernproc) {
7343 error = EACCES;
7344 goto done;
7345 }
7346
7347 /* Get the UUID of the issuing process */
7348 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7349
7350 /*
7351 * If this is issued by a process that's recorded as the
7352 * real owner of the socket, or if the uuid is the same as
7353 * the process's own uuid, then proceed. Otherwise ensure
7354 * that the issuing process has the necessary privileges.
7355 */
7356 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7357 uuid_compare(euuid, uuid) != 0) {
7358 if ((error = priv_check_cred(kauth_cred_get(),
7359 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7360 error = EACCES;
7361 goto done;
7362 }
7363 }
7364
7365 /*
7366 * If a process tries to delegate the socket to itself, then
7367 * there's really nothing to do; treat it as a way for the
7368 * delegate association to be cleared. Note that we check
7369 * the uuid of the passed-in proc rather than that of the
7370 * current process, as we need to check the process issuing
7371 * the socket option which could be kernproc itself. Given
7372 * that we don't allow 0 for effective uuid, it means that
7373 * a delegated in-kernel socket stays delegated during its
7374 * lifetime (which is okay.)
7375 */
7376 if (uuid_compare(euuid, uuid) == 0) {
7377 so->so_flags &= ~SOF_DELEGATED;
7378 so->e_upid = 0;
7379 so->e_pid = 0;
7380 uuid_clear(so->e_uuid);
7381 } else {
7382 so->so_flags |= SOF_DELEGATED;
7383 /*
7384 * Unlike so_set_effective_pid(), we only have the UUID
7385 * here and the process ID is not known. Inherit the
7386 * real {pid,upid} of the socket.
7387 */
7388 so->e_upid = so->last_upid;
7389 so->e_pid = so->last_pid;
7390 uuid_copy(so->e_uuid, euuid);
7391 }
7392
7393 done:
7394 if (error == 0 && net_io_policy_log) {
7395 uuid_unparse(so->e_uuid, buf);
7396 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7397 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7398 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7399 SOCK_TYPE(so), so->e_pid, buf,
7400 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7401 } else if (error != 0 && net_io_policy_log) {
7402 uuid_unparse(euuid, buf);
7403 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7404 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7405 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7406 SOCK_TYPE(so), buf, error);
7407 }
7408
7409 /* Update this socket's policy upon success */
7410 if (error == 0) {
7411 so->so_policy_gencnt *= -1;
7412 so_update_policy(so);
7413 #if NECP
7414 so_update_necp_policy(so, NULL, NULL);
7415 #endif /* NECP */
7416 }
7417
7418 return (error);
7419 }
7420
7421 void
7422 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7423 uint32_t ev_datalen)
7424 {
7425 struct kev_msg ev_msg;
7426
7427 /*
7428 * A netpolicy event always starts with a netpolicy_event_data
7429 * structure, but the caller can provide for a longer event
7430 * structure to post, depending on the event code.
7431 */
7432 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7433
7434 bzero(&ev_msg, sizeof (ev_msg));
7435 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7436 ev_msg.kev_class = KEV_NETWORK_CLASS;
7437 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7438 ev_msg.event_code = ev_code;
7439
7440 ev_msg.dv[0].data_ptr = ev_data;
7441 ev_msg.dv[0].data_length = ev_datalen;
7442
7443 kev_post_msg(&ev_msg);
7444 }
7445
7446 void
7447 socket_post_kev_msg(uint32_t ev_code,
7448 struct kev_socket_event_data *ev_data,
7449 uint32_t ev_datalen)
7450 {
7451 struct kev_msg ev_msg;
7452
7453 bzero(&ev_msg, sizeof(ev_msg));
7454 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7455 ev_msg.kev_class = KEV_NETWORK_CLASS;
7456 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7457 ev_msg.event_code = ev_code;
7458
7459 ev_msg.dv[0].data_ptr = ev_data;
7460 ev_msg.dv[0].data_length = ev_datalen;
7461
7462 kev_post_msg(&ev_msg);
7463 }
7464
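/*
 * Post a KEV_SOCKET_CLOSED kernel event carrying the local and peer
 * addresses of a closing socket; quietly skipped if either address
 * cannot be obtained.
 */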
7465 void
7466 socket_post_kev_msg_closed(struct socket *so)
7467 {
7468 struct kev_socket_closed ev;
7469 struct sockaddr *socksa = NULL, *peersa = NULL;
7470 int err;
7471 bzero(&ev, sizeof(ev));
7472 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7473 if (err == 0) {
7474 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7475 &peersa);
7476 if (err == 0) {
7477 memcpy(&ev.ev_data.kev_sockname, socksa,
7478 min(socksa->sa_len,
7479 sizeof (ev.ev_data.kev_sockname)));
7480 memcpy(&ev.ev_data.kev_peername, peersa,
7481 min(peersa->sa_len,
7482 sizeof (ev.ev_data.kev_peername)));
7483 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7484 &ev.ev_data, sizeof (ev));
7485 }
7486 }
7487 if (socksa != NULL)
7488 FREE(socksa, M_SONAME);
7489 if (peersa != NULL)
7490 FREE(peersa, M_SONAME);
7491 }