1 /*
2 * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
63 */
64 /*
65 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
66 * support for mandatory and extensible security protections. This notice
67 * is included in support of clause 2.2 (b) of the Apple Public License,
68 * Version 2.0.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/filedesc.h>
74 #include <sys/proc.h>
75 #include <sys/proc_internal.h>
76 #include <sys/kauth.h>
77 #include <sys/file_internal.h>
78 #include <sys/fcntl.h>
79 #include <sys/malloc.h>
80 #include <sys/mbuf.h>
81 #include <sys/domain.h>
82 #include <sys/kernel.h>
83 #include <sys/event.h>
84 #include <sys/poll.h>
85 #include <sys/protosw.h>
86 #include <sys/socket.h>
87 #include <sys/socketvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/signalvar.h>
90 #include <sys/sysctl.h>
91 #include <sys/uio.h>
92 #include <sys/ev.h>
93 #include <sys/kdebug.h>
94 #include <sys/un.h>
95 #include <sys/user.h>
96 #include <net/route.h>
97 #include <netinet/in.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/ip6.h>
100 #include <netinet6/ip6_var.h>
101 #include <kern/zalloc.h>
102 #include <kern/locks.h>
103 #include <machine/limits.h>
104 #include <libkern/OSAtomic.h>
105 #include <pexpert/pexpert.h>
106 #include <kern/assert.h>
107 #include <kern/task.h>
108
109 #include <sys/mcache.h>
110
111 #if CONFIG_MACF
112 #include <security/mac.h>
113 #include <security/mac_framework.h>
114 #endif /* MAC */
115
116 extern int in6_init_done;
117
118 int so_cache_hw = 0;
119 int so_cache_timeouts = 0;
120 int so_cache_max_freed = 0;
121 int cached_sock_count = 0;
122 __private_extern__ int max_cached_sock_count = MAX_CACHED_SOCKETS;
123 struct socket *socket_cache_head = 0;
124 struct socket *socket_cache_tail = 0;
125 u_int32_t so_cache_time = 0;
126 int so_cache_init_done = 0;
127 struct zone *so_cache_zone;
128
129 static lck_grp_t *so_cache_mtx_grp;
130 static lck_attr_t *so_cache_mtx_attr;
131 static lck_grp_attr_t *so_cache_mtx_grp_attr;
132 lck_mtx_t *so_cache_mtx;
133
134 #include <machine/limits.h>
135
136 static void filt_sordetach(struct knote *kn);
137 static int filt_soread(struct knote *kn, long hint);
138 static void filt_sowdetach(struct knote *kn);
139 static int filt_sowrite(struct knote *kn, long hint);
140
141 static int
142 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
143
144 static int
145 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p);
146
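/*
 * Kqueue filter operations for sockets: soread_filtops services EVFILT_READ
 * knotes and sowrite_filtops services EVFILT_WRITE knotes registered on a
 * socket descriptor.
 */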
147 static struct filterops soread_filtops = {
148 .f_isfd = 1,
149 .f_detach = filt_sordetach,
150 .f_event = filt_soread,
151 };
152 static struct filterops sowrite_filtops = {
153 .f_isfd = 1,
154 .f_detach = filt_sowdetach,
155 .f_event = filt_sowrite,
156 };
157
158 #define EVEN_MORE_LOCKING_DEBUG 0
159 int socket_debug = 0;
160 int socket_zone = M_SOCKET;
161 so_gen_t so_gencnt; /* generation count for sockets */
162
163 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165
166 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
167 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
168 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
169 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
170 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
171 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
172 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
173
174 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
175
176
177 SYSCTL_DECL(_kern_ipc);
178
179 int somaxconn = SOMAXCONN;
180 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
181
182 /* Should we get a maximum also ??? */
183 static int sosendmaxchain = 65536;
184 static int sosendminchain = 16384;
185 static int sorecvmincopy = 16384;
186 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain,
187 0, "");
188 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy,
189 0, "");
190
191 /*
192 * Set to enable jumbo clusters (if available) for large writes when
193 * the socket is marked with SOF_MULTIPAGES; see below.
194 */
195 int sosendjcl = 1;
196 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
197
198 /*
199 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
200 * writes on the socket for all protocols on any network interfaces,
201 * depending upon sosendjcl above. Be extra careful when setting this
202  * to 1, because sending packets that cross physical pages down to
203 * broken drivers (those that falsely assume that the physical pages
204 * are contiguous) might lead to system panics or silent data corruption.
205 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
206 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
207 * capable. Set this to 1 only for testing/debugging purposes.
208 */
209 int sosendjcl_ignore_capab = 0;
210 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &sosendjcl_ignore_capab, 0, "");
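/*
 * For illustration (testing only): these knobs are ordinary sysctls, so e.g.
 * "sysctl -w kern.ipc.sosendjcl_ignore_capab=1" from user space makes large
 * writes use jumbo clusters while ignoring SOF_MULTIPAGES; see the warning
 * in the comment above before doing so.
 */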
212
213 int sodefunctlog = 0;
214 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &sodefunctlog, 0, "");
216
217 /*
218 * Socket operation routines.
219 * These routines are called by the routines in
220 * sys_socket.c or from a system process, and
221 * implement the semantics of socket operations by
222 * switching out to the protocol specific routines.
223 */
224
225 /* sys_generic.c */
226 extern void postevent(struct socket *, struct sockbuf *, int);
227 extern void evsofree(struct socket *);
228
229 /* TODO: these should be in header file */
230 extern int get_inpcb_str_size(void);
231 extern int get_tcp_str_size(void);
232 extern struct domain *pffinddomain(int);
233 extern struct protosw *pffindprotonotype(int, int);
234 extern int soclose_locked(struct socket *);
235 extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
236
237 #if CONFIG_EMBEDDED
238 extern int uthread_get_background_state(uthread_t);
239 #endif /*CONFIG_EMBEDDED */
240
241 #ifdef __APPLE__
242
243 vm_size_t so_cache_zone_element_size;
244
245 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
246 static void cached_sock_alloc(struct socket **, int);
247 static void cached_sock_free(struct socket *);
248 static void so_cache_timer(void *);
249
250 void soclose_wait_locked(struct socket *so);
251 int so_isdstlocal(struct socket *so);
252
253 __private_extern__ u_int32_t sotcdb = 0;
254 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
255 &sotcdb, 0, "");
256
257 void
258 socketinit(void)
259 {
260 vm_size_t str_size;
261
262 if (so_cache_init_done) {
263 printf("socketinit: already called...\n");
264 return;
265 }
266
267 PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));
268
269 /*
270 * allocate lock group attribute and group for socket cache mutex
271 */
272 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
273
274 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
275 so_cache_mtx_grp_attr);
276
277 /*
278 * allocate the lock attribute for socket cache mutex
279 */
280 so_cache_mtx_attr = lck_attr_alloc_init();
281
282 so_cache_init_done = 1;
283
284 /* cached sockets mutex */
285 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
286
287 if (so_cache_mtx == NULL)
288 return; /* we're hosed... */
289
290 str_size = (vm_size_t)(sizeof (struct socket) + 4 +
291 get_inpcb_str_size() + 4 + get_tcp_str_size());
292
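/*
 * Each cached element is a single contiguous block holding the socket, an
 * inpcb and a tcpcb (carved up by cached_sock_alloc() below); the two extra
 * 4-byte pads presumably leave slack for the longword alignment done there.
 * zinit() arguments: element size, maximum zone memory in bytes, allocation
 * chunk size, zone name.
 */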
293 so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
294 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
295 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
296 #if TEMPDEBUG
297 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
298 #endif
299 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
300
301 so_cache_zone_element_size = str_size;
302
303 sflt_init();
304
305 VERIFY(SO_TC_MAX == SO_TC_STATS_MAX);
306
307 socket_tclass_init();
308 }
309
310 static void
311 cached_sock_alloc(struct socket **so, int waitok)
312 {
313 caddr_t temp;
314 register uintptr_t offset;
315
316 lck_mtx_lock(so_cache_mtx);
317
318 if (cached_sock_count) {
319 cached_sock_count--;
320 *so = socket_cache_head;
321 if (*so == 0)
322 panic("cached_sock_alloc: cached sock is null");
323
324 socket_cache_head = socket_cache_head->cache_next;
325 if (socket_cache_head)
326 socket_cache_head->cache_prev = 0;
327 else
328 socket_cache_tail = 0;
329
330 lck_mtx_unlock(so_cache_mtx);
331
332 temp = (*so)->so_saved_pcb;
333 bzero((caddr_t)*so, sizeof (struct socket));
334 #if TEMPDEBUG
335 		kprintf("cached_sock_alloc - retrieving cached sock %p - "
336 "count == %d\n", *so, cached_sock_count);
337 #endif
338 (*so)->so_saved_pcb = temp;
339 (*so)->cached_in_sock_layer = 1;
340 } else {
341 #if TEMPDEBUG
342 kprintf("Allocating cached sock %p from memory\n", *so);
343 #endif
344
345 lck_mtx_unlock(so_cache_mtx);
346
347 if (waitok)
348 *so = (struct socket *)zalloc(so_cache_zone);
349 else
350 *so = (struct socket *)zalloc_noblock(so_cache_zone);
351
352 if (*so == 0)
353 return;
354
355 bzero((caddr_t)*so, sizeof (struct socket));
356
357 /*
358 * Define offsets for extra structures into our single block of
359 * memory. Align extra structures on longword boundaries.
360 */
361
362 offset = (uintptr_t) *so;
363 offset += sizeof (struct socket);
364
365 offset = ALIGN(offset);
366
367 (*so)->so_saved_pcb = (caddr_t)offset;
368 offset += get_inpcb_str_size();
369
370 offset = ALIGN(offset);
371
372 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
373 (caddr_t)offset;
374 #if TEMPDEBUG
375 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
376 *so, (*so)->so_saved_pcb,
377 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
378 #endif
379 }
380
381 (*so)->cached_in_sock_layer = 1;
382 }
383
384 static void
385 cached_sock_free(struct socket *so)
386 {
387
388 lck_mtx_lock(so_cache_mtx);
389
390 if (++cached_sock_count > max_cached_sock_count) {
391 --cached_sock_count;
392 lck_mtx_unlock(so_cache_mtx);
393 #if TEMPDEBUG
394 kprintf("Freeing overflowed cached socket %p\n", so);
395 #endif
396 zfree(so_cache_zone, so);
397 } else {
398 #if TEMPDEBUG
399 kprintf("Freeing socket %p into cache\n", so);
400 #endif
401 if (so_cache_hw < cached_sock_count)
402 so_cache_hw = cached_sock_count;
403
404 so->cache_next = socket_cache_head;
405 so->cache_prev = 0;
406 if (socket_cache_head)
407 socket_cache_head->cache_prev = so;
408 else
409 socket_cache_tail = so;
410
411 so->cache_timestamp = so_cache_time;
412 socket_cache_head = so;
413 lck_mtx_unlock(so_cache_mtx);
414 }
415
416 #if TEMPDEBUG
417 kprintf("Freed cached sock %p into cache - count is %d\n",
418 so, cached_sock_count);
419 #endif
420 }
421
422 static void
423 so_update_last_owner_locked(
424 struct socket *so,
425 proc_t self)
426 {
427 if (self == NULL)
428 self = current_proc();
429
430 if (self)
431 {
432 so->last_upid = proc_uniqueid(self);
433 so->last_pid = proc_pid(self);
434 }
435 }
436
437 static void
438 so_cache_timer(__unused void *dummy)
439 {
440 register struct socket *p;
441 register int n_freed = 0;
442
443 lck_mtx_lock(so_cache_mtx);
444
445 ++so_cache_time;
446
447 while ((p = socket_cache_tail)) {
448 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
449 break;
450
451 so_cache_timeouts++;
452
453 if ((socket_cache_tail = p->cache_prev))
454 p->cache_prev->cache_next = 0;
455 if (--cached_sock_count == 0)
456 socket_cache_head = 0;
457
458 zfree(so_cache_zone, p);
459
460 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
461 so_cache_max_freed++;
462 break;
463 }
464 }
465 lck_mtx_unlock(so_cache_mtx);
466
467 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
468 }
469 #endif /* __APPLE__ */
470
471 /*
472 * Get a socket structure from our zone, and initialize it.
473 * We don't implement `waitok' yet (see comments in uipc_domain.c).
474 * Note that it would probably be better to allocate socket
475 * and PCB at the same time, but I'm not convinced that all
476 * the protocols can be easily modified to do this.
477 */
478 struct socket *
479 soalloc(int waitok, int dom, int type)
480 {
481 struct socket *so;
482
483 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
484 cached_sock_alloc(&so, waitok);
485 } else {
486 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
487 M_WAITOK);
488 if (so != NULL)
489 bzero(so, sizeof (*so));
490 }
491 /* XXX race condition for reentrant kernel */
492 //###LD Atomic add for so_gencnt
493 if (so != NULL) {
494 so->so_gencnt = ++so_gencnt;
495 so->so_zone = socket_zone;
496 #if CONFIG_MACF_SOCKET
497 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
498 if (mac_socket_label_init(so, !waitok) != 0) {
499 sodealloc(so);
500 return (NULL);
501 }
502 #endif /* MAC_SOCKET */
503 so_update_last_owner_locked(so, NULL);
504 }
505
506 return (so);
507 }
508
509 /*
510 * Returns: 0 Success
511 * EAFNOSUPPORT
512 * EPROTOTYPE
513 * EPROTONOSUPPORT
514 * ENOBUFS
515 * <pru_attach>:ENOBUFS[AF_UNIX]
516 * <pru_attach>:ENOBUFS[TCP]
517 * <pru_attach>:ENOMEM[TCP]
518 * <pru_attach>:EISCONN[TCP]
519 * <pru_attach>:??? [other protocol families, IPSEC]
520 */
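/*
 * Typical in-kernel usage, for illustration (e.g. the socket(2) path does
 * the equivalent):
 *
 *	struct socket *so = NULL;
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * A proto of 0 requests the default protocol for the (domain, type) pair.
 */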
521 int
522 socreate(int dom, struct socket **aso, int type, int proto)
523 {
524 struct proc *p = current_proc();
525 register struct protosw *prp;
526 register struct socket *so;
527 register int error = 0;
528 #if CONFIG_EMBEDDED
529 thread_t thread;
530 struct uthread *ut;
531 #endif /* CONFIG_EMBEDDED */
532
533 #if TCPDEBUG
534 extern int tcpconsdebug;
535 #endif
536 if (proto)
537 prp = pffindproto(dom, proto, type);
538 else
539 prp = pffindtype(dom, type);
540
541 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
542 if (pffinddomain(dom) == NULL) {
543 return (EAFNOSUPPORT);
544 }
545 if (proto != 0) {
546 if (pffindprotonotype(dom, proto) != NULL) {
547 return (EPROTOTYPE);
548 }
549 }
550 return (EPROTONOSUPPORT);
551 }
552 if (prp->pr_type != type)
553 return (EPROTOTYPE);
554 so = soalloc(1, dom, type);
555 if (so == 0)
556 return (ENOBUFS);
557
558 TAILQ_INIT(&so->so_incomp);
559 TAILQ_INIT(&so->so_comp);
560 so->so_type = type;
561
562 so->so_uid = kauth_cred_getuid(kauth_cred_get());
563 so->so_gid = kauth_cred_getgid(kauth_cred_get());
564 if (!suser(kauth_cred_get(), NULL))
565 so->so_state = SS_PRIV;
566
567 so->so_proto = prp;
568 #ifdef __APPLE__
569 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
570 so->so_rcv.sb_so = so->so_snd.sb_so = so;
571 #endif
572 so->next_lock_lr = 0;
573 so->next_unlock_lr = 0;
574
575 #if CONFIG_MACF_SOCKET
576 mac_socket_label_associate(kauth_cred_get(), so);
577 #endif /* MAC_SOCKET */
578
579 //### Attachment will create the per pcb lock if necessary and increase refcount
580 /*
581 * for creation, make sure it's done before
582 * socket is inserted in lists
583 */
584 so->so_usecount++;
585
586 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
587 if (error) {
588 /*
589 * Warning:
590 * If so_pcb is not zero, the socket will be leaked,
591 		 * so the protocol attachment handler must be coded carefully
592 */
593 so->so_state |= SS_NOFDREF;
594 so->so_usecount--;
595 sofreelastref(so, 1); /* will deallocate the socket */
596 return (error);
597 }
598 #ifdef __APPLE__
599 prp->pr_domain->dom_refs++;
600 TAILQ_INIT(&so->so_evlist);
601
602 /* Attach socket filters for this protocol */
603 sflt_initsock(so);
604 #if TCPDEBUG
605 if (tcpconsdebug == 2)
606 so->so_options |= SO_DEBUG;
607 #endif
608 #endif
609 so_set_default_traffic_class(so);
610 /*
611 * If this is a background thread/task, mark the socket as such.
612 */
613 #if !CONFIG_EMBEDDED
614 if (proc_get_self_isbackground() != 0)
615 #else /* !CONFIG_EMBEDDED */
616 thread = current_thread();
617 ut = get_bsdthread_info(thread);
618 if (uthread_get_background_state(ut))
619 #endif /* !CONFIG_EMBEDDED */
620 {
621 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
622 so->so_background_thread = current_thread();
623 }
624
625 switch (dom) {
626 /*
627 * Don't mark Unix domain sockets as eligible for defunct by default.
628 */
629 case PF_LOCAL:
630 so->so_flags |= SOF_NODEFUNCT;
631 break;
632 /*
633 * Radar 9119053
634 * Since v6 initialization is asynchronous and we can't hold
635 * up the main boot path, we need to at least hold off any
636 * sockets attempting to be created until the v6 stack is
637 * up and ready.
638 */
639 case PF_INET6:
640 if (in6_init_done == 0)
641 ip6_fin();
642 break;
643 default:
644 break;
645 }
646
647 *aso = so;
648 return (0);
649 }
650
651 /*
652 * Returns: 0 Success
653 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
654 * <pru_bind>:EAFNOSUPPORT Address family not supported
655 * <pru_bind>:EADDRNOTAVAIL Address not available.
656 * <pru_bind>:EINVAL Invalid argument
657 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
658 * <pru_bind>:EACCES Permission denied
659 * <pru_bind>:EADDRINUSE Address in use
660 * <pru_bind>:EAGAIN Resource unavailable, try again
661 * <pru_bind>:EPERM Operation not permitted
662 * <pru_bind>:???
663 * <sf_bind>:???
664 *
665 * Notes: It's not possible to fully enumerate the return codes above,
666 * since socket filter authors and protocol family authors may
667 * not choose to limit their error returns to those listed, even
668 * though this may result in some software operating incorrectly.
669 *
670 * The error codes which are enumerated above are those known to
671 * be returned by the tcp_usr_bind function supplied.
672 */
673 int
674 sobind(struct socket *so, struct sockaddr *nam)
675 {
676 struct proc *p = current_proc();
677 int error = 0;
678
679 socket_lock(so, 1);
680
681 so_update_last_owner_locked(so, p);
682
683 /*
684 * If this is a bind request on a socket that has been marked
685 * as inactive, reject it now before we go any further.
686 */
687 if (so->so_flags & SOF_DEFUNCT) {
688 error = EINVAL;
689 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
690 __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
691 error));
692 goto out;
693 }
694
695 /* Socket filter */
696 error = sflt_bind(so, nam);
697
698 if (error == 0)
699 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
700 out:
701 socket_unlock(so, 1);
702
703 if (error == EJUSTRETURN)
704 error = 0;
705
706 return (error);
707 }
708
709 void
710 sodealloc(struct socket *so)
711 {
712 /* Remove any filters */
713 sflt_termsock(so);
714
715 so->so_gencnt = ++so_gencnt;
716
717 #if CONFIG_MACF_SOCKET
718 mac_socket_label_destroy(so);
719 #endif /* MAC_SOCKET */
720 if (so->cached_in_sock_layer == 1) {
721 cached_sock_free(so);
722 } else {
723 if (so->cached_in_sock_layer == -1)
724 panic("sodealloc: double dealloc: so=%p\n", so);
725 so->cached_in_sock_layer = -1;
726 FREE_ZONE(so, sizeof (*so), so->so_zone);
727 }
728 }
729
730 /*
731 * Returns: 0 Success
732 * EINVAL
733 * EOPNOTSUPP
734 * <pru_listen>:EINVAL[AF_UNIX]
735 * <pru_listen>:EINVAL[TCP]
736 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
737 * <pru_listen>:EINVAL[TCP] Invalid argument
738 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
739 * <pru_listen>:EACCES[TCP] Permission denied
740 * <pru_listen>:EADDRINUSE[TCP] Address in use
741 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
742 * <pru_listen>:EPERM[TCP] Operation not permitted
743 * <sf_listen>:???
744 *
745 * Notes: Other <pru_listen> returns depend on the protocol family; all
746 * <sf_listen> returns depend on what the filter author causes
747 * their filter to return.
748 */
749 int
750 solisten(struct socket *so, int backlog)
751 {
752 struct proc *p = current_proc();
753 int error = 0;
754
755 socket_lock(so, 1);
756
757 so_update_last_owner_locked(so, p);
758
759 if (so->so_proto == NULL) {
760 error = EINVAL;
761 goto out;
762 }
763 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
764 error = EOPNOTSUPP;
765 goto out;
766 }
767
768 /*
769 * If the listen request is made on a socket that is not fully
770 * disconnected, or on a socket that has been marked as inactive,
771 * reject the request now.
772 */
773 if ((so->so_state &
774 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
775 (so->so_flags & SOF_DEFUNCT)) {
776 error = EINVAL;
777 if (so->so_flags & SOF_DEFUNCT) {
778 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
779 __func__, proc_pid(p), so, INP_SOCKAF(so),
780 INP_SOCKTYPE(so), error));
781 }
782 goto out;
783 }
784
785 if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
786 error = EPERM;
787 goto out;
788 }
789
790 error = sflt_listen(so);
791
792 if (error == 0) {
793 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
794 }
795
796 if (error) {
797 if (error == EJUSTRETURN)
798 error = 0;
799 goto out;
800 }
801
802 if (TAILQ_EMPTY(&so->so_comp))
803 so->so_options |= SO_ACCEPTCONN;
804 /*
805 * POSIX: The implementation may have an upper limit on the length of
806  * the listen queue, either global or per accepting socket. If backlog
807 * exceeds this limit, the length of the listen queue is set to the
808 * limit.
809 *
810 * If listen() is called with a backlog argument value that is less
811 * than 0, the function behaves as if it had been called with a backlog
812 * argument value of 0.
813 *
814 * A backlog argument of 0 may allow the socket to accept connections,
815 * in which case the length of the listen queue may be set to an
816 * implementation-defined minimum value.
817 */
818 if (backlog <= 0 || backlog > somaxconn)
819 backlog = somaxconn;
820
821 so->so_qlimit = backlog;
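/*
 * Example: with the default somaxconn of 128 (SOMAXCONN), listen(s, 1024),
 * listen(s, 0) and listen(s, -1) all leave so_qlimit at 128, while
 * listen(s, 5) leaves it at 5.
 */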
822 out:
823 socket_unlock(so, 1);
824 return (error);
825 }
826
827 void
828 sofreelastref(struct socket *so, int dealloc)
829 {
830 struct socket *head = so->so_head;
831
832 /* Assume socket is locked */
833
834 if ((!(so->so_flags & SOF_PCBCLEARING)) ||
835 ((so->so_state & SS_NOFDREF) == 0)) {
836 #ifdef __APPLE__
837 selthreadclear(&so->so_snd.sb_sel);
838 selthreadclear(&so->so_rcv.sb_sel);
839 so->so_rcv.sb_flags &= ~SB_UPCALL;
840 so->so_snd.sb_flags &= ~SB_UPCALL;
841 #endif
842 return;
843 }
844 if (head != NULL) {
845 socket_lock(head, 1);
846 if (so->so_state & SS_INCOMP) {
847 TAILQ_REMOVE(&head->so_incomp, so, so_list);
848 head->so_incqlen--;
849 } else if (so->so_state & SS_COMP) {
850 /*
851 * We must not decommission a socket that's
852 * on the accept(2) queue. If we do, then
853 * accept(2) may hang after select(2) indicated
854 * that the listening socket was ready.
855 */
856 #ifdef __APPLE__
857 selthreadclear(&so->so_snd.sb_sel);
858 selthreadclear(&so->so_rcv.sb_sel);
859 so->so_rcv.sb_flags &= ~SB_UPCALL;
860 so->so_snd.sb_flags &= ~SB_UPCALL;
861 #endif
862 socket_unlock(head, 1);
863 return;
864 } else {
865 panic("sofree: not queued");
866 }
867 head->so_qlen--;
868 so->so_state &= ~SS_INCOMP;
869 so->so_head = NULL;
870 socket_unlock(head, 1);
871 }
872 #ifdef __APPLE__
873 selthreadclear(&so->so_snd.sb_sel);
874 sbrelease(&so->so_snd);
875 #endif
876 sorflush(so);
877
878 /* 3932268: disable upcall */
879 so->so_rcv.sb_flags &= ~SB_UPCALL;
880 so->so_snd.sb_flags &= ~SB_UPCALL;
881
882 if (dealloc)
883 sodealloc(so);
884 }
885
886 void
887 soclose_wait_locked(struct socket *so)
888 {
889 lck_mtx_t *mutex_held;
890
891 if (so->so_proto->pr_getlock != NULL)
892 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
893 else
894 mutex_held = so->so_proto->pr_domain->dom_mtx;
895 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
896
897 /*
898 * Double check here and return if there's no outstanding upcall;
899 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
900 */
901 if (!(so->so_flags & SOF_UPCALLINUSE) ||
902 !(so->so_flags & SOF_UPCALLCLOSEWAIT))
903 return;
904
905 so->so_flags |= SOF_CLOSEWAIT;
906 (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
907 "soclose_wait_locked", NULL);
908 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
909 so->so_flags &= ~SOF_CLOSEWAIT;
910 }
911
912 /*
913 * Close a socket on last file table reference removal.
914 * Initiate disconnect if connected.
915 * Free socket when disconnect complete.
916 */
917 int
918 soclose_locked(struct socket *so)
919 {
920 int error = 0;
921 lck_mtx_t *mutex_held;
922 struct timespec ts;
923
924 if (so->so_usecount == 0) {
925 panic("soclose: so=%p refcount=0\n", so);
926 }
927
928 sflt_notify(so, sock_evt_closing, NULL);
929
930 if ((so->so_options & SO_ACCEPTCONN)) {
931 struct socket *sp, *sonext;
932 int socklock = 0;
933
934 /*
935 		 * We do not want new connections to be added
936 * to the connection queues
937 */
938 so->so_options &= ~SO_ACCEPTCONN;
939
940 for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
941 sonext = TAILQ_NEXT(sp, so_list);
942
943 /* Radar 5350314
944 			 * skip sockets thrown away by tcpdropdropblreq;
945 			 * they will get cleaned up by the garbage collection.
946 			 * Otherwise, remove the incomp socket from the queue
947 * and let soabort trigger the appropriate cleanup.
948 */
949 if (sp->so_flags & SOF_OVERFLOW)
950 continue;
951
952 if (so->so_proto->pr_getlock != NULL) {
953 			/* For lock ordering consistency with the rest of the stack,
954 			 * we lock the socket first and then grab the head.
955 			 */
956 socket_unlock(so, 0);
957 socket_lock(sp, 1);
958 socket_lock(so, 0);
959 socklock = 1;
960 }
961
962 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
963 so->so_incqlen--;
964
965 if (sp->so_state & SS_INCOMP) {
966 sp->so_state &= ~SS_INCOMP;
967 sp->so_head = NULL;
968
969 (void) soabort(sp);
970 }
971
972 if (socklock)
973 socket_unlock(sp, 1);
974 }
975
976 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
977 /* Dequeue from so_comp since sofree() won't do it */
978 TAILQ_REMOVE(&so->so_comp, sp, so_list);
979 so->so_qlen--;
980
981 if (so->so_proto->pr_getlock != NULL) {
982 socket_unlock(so, 0);
983 socket_lock(sp, 1);
984 }
985
986 if (sp->so_state & SS_COMP) {
987 sp->so_state &= ~SS_COMP;
988 sp->so_head = NULL;
989
990 (void) soabort(sp);
991 }
992
993 if (so->so_proto->pr_getlock != NULL) {
994 socket_unlock(sp, 1);
995 socket_lock(so, 0);
996 }
997 }
998 }
999 if (so->so_pcb == 0) {
1000 /* 3915887: mark the socket as ready for dealloc */
1001 so->so_flags |= SOF_PCBCLEARING;
1002 goto discard;
1003 }
1004 if (so->so_state & SS_ISCONNECTED) {
1005 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1006 error = sodisconnectlocked(so);
1007 if (error)
1008 goto drop;
1009 }
1010 if (so->so_options & SO_LINGER) {
1011 if ((so->so_state & SS_ISDISCONNECTING) &&
1012 (so->so_state & SS_NBIO))
1013 goto drop;
1014 if (so->so_proto->pr_getlock != NULL)
1015 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1016 else
1017 mutex_held = so->so_proto->pr_domain->dom_mtx;
1018 while (so->so_state & SS_ISCONNECTED) {
1019 ts.tv_sec = (so->so_linger/100);
1020 ts.tv_nsec = (so->so_linger % 100) *
1021 NSEC_PER_USEC * 1000 * 10;
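/*
 * so_linger is in hundredths of a second here: e.g. a value
 * of 250 yields ts = { 2, 500000000 }, i.e. a 2.5 second wait
 * per msleep() below (NSEC_PER_USEC * 1000 * 10 is 10 ms in
 * nanoseconds).
 */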
1022 error = msleep((caddr_t)&so->so_timeo,
1023 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1024 if (error) {
1025 /*
1026 					 * It's OK when the timer fires,
1027 * don't report an error
1028 */
1029 if (error == EWOULDBLOCK)
1030 error = 0;
1031 break;
1032 }
1033 }
1034 }
1035 }
1036 drop:
1037 if (so->so_usecount == 0)
1038 panic("soclose: usecount is zero so=%p\n", so);
1039 if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
1040 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1041 if (error == 0)
1042 error = error2;
1043 }
1044 if (so->so_usecount <= 0)
1045 panic("soclose: usecount is zero so=%p\n", so);
1046 discard:
1047 if (so->so_pcb && so->so_state & SS_NOFDREF)
1048 panic("soclose: NOFDREF");
1049 so->so_state |= SS_NOFDREF;
1050 #ifdef __APPLE__
1051 so->so_proto->pr_domain->dom_refs--;
1052 evsofree(so);
1053 #endif
1054 so->so_usecount--;
1055 sofree(so);
1056 return (error);
1057 }
1058
1059 int
1060 soclose(struct socket *so)
1061 {
1062 int error = 0;
1063 socket_lock(so, 1);
1064
1065 if (so->so_flags & SOF_UPCALLINUSE)
1066 soclose_wait_locked(so);
1067
1068 if (so->so_retaincnt == 0) {
1069 error = soclose_locked(so);
1070 } else {
1071 /*
1072 		 * If the FD is going away but the socket is
1073 		 * retained in the kernel, remove its reference.
1074 */
1075 so->so_usecount--;
1076 if (so->so_usecount < 2)
1077 panic("soclose: retaincnt non null and so=%p "
1078 "usecount=%d\n", so, so->so_usecount);
1079 }
1080 socket_unlock(so, 1);
1081 return (error);
1082 }
1083
1084 /*
1085 * Must be called at splnet...
1086 */
1087 /* Should already be locked */
1088 int
1089 soabort(struct socket *so)
1090 {
1091 int error;
1092
1093 #ifdef MORE_LOCKING_DEBUG
1094 lck_mtx_t *mutex_held;
1095
1096 if (so->so_proto->pr_getlock != NULL)
1097 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1098 else
1099 mutex_held = so->so_proto->pr_domain->dom_mtx;
1100 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1101 #endif
1102
1103 if ((so->so_flags & SOF_ABORTED) == 0) {
1104 so->so_flags |= SOF_ABORTED;
1105 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1106 if (error) {
1107 sofree(so);
1108 return (error);
1109 }
1110 }
1111 return (0);
1112 }
1113
1114 int
1115 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1116 {
1117 int error;
1118
1119 if (dolock)
1120 socket_lock(so, 1);
1121
1122 if ((so->so_state & SS_NOFDREF) == 0)
1123 panic("soaccept: !NOFDREF");
1124 so->so_state &= ~SS_NOFDREF;
1125 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1126
1127 if (dolock)
1128 socket_unlock(so, 1);
1129 return (error);
1130 }
1131
1132 int
1133 soaccept(struct socket *so, struct sockaddr **nam)
1134 {
1135 return (soacceptlock(so, nam, 1));
1136 }
1137
1138 int
1139 soacceptfilter(struct socket *so)
1140 {
1141 struct sockaddr *local = NULL, *remote = NULL;
1142 int error = 0;
1143 struct socket *head = so->so_head;
1144
1145 /*
1146 * Hold the lock even if this socket
1147 * has not been made visible to the filter(s).
1148 	 * For sockets with global locks, this protects against the
1149 	 * head or peer going away.
1150 */
1151 socket_lock(so, 1);
1152 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1153 sogetaddr_locked(so, &local, 0) != 0) {
1154 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1155 so->so_head = NULL;
1156 socket_unlock(so, 1);
1157 soclose(so);
1158 /* Out of resources; try it again next time */
1159 error = ECONNABORTED;
1160 goto done;
1161 }
1162
1163 error = sflt_accept(head, so, local, remote);
1164
1165 /*
1166 * If we get EJUSTRETURN from one of the filters, mark this socket
1167 * as inactive and return it anyway. This newly accepted socket
1168 * will be disconnected later before we hand it off to the caller.
1169 */
1170 if (error == EJUSTRETURN) {
1171 error = 0;
1172 (void) sosetdefunct(current_proc(), so,
1173 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1174 }
1175
1176 if (error != 0) {
1177 /*
1178 * This may seem like a duplication to the above error
1179 * handling part when we return ECONNABORTED, except
1180 * the following is done while holding the lock since
1181 * the socket has been exposed to the filter(s) earlier.
1182 */
1183 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1184 so->so_head = NULL;
1185 socket_unlock(so, 1);
1186 soclose(so);
1187 /* Propagate socket filter's error code to the caller */
1188 } else {
1189 socket_unlock(so, 1);
1190 }
1191 done:
1192 /* Callee checks for NULL pointer */
1193 sock_freeaddr(remote);
1194 sock_freeaddr(local);
1195 return (error);
1196 }
1197
1198 /*
1199 * Returns: 0 Success
1200 * EOPNOTSUPP Operation not supported on socket
1201 * EISCONN Socket is connected
1202 * <pru_connect>:EADDRNOTAVAIL Address not available.
1203 * <pru_connect>:EINVAL Invalid argument
1204 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1205 * <pru_connect>:EACCES Permission denied
1206 * <pru_connect>:EADDRINUSE Address in use
1207 * <pru_connect>:EAGAIN Resource unavailable, try again
1208 * <pru_connect>:EPERM Operation not permitted
1209 * <sf_connect_out>:??? [anything a filter writer might set]
1210 */
1211 int
1212 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1213 {
1214 int error;
1215 struct proc *p = current_proc();
1216
1217 if (dolock)
1218 socket_lock(so, 1);
1219
1220 so_update_last_owner_locked(so, p);
1221
1222 /*
1223 * If this is a listening socket or if this is a previously-accepted
1224 * socket that has been marked as inactive, reject the connect request.
1225 */
1226 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1227 error = EOPNOTSUPP;
1228 if (so->so_flags & SOF_DEFUNCT) {
1229 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
1230 __func__, proc_pid(p), so, INP_SOCKAF(so),
1231 INP_SOCKTYPE(so), error));
1232 }
1233 if (dolock)
1234 socket_unlock(so, 1);
1235 return (error);
1236 }
1237
1238 if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
1239 if (dolock)
1240 socket_unlock(so, 1);
1241 return (EPERM);
1242 }
1243
1244 /*
1245 * If protocol is connection-based, can only connect once.
1246 * Otherwise, if connected, try to disconnect first.
1247 * This allows user to disconnect by connecting to, e.g.,
1248 * a null address.
1249 */
1250 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1251 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1252 (error = sodisconnectlocked(so)))) {
1253 error = EISCONN;
1254 } else {
1255 /*
1256 * Run connect filter before calling protocol:
1257 * - non-blocking connect returns before completion;
1258 */
1259 error = sflt_connectout(so, nam);
1260
1261 if (error) {
1262 if (error == EJUSTRETURN)
1263 error = 0;
1264 } else {
1265 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
1266 }
1267 }
1268 if (dolock)
1269 socket_unlock(so, 1);
1270 return (error);
1271 }
1272
1273 int
1274 soconnect(struct socket *so, struct sockaddr *nam)
1275 {
1276 return (soconnectlock(so, nam, 1));
1277 }
1278
1279 /*
1280 * Returns: 0 Success
1281 * <pru_connect2>:EINVAL[AF_UNIX]
1282 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1283 * <pru_connect2>:??? [other protocol families]
1284 *
1285 * Notes: <pru_connect2> is not supported by [TCP].
1286 */
1287 int
1288 soconnect2(struct socket *so1, struct socket *so2)
1289 {
1290 int error;
1291
1292 socket_lock(so1, 1);
1293 if (so2->so_proto->pr_lock)
1294 socket_lock(so2, 1);
1295
1296 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1297
1298 socket_unlock(so1, 1);
1299 if (so2->so_proto->pr_lock)
1300 socket_unlock(so2, 1);
1301 return (error);
1302 }
1303
1304 int
1305 sodisconnectlocked(struct socket *so)
1306 {
1307 int error;
1308
1309 if ((so->so_state & SS_ISCONNECTED) == 0) {
1310 error = ENOTCONN;
1311 goto bad;
1312 }
1313 if (so->so_state & SS_ISDISCONNECTING) {
1314 error = EALREADY;
1315 goto bad;
1316 }
1317
1318 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1319
1320 if (error == 0) {
1321 sflt_notify(so, sock_evt_disconnected, NULL);
1322 }
1323 bad:
1324 return (error);
1325 }
1326
1327 /* Locking version */
1328 int
1329 sodisconnect(struct socket *so)
1330 {
1331 int error;
1332
1333 socket_lock(so, 1);
1334 error = sodisconnectlocked(so);
1335 socket_unlock(so, 1);
1336 return (error);
1337 }
1338
1339 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1340
1341 /*
1342 * sosendcheck will lock the socket buffer if it isn't locked and
1343 * verify that there is space for the data being inserted.
1344 *
1345 * Returns: 0 Success
1346 * EPIPE
1347 * sblock:EWOULDBLOCK
1348 * sblock:EINTR
1349 * sbwait:EBADF
1350 * sbwait:EINTR
1351 * [so_error]:???
1352 */
1353 static int
1354 sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
1355 int32_t atomic, int flags, int *sblocked)
1356 {
1357 int error = 0;
1358 int32_t space;
1359 int assumelock = 0;
1360
1361 restart:
1362 if (*sblocked == 0) {
1363 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1364 so->so_send_filt_thread != 0 &&
1365 so->so_send_filt_thread == current_thread()) {
1366 /*
1367 * We're being called recursively from a filter,
1368 * allow this to continue. Radar 4150520.
1369 * Don't set sblocked because we don't want
1370 * to perform an unlock later.
1371 */
1372 assumelock = 1;
1373 } else {
1374 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1375 if (error) {
1376 if (so->so_flags & SOF_DEFUNCT)
1377 goto defunct;
1378 return (error);
1379 }
1380 *sblocked = 1;
1381 }
1382 }
1383
1384 /*
1385 * If a send attempt is made on a socket that has been marked
1386 * as inactive (disconnected), reject the request.
1387 */
1388 if (so->so_flags & SOF_DEFUNCT) {
1389 defunct:
1390 error = EPIPE;
1391 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1392 proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
1393 error));
1394 return (error);
1395 }
1396
1397 if (so->so_state & SS_CANTSENDMORE)
1398 return (EPIPE);
1399
1400 if (so->so_error) {
1401 error = so->so_error;
1402 so->so_error = 0;
1403 return (error);
1404 }
1405
1406 if ((so->so_state & SS_ISCONNECTED) == 0) {
1407 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1408 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1409 !(resid == 0 && clen != 0))
1410 return (ENOTCONN);
1411 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1412 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1413 ENOTCONN : EDESTADDRREQ);
1414 }
1415 }
1416 space = sbspace(&so->so_snd);
1417 if (flags & MSG_OOB)
1418 space += 1024;
1419 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1420 clen > so->so_snd.sb_hiwat)
1421 return (EMSGSIZE);
1422 if (space < resid + clen &&
1423 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
1424 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1425 assumelock) {
1426 return (EWOULDBLOCK);
1427 }
1428 sbunlock(&so->so_snd, 1);
1429 *sblocked = 0;
1430 error = sbwait(&so->so_snd);
1431 if (error) {
1432 if (so->so_flags & SOF_DEFUNCT)
1433 goto defunct;
1434 return (error);
1435 }
1436 goto restart;
1437 }
1438
1439 return (0);
1440 }
1441
1442 /*
1443 * Send on a socket.
1444 * If send must go all at once and message is larger than
1445 * send buffering, then hard error.
1446 * Lock against other senders.
1447 * If must go all at once and not enough room now, then
1448 * inform user that this would block and do nothing.
1449 * Otherwise, if nonblocking, send as much as possible.
1450 * The data to be sent is described by "uio" if nonzero,
1451 * otherwise by the mbuf chain "top" (which must be null
1452 * if uio is not). Data provided in mbuf chain must be small
1453 * enough to send all at once.
1454 *
1455 * Returns nonzero on error, timeout or signal; callers
1456 * must check for short counts if EINTR/ERESTART are returned.
1457 * Data and control buffers are freed on return.
1458 * Experiment:
1459 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1460 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1461 * point at the mbuf chain being constructed and go from there.
1462 *
1463 * Returns: 0 Success
1464 * EOPNOTSUPP
1465 * EINVAL
1466 * ENOBUFS
1467 * uiomove:EFAULT
1468 * sosendcheck:EPIPE
1469 * sosendcheck:EWOULDBLOCK
1470 * sosendcheck:EINTR
1471 * sosendcheck:EBADF
1472 * sosendcheck:EINTR
1473 * sosendcheck:??? [value from so_error]
1474 * <pru_send>:ECONNRESET[TCP]
1475 * <pru_send>:EINVAL[TCP]
1476 * <pru_send>:ENOBUFS[TCP]
1477 * <pru_send>:EADDRINUSE[TCP]
1478 * <pru_send>:EADDRNOTAVAIL[TCP]
1479 * <pru_send>:EAFNOSUPPORT[TCP]
1480 * <pru_send>:EACCES[TCP]
1481 * <pru_send>:EAGAIN[TCP]
1482 * <pru_send>:EPERM[TCP]
1483 * <pru_send>:EMSGSIZE[TCP]
1484 * <pru_send>:EHOSTUNREACH[TCP]
1485 * <pru_send>:ENETUNREACH[TCP]
1486 * <pru_send>:ENETDOWN[TCP]
1487 * <pru_send>:ENOMEM[TCP]
1488 * <pru_send>:ENOBUFS[TCP]
1489 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1490 * <pru_send>:EINVAL[AF_UNIX]
1491 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1492 * <pru_send>:EPIPE[AF_UNIX]
1493 * <pru_send>:ENOTCONN[AF_UNIX]
1494 * <pru_send>:EISCONN[AF_UNIX]
1495 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1496 * <sf_data_out>:??? [whatever a filter author chooses]
1497 *
1498 * Notes: Other <pru_send> returns depend on the protocol family; all
1499 * <sf_data_out> returns depend on what the filter author causes
1500 * their filter to return.
1501 */
1502 int
1503 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1504 struct mbuf *top, struct mbuf *control, int flags)
1505 {
1506 struct mbuf **mp;
1507 register struct mbuf *m, *freelist = NULL;
1508 register int32_t space, len, resid;
1509 int clen = 0, error, dontroute, mlen, sendflags;
1510 int atomic = sosendallatonce(so) || top;
1511 int sblocked = 0;
1512 struct proc *p = current_proc();
1513
1514 if (uio) {
1515 // LP64todo - fix this!
1516 resid = uio_resid(uio);
1517 } else {
1518 resid = top->m_pkthdr.len;
1519 }
1520 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1521 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1522
1523 socket_lock(so, 1);
1524 so_update_last_owner_locked(so, p);
1525
1526 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1527 error = EOPNOTSUPP;
1528 socket_unlock(so, 1);
1529 goto out;
1530 }
1531
1532 /*
1533 * In theory resid should be unsigned.
1534 * However, space must be signed, as it might be less than 0
1535 * if we over-committed, and we must use a signed comparison
1536 * of space and resid. On the other hand, a negative resid
1537 * causes us to loop sending 0-length segments to the protocol.
1538 *
1539 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1540 * type sockets since that's an error.
1541 */
1542 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1543 error = EINVAL;
1544 socket_unlock(so, 1);
1545 goto out;
1546 }
1547
1548 dontroute =
1549 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1550 (so->so_proto->pr_flags & PR_ATOMIC);
1551 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1552 if (control)
1553 clen = control->m_len;
1554
1555 do {
1556 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1557 &sblocked);
1558 if (error) {
1559 goto release;
1560 }
1561 mp = &top;
1562 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
1563 1024 : 0);
1564
1565 do {
1566 if (uio == NULL) {
1567 /*
1568 * Data is prepackaged in "top".
1569 */
1570 resid = 0;
1571 if (flags & MSG_EOR)
1572 top->m_flags |= M_EOR;
1573 } else {
1574 int chainlength;
1575 int bytes_to_copy;
1576 boolean_t jumbocl;
1577
1578 bytes_to_copy = imin(resid, space);
1579
1580 if (sosendminchain > 0) {
1581 chainlength = 0;
1582 } else {
1583 chainlength = sosendmaxchain;
1584 }
1585
1586 /*
1587 * Attempt to use larger than system page-size
1588 * clusters for large writes only if there is
1589 * a jumbo cluster pool and if the socket is
1590 * marked accordingly.
1591 */
1592 jumbocl = sosendjcl && njcl > 0 &&
1593 ((so->so_flags & SOF_MULTIPAGES) ||
1594 sosendjcl_ignore_capab);
1595
1596 socket_unlock(so, 0);
1597
1598 do {
1599 int num_needed;
1600 int hdrs_needed = (top == 0) ? 1 : 0;
1601
1602 /*
1603 				 * Try to maintain a local cache of mbuf
1604 				 * clusters needed to complete this
1605 				 * write; the list is further limited to
1606 				 * the number that are currently needed
1607 				 * to fill the socket. This mechanism
1608 				 * allows a large number of mbufs/
1609 				 * clusters to be grabbed under a single
1610 				 * mbuf lock... if we can't get any
1611 				 * clusters, then fall back to trying
1612 				 * for mbufs. If we fail early (or
1613 				 * miscalculate the number needed), make
1614 * sure to release any clusters we
1615 * haven't yet consumed.
1616 */
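/*
 * Allocation strategy below, largest size first: 16 KB jumbo clusters
 * (M16KCLBYTES, only when jumbocl), then 4 KB big clusters (MBIGCLBYTES),
 * then 2 KB clusters (MCLBYTES), and finally a single mbuf.
 */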
1617 if (freelist == NULL &&
1618 bytes_to_copy > MBIGCLBYTES &&
1619 jumbocl) {
1620 num_needed =
1621 bytes_to_copy / M16KCLBYTES;
1622
1623 if ((bytes_to_copy -
1624 (num_needed * M16KCLBYTES))
1625 >= MINCLSIZE)
1626 num_needed++;
1627
1628 freelist =
1629 m_getpackets_internal(
1630 (unsigned int *)&num_needed,
1631 hdrs_needed, M_WAIT, 0,
1632 M16KCLBYTES);
1633 /*
1634 * Fall back to 4K cluster size
1635 * if allocation failed
1636 */
1637 }
1638
1639 if (freelist == NULL &&
1640 bytes_to_copy > MCLBYTES) {
1641 num_needed =
1642 bytes_to_copy / MBIGCLBYTES;
1643
1644 if ((bytes_to_copy -
1645 (num_needed * MBIGCLBYTES)) >=
1646 MINCLSIZE)
1647 num_needed++;
1648
1649 freelist =
1650 m_getpackets_internal(
1651 (unsigned int *)&num_needed,
1652 hdrs_needed, M_WAIT, 0,
1653 MBIGCLBYTES);
1654 /*
1655 * Fall back to cluster size
1656 * if allocation failed
1657 */
1658 }
1659
1660 if (freelist == NULL &&
1661 bytes_to_copy > MINCLSIZE) {
1662 num_needed =
1663 bytes_to_copy / MCLBYTES;
1664
1665 if ((bytes_to_copy -
1666 (num_needed * MCLBYTES)) >=
1667 MINCLSIZE)
1668 num_needed++;
1669
1670 freelist =
1671 m_getpackets_internal(
1672 (unsigned int *)&num_needed,
1673 hdrs_needed, M_WAIT, 0,
1674 MCLBYTES);
1675 /*
1676 * Fall back to a single mbuf
1677 * if allocation failed
1678 */
1679 }
1680
1681 if (freelist == NULL) {
1682 if (top == 0)
1683 MGETHDR(freelist,
1684 M_WAIT, MT_DATA);
1685 else
1686 MGET(freelist,
1687 M_WAIT, MT_DATA);
1688
1689 if (freelist == NULL) {
1690 error = ENOBUFS;
1691 socket_lock(so, 0);
1692 goto release;
1693 }
1694 /*
1695 * For datagram protocols,
1696 * leave room for protocol
1697 * headers in first mbuf.
1698 */
1699 if (atomic && top == 0 &&
1700 bytes_to_copy < MHLEN) {
1701 MH_ALIGN(freelist,
1702 bytes_to_copy);
1703 }
1704 }
1705 m = freelist;
1706 freelist = m->m_next;
1707 m->m_next = NULL;
1708
1709 if ((m->m_flags & M_EXT))
1710 mlen = m->m_ext.ext_size;
1711 else if ((m->m_flags & M_PKTHDR))
1712 mlen =
1713 MHLEN - m_leadingspace(m);
1714 else
1715 mlen = MLEN;
1716 len = imin(mlen, bytes_to_copy);
1717
1718 chainlength += len;
1719
1720 space -= len;
1721
1722 error = uiomove(mtod(m, caddr_t),
1723 len, uio);
1724
1725 resid = uio_resid(uio);
1726
1727 m->m_len = len;
1728 *mp = m;
1729 top->m_pkthdr.len += len;
1730 if (error)
1731 break;
1732 mp = &m->m_next;
1733 if (resid <= 0) {
1734 if (flags & MSG_EOR)
1735 top->m_flags |= M_EOR;
1736 break;
1737 }
1738 bytes_to_copy = min(resid, space);
1739
1740 } while (space > 0 &&
1741 (chainlength < sosendmaxchain || atomic ||
1742 resid < MINCLSIZE));
1743
1744 socket_lock(so, 0);
1745
1746 if (error)
1747 goto release;
1748 }
1749
1750 if (flags & (MSG_HOLD|MSG_SEND)) {
1751 /* Enqueue for later, go away if HOLD */
1752 register struct mbuf *mb1;
1753 if (so->so_temp && (flags & MSG_FLUSH)) {
1754 m_freem(so->so_temp);
1755 so->so_temp = NULL;
1756 }
1757 if (so->so_temp)
1758 so->so_tail->m_next = top;
1759 else
1760 so->so_temp = top;
1761 mb1 = top;
1762 while (mb1->m_next)
1763 mb1 = mb1->m_next;
1764 so->so_tail = mb1;
1765 if (flags & MSG_HOLD) {
1766 top = NULL;
1767 goto release;
1768 }
1769 top = so->so_temp;
1770 }
1771 if (dontroute)
1772 so->so_options |= SO_DONTROUTE;
1773
1774 /* Compute flags here, for pru_send and NKEs */
1775 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1776 /*
1777 * If the user set MSG_EOF, the protocol
1778 			 * understands this flag and there is nothing left
1779 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
1780 */
1781 ((flags & MSG_EOF) &&
1782 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1783 (resid <= 0)) ?
1784 PRUS_EOF :
1785 /* If there is more to send set PRUS_MORETOCOME */
1786 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1787
1788 /*
1789 * Socket filter processing
1790 */
1791 error = sflt_data_out(so, addr, &top, &control,
1792 (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0);
1793 if (error) {
1794 if (error == EJUSTRETURN) {
1795 error = 0;
1796 clen = 0;
1797 control = 0;
1798 top = 0;
1799 }
1800
1801 goto release;
1802 }
1803 /*
1804 * End Socket filter processing
1805 */
1806
1807 error = (*so->so_proto->pr_usrreqs->pru_send)
1808 (so, sendflags, top, addr, control, p);
1809 #ifdef __APPLE__
1810 if (flags & MSG_SEND)
1811 so->so_temp = NULL;
1812 #endif
1813 if (dontroute)
1814 so->so_options &= ~SO_DONTROUTE;
1815
1816 clen = 0;
1817 control = 0;
1818 top = 0;
1819 mp = &top;
1820 if (error)
1821 goto release;
1822 } while (resid && space > 0);
1823 } while (resid);
1824
1825 release:
1826 if (sblocked)
1827 sbunlock(&so->so_snd, 0); /* will unlock socket */
1828 else
1829 socket_unlock(so, 1);
1830 out:
1831 if (top)
1832 m_freem(top);
1833 if (control)
1834 m_freem(control);
1835 if (freelist)
1836 m_freem_list(freelist);
1837
1838 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
1839 space, error);
1840
1841 return (error);
1842 }
1843
1844 /*
1845 * Implement receive operations on a socket.
1846 * We depend on the way that records are added to the sockbuf
1847 * by sbappend*. In particular, each record (mbufs linked through m_next)
1848 * must begin with an address if the protocol so specifies,
1849 * followed by an optional mbuf or mbufs containing ancillary data,
1850 * and then zero or more mbufs of data.
1851 * In order to avoid blocking network interrupts for the entire time here,
1852 * we splx() while doing the actual copy to user space.
1853 * Although the sockbuf is locked, new data may still be appended,
1854 * and thus we must maintain consistency of the sockbuf during that time.
1855 *
1856 * The caller may receive the data as a single mbuf chain by supplying
1857 * an mbuf **mp0 for use in returning the chain. The uio is then used
1858 * only for the count in uio_resid.
1859 *
1860 * Returns: 0 Success
1861 * ENOBUFS
1862 * ENOTCONN
1863 * EWOULDBLOCK
1864 * uiomove:EFAULT
1865 * sblock:EWOULDBLOCK
1866 * sblock:EINTR
1867 * sbwait:EBADF
1868 * sbwait:EINTR
1869 * sodelayed_copy:EFAULT
1870 * <pru_rcvoob>:EINVAL[TCP]
1871 * <pru_rcvoob>:EWOULDBLOCK[TCP]
1872 * <pru_rcvoob>:???
1873 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1874 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1875 * <pr_domain->dom_externalize>:???
1876 *
1877 * Notes: Additional return values from calls through <pru_rcvoob> and
1878 * <pr_domain->dom_externalize> depend on protocols other than
1879 * TCP or AF_UNIX, which are documented above.
1880 */
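/*
 * Illustrative receive-buffer layout for a protocol that supplies addresses
 * (e.g. UDP); within a record mbufs are linked by m_next, and records are
 * linked by m_nextpkt on the first mbuf of each record:
 *
 *	so_rcv.sb_mb -> [MT_SONAME] -> [MT_CONTROL]... -> [MT_DATA]...
 *	      |
 *	  m_nextpkt -> next record
 */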
1881 int
1882 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1883 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1884 {
1885 register struct mbuf *m, **mp, *ml = NULL;
1886 register int flags, len, error, offset;
1887 struct protosw *pr = so->so_proto;
1888 struct mbuf *nextrecord;
1889 int moff, type = 0;
1890 int orig_resid = uio_resid(uio);
1891 struct mbuf *free_list;
1892 int delayed_copy_len;
1893 int can_delay;
1894 int need_event;
1895 struct proc *p = current_proc();
1896
1897 // LP64todo - fix this!
1898 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1899 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1900
1901 socket_lock(so, 1);
1902 so_update_last_owner_locked(so, p);
1903
1904 #ifdef MORE_LOCKING_DEBUG
1905 if (so->so_usecount == 1)
1906 panic("soreceive: so=%x no other reference on socket\n", so);
1907 #endif
1908 mp = mp0;
1909 if (psa)
1910 *psa = 0;
1911 if (controlp)
1912 *controlp = 0;
1913 if (flagsp)
1914 flags = *flagsp &~ MSG_EOR;
1915 else
1916 flags = 0;
1917
1918 /*
1919 * If a recv attempt is made on a previously-accepted socket
1920 * that has been marked as inactive (disconnected), reject
1921 * the request.
1922 */
1923 if (so->so_flags & SOF_DEFUNCT) {
1924 struct sockbuf *sb = &so->so_rcv;
1925
1926 error = ENOTCONN;
1927 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1928 proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error));
1929 /*
1930 * This socket should have been disconnected and flushed
1931 * prior to being returned from sodefunct(); there should
1932 * be no data on its receive list, so panic otherwise.
1933 */
1934 if (so->so_state & SS_DEFUNCT)
1935 sb_empty_assert(sb, __func__);
1936 socket_unlock(so, 1);
1937 return (error);
1938 }
1939
1940 /*
1941 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1942 	 * regardless of the flags argument. Here is the case where
1943 * out-of-band data is not inline.
1944 */
1945 if ((flags & MSG_OOB) ||
1946 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1947 (so->so_options & SO_OOBINLINE) == 0 &&
1948 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1949 m = m_get(M_WAIT, MT_DATA);
1950 if (m == NULL) {
1951 socket_unlock(so, 1);
1952 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1953 ENOBUFS, 0, 0, 0, 0);
1954 return (ENOBUFS);
1955 }
1956 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1957 if (error)
1958 goto bad;
1959 socket_unlock(so, 0);
1960 do {
1961 error = uiomove(mtod(m, caddr_t),
1962 imin(uio_resid(uio), m->m_len), uio);
1963 m = m_free(m);
1964 } while (uio_resid(uio) && error == 0 && m);
1965 socket_lock(so, 0);
1966 bad:
1967 if (m)
1968 m_freem(m);
1969 #ifdef __APPLE__
1970 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1971 if (error == EWOULDBLOCK || error == EINVAL) {
1972 /*
1973 * Let's try to get normal data:
1974 * EWOULDBLOCK: out-of-band data not
1975 				 * received yet. EINVAL: out-of-band data
1976 * already read.
1977 */
1978 error = 0;
1979 goto nooob;
1980 } else if (error == 0 && flagsp) {
1981 *flagsp |= MSG_OOB;
1982 }
1983 }
1984 socket_unlock(so, 1);
1985 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
1986 0, 0, 0, 0);
1987 #endif
1988 return (error);
1989 }
1990 nooob:
1991 if (mp)
1992 *mp = (struct mbuf *)0;
1993 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
1994 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1995
1996
1997 free_list = (struct mbuf *)0;
1998 delayed_copy_len = 0;
1999 restart:
2000 #ifdef MORE_LOCKING_DEBUG
2001 if (so->so_usecount <= 1)
2002 printf("soreceive: sblock so=%p ref=%d on socket\n",
2003 so, so->so_usecount);
2004 #endif
2005 /*
2006 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2007 * and if so just return to the caller. This could happen when
2008 * soreceive() is called by a socket upcall function during the
2009 * time the socket is freed. The socket buffer would have been
2010 * locked across the upcall, therefore we cannot put this thread
2011 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2012 * we may livelock), because the lock on the socket buffer will
2013 * only be released when the upcall routine returns to its caller.
2014 * Because the socket has been officially closed, there can be
2015 * no further read on it.
2016 */
2017 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2018 (SS_NOFDREF | SS_CANTRCVMORE)) {
2019 socket_unlock(so, 1);
2020 return (0);
2021 }
2022
2023 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2024 if (error) {
2025 socket_unlock(so, 1);
2026 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2027 0, 0, 0, 0);
2028 return (error);
2029 }
2030
2031 m = so->so_rcv.sb_mb;
2032 /*
2033 * If we have less data than requested, block awaiting more
2034 * (subject to any timeout) if:
2035 * 1. the current count is less than the low water mark, or
2036 * 2. MSG_WAITALL is set, and it is possible to do the entire
2037 * receive operation at once if we block (resid <= hiwat), and
2038 * 3. MSG_DONTWAIT is not set.
2039 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2040 * we have to do the receive in sections, and thus risk returning
2041 * a short count if a timeout or signal occurs after we start.
2042 */
2043 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2044 so->so_rcv.sb_cc < uio_resid(uio)) &&
2045 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2046 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2047 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2048 /*
2049 * Panic if we notice inconsistencies in the socket's
2050 * receive list; both sb_mb and sb_cc should correctly
2051 * reflect the contents of the list, otherwise we may
2052 * end up with false positives during select() or poll()
2053 * which could put the application in a bad state.
2054 */
2055 if (m == NULL && so->so_rcv.sb_cc != 0)
2056 panic("soreceive corrupted so_rcv: m %p cc %u",
2057 m, so->so_rcv.sb_cc);
2058
2059 if (so->so_error) {
2060 if (m)
2061 goto dontblock;
2062 error = so->so_error;
2063 if ((flags & MSG_PEEK) == 0)
2064 so->so_error = 0;
2065 goto release;
2066 }
2067 if (so->so_state & SS_CANTRCVMORE) {
2068 if (m)
2069 goto dontblock;
2070 else
2071 goto release;
2072 }
2073 for (; m; m = m->m_next)
2074 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2075 m = so->so_rcv.sb_mb;
2076 goto dontblock;
2077 }
2078 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2079 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2080 error = ENOTCONN;
2081 goto release;
2082 }
2083 if (uio_resid(uio) == 0)
2084 goto release;
2085 if ((so->so_state & SS_NBIO) ||
2086 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2087 error = EWOULDBLOCK;
2088 goto release;
2089 }
2090 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2091 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2092 sbunlock(&so->so_rcv, 1);
2093 #if EVEN_MORE_LOCKING_DEBUG
2094 if (socket_debug)
2095 printf("Waiting for socket data\n");
2096 #endif
2097
2098 error = sbwait(&so->so_rcv);
2099 #if EVEN_MORE_LOCKING_DEBUG
2100 if (socket_debug)
2101 printf("SORECEIVE - sbwait returned %d\n", error);
2102 #endif
2103 if (so->so_usecount < 1)
2104 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2105 "socket\n", so, so->so_usecount);
2106 if (error) {
2107 socket_unlock(so, 1);
2108 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2109 0, 0, 0, 0);
2110 return (error);
2111 }
2112 goto restart;
2113 }
2114 dontblock:
2115 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2116 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2117 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2118 nextrecord = m->m_nextpkt;
2119 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2120 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2121 #if CONFIG_MACF_SOCKET_SUBSET
2122 /*
2123 * Call the MAC framework for policy checking if we're in
2124 * the user process context and the socket isn't connected.
2125 */
2126 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2127 struct mbuf *m0 = m;
2128 /*
2129 * Dequeue this record (temporarily) from the receive
2130 * list since we're about to drop the socket's lock
2131 * where a new record may arrive and be appended to
2132 * the list. Upon MAC policy failure, the record
2133 * will be freed. Otherwise, we'll add it back to
2134 * the head of the list. We cannot rely on SB_LOCK
2135 * because the append operation uses the socket's lock.
2136 */
2137 do {
2138 m->m_nextpkt = NULL;
2139 sbfree(&so->so_rcv, m);
2140 m = m->m_next;
2141 } while (m != NULL);
2142 m = m0;
2143 so->so_rcv.sb_mb = nextrecord;
2144 SB_EMPTY_FIXUP(&so->so_rcv);
2145 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2146 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2147 socket_unlock(so, 0);
2148 if (mac_socket_check_received(proc_ucred(p), so,
2149 mtod(m, struct sockaddr *)) != 0) {
2150 /*
2151 * MAC policy failure; free this record and
2152 * process the next record (or block until
2153 * one is available). We have adjusted sb_cc
2154 * and sb_mbcnt above so there is no need to
2155 * call sbfree() again.
2156 */
2157 do {
2158 m = m_free(m);
2159 } while (m != NULL);
2160 /*
2161 * Clear SB_LOCK but don't unlock the socket.
2162 * Process the next record or wait for one.
2163 */
2164 socket_lock(so, 0);
2165 sbunlock(&so->so_rcv, 1);
2166 goto restart;
2167 }
2168 socket_lock(so, 0);
2169 /*
2170 * If the socket has been defunct'd, drop it.
2171 */
2172 if (so->so_flags & SOF_DEFUNCT) {
2173 m_freem(m);
2174 error = ENOTCONN;
2175 goto release;
2176 }
2177 /*
2178 * Re-adjust the socket receive list and re-enqueue
2179 * the record in front of any packets which may have
2180 * been appended while we dropped the lock.
2181 */
2182 for (m = m0; m->m_next != NULL; m = m->m_next)
2183 sballoc(&so->so_rcv, m);
2184 sballoc(&so->so_rcv, m);
2185 if (so->so_rcv.sb_mb == NULL) {
2186 so->so_rcv.sb_lastrecord = m0;
2187 so->so_rcv.sb_mbtail = m;
2188 }
2189 m = m0;
2190 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2191 so->so_rcv.sb_mb = m;
2192 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2193 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2194 }
2195 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2196 orig_resid = 0;
2197 if (psa) {
2198 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2199 mp0 == 0);
2200 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2201 error = EWOULDBLOCK;
2202 goto release;
2203 }
2204 }
2205 if (flags & MSG_PEEK) {
2206 m = m->m_next;
2207 } else {
2208 sbfree(&so->so_rcv, m);
2209 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2210 panic("soreceive: about to create invalid "
2211 "socketbuf");
2212 MFREE(m, so->so_rcv.sb_mb);
2213 m = so->so_rcv.sb_mb;
2214 if (m != NULL) {
2215 m->m_nextpkt = nextrecord;
2216 } else {
2217 so->so_rcv.sb_mb = nextrecord;
2218 SB_EMPTY_FIXUP(&so->so_rcv);
2219 }
2220 }
2221 }
2222
2223 /*
2224 * Process one or more MT_CONTROL mbufs present before any data mbufs
2225 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2226 * just copy the data; if !MSG_PEEK, we call into the protocol to
2227 * perform externalization.
2228 */
2229 if (m != NULL && m->m_type == MT_CONTROL) {
2230 struct mbuf *cm = NULL, *cmn;
2231 struct mbuf **cme = &cm;
2232 struct sockbuf *sb_rcv = &so->so_rcv;
2233 struct mbuf **msgpcm = NULL;
2234
2235 /*
2236 * Externalizing the control messages would require us to
2237 * drop the socket's lock below. Once we re-acquire the
2238 * lock, the mbuf chain might change. In order to preserve
2239 * consistency, we unlink all control messages from the
2240 * first mbuf chain in one shot and link them separately
2241 * onto a different chain.
2242 */
2243 do {
2244 if (flags & MSG_PEEK) {
2245 if (controlp != NULL) {
2246 if (*controlp == NULL) {
2247 msgpcm = controlp;
2248 }
2249 *controlp = m_copy(m, 0, m->m_len);
2250
2251 /* If we failed to allocate an mbuf,
2252 * release any previously allocated
2253 * mbufs for control data. Return
2254 * an error. Keep the mbufs in the
2255 * socket as this is using
2256 * the MSG_PEEK flag.
2257 */
2258 if (*controlp == NULL) {
2259 m_freem(*msgpcm);
2260 error = ENOBUFS;
2261 goto release;
2262 }
2263 controlp = &(*controlp)->m_next;
2264 }
2265 m = m->m_next;
2266 } else {
2267 m->m_nextpkt = NULL;
2268 sbfree(sb_rcv, m);
2269 sb_rcv->sb_mb = m->m_next;
2270 m->m_next = NULL;
2271 *cme = m;
2272 cme = &(*cme)->m_next;
2273 m = sb_rcv->sb_mb;
2274 }
2275 } while (m != NULL && m->m_type == MT_CONTROL);
2276
2277 if (!(flags & MSG_PEEK)) {
2278 if (sb_rcv->sb_mb != NULL) {
2279 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2280 } else {
2281 sb_rcv->sb_mb = nextrecord;
2282 SB_EMPTY_FIXUP(sb_rcv);
2283 }
2284 if (nextrecord == NULL)
2285 sb_rcv->sb_lastrecord = m;
2286 }
2287
2288 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2289 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2290
2291 while (cm != NULL) {
2292 int cmsg_type;
2293
2294 cmn = cm->m_next;
2295 cm->m_next = NULL;
2296 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2297
2298 /*
2299 * Call the protocol to externalize SCM_RIGHTS message
2300 * and return the modified message to the caller upon
2301 * success. Otherwise, all other control messages are
2302 * returned unmodified to the caller. Note that we
2303 * only get into this loop if MSG_PEEK is not set.
2304 */
2305 if (pr->pr_domain->dom_externalize != NULL &&
2306 cmsg_type == SCM_RIGHTS) {
2307 /*
2308 * Release socket lock: see 3903171. This
2309 * would also allow more records to be appended
2310 * to the socket buffer. We still have SB_LOCK
2311 * set on it, so we can be sure that the head
2312 * of the mbuf chain won't change.
2313 */
2314 socket_unlock(so, 0);
2315 error = (*pr->pr_domain->dom_externalize)(cm);
2316 socket_lock(so, 0);
2317 } else {
2318 error = 0;
2319 }
2320
2321 if (controlp != NULL && error == 0) {
2322 *controlp = cm;
2323 controlp = &(*controlp)->m_next;
2324 orig_resid = 0;
2325 } else {
2326 (void) m_free(cm);
2327 }
2328 cm = cmn;
2329 }
2330 orig_resid = 0;
2331 if (sb_rcv->sb_mb != NULL)
2332 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2333 else
2334 nextrecord = NULL;
2335 }
2336
2337 if (m != NULL) {
2338 if (!(flags & MSG_PEEK)) {
2339 /*
2340 * We get here because m points to an mbuf following
2341 * any MT_SONAME or MT_CONTROL mbufs which have been
2342 * processed above. In any case, m should be pointing
2343 * to the head of the mbuf chain, and the nextrecord
2344 * should be either NULL or equal to m->m_nextpkt.
2345 * See comments above about SB_LOCK.
2346 */
2347 if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2348 panic("soreceive: post-control !sync so=%p "
2349 "m=%p nextrecord=%p\n", so, m, nextrecord);
2350
2351 if (nextrecord == NULL)
2352 so->so_rcv.sb_lastrecord = m;
2353 }
2354 type = m->m_type;
2355 if (type == MT_OOBDATA)
2356 flags |= MSG_OOB;
2357 } else {
2358 if (!(flags & MSG_PEEK)) {
2359 so->so_rcv.sb_mb = nextrecord;
2360 SB_EMPTY_FIXUP(&so->so_rcv);
2361 }
2362 }
2363 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2364 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2365
2366 moff = 0;
2367 offset = 0;
2368
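/*
 * Decide up front whether consumed mbufs may be batched onto free_list
 * and copied out later by sodelayed_copy() with the socket unlocked;
 * this is only done when actually consuming data (not MSG_PEEK) and
 * when there is enough of it to make dropping and retaking the lock
 * worthwhile.
 */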
2369 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2370 can_delay = 1;
2371 else
2372 can_delay = 0;
2373
2374 need_event = 0;
2375
2376 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2377 if (m->m_type == MT_OOBDATA) {
2378 if (type != MT_OOBDATA)
2379 break;
2380 } else if (type == MT_OOBDATA) {
2381 break;
2382 }
2383 /*
2384 * Make sure to always set MSG_OOB when getting
2385 * out-of-band data inline.
2386 */
2387 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2388 (so->so_options & SO_OOBINLINE) != 0 &&
2389 (so->so_state & SS_RCVATMARK) != 0) {
2390 flags |= MSG_OOB;
2391 }
2392 so->so_state &= ~SS_RCVATMARK;
2393 len = uio_resid(uio) - delayed_copy_len;
2394 if (so->so_oobmark && len > so->so_oobmark - offset)
2395 len = so->so_oobmark - offset;
2396 if (len > m->m_len - moff)
2397 len = m->m_len - moff;
2398 /*
2399 * If mp is set, just pass back the mbufs.
2400 * Otherwise copy them out via the uio, then free.
2401 * Sockbuf must be consistent here (sb_mb points to the current
2402 * record and its m_nextpkt to the next) when we drop priority;
2403 * we must note any additions to the sockbuf when we
2404 * block interrupts again.
2405 */
2406 if (mp == 0) {
2407 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2408 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2409 if (can_delay && len == m->m_len) {
2410 /*
2411 * only delay the copy if we're consuming the
2412 * mbuf and we're NOT in MSG_PEEK mode
2413 * and we have enough data to make it worthwhile
2414 * to drop and retake the lock... can_delay
2415 * reflects the state of the two latter
2416 * constraints; moff should always be zero
2417 * in these cases
2418 */
2419 delayed_copy_len += len;
2420 } else {
2421 if (delayed_copy_len) {
2422 error = sodelayed_copy(so, uio,
2423 &free_list, &delayed_copy_len);
2424
2425 if (error) {
2426 goto release;
2427 }
2428 /*
2429 * can only get here if MSG_PEEK is not
2430 * set; therefore, m should point at the
2431 * head of the rcv queue; if it doesn't,
2432 * it means something drastically
2433 * changed while we were out from behind
2434 * the lock in sodelayed_copy. perhaps
2435 * a RST on the stream. in any event,
2436 * the stream has been interrupted. it's
2437 * probably best just to return whatever
2438 * data we've moved and let the caller
2439 * sort it out...
2440 */
2441 if (m != so->so_rcv.sb_mb) {
2442 break;
2443 }
2444 }
2445 socket_unlock(so, 0);
2446 error = uiomove(mtod(m, caddr_t) + moff,
2447 (int)len, uio);
2448 socket_lock(so, 0);
2449
2450 if (error)
2451 goto release;
2452 }
2453 } else {
2454 uio_setresid(uio, (uio_resid(uio) - len));
2455 }
2456 if (len == m->m_len - moff) {
2457 if (m->m_flags & M_EOR)
2458 flags |= MSG_EOR;
2459 if (flags & MSG_PEEK) {
2460 m = m->m_next;
2461 moff = 0;
2462 } else {
2463 nextrecord = m->m_nextpkt;
2464 sbfree(&so->so_rcv, m);
2465 m->m_nextpkt = NULL;
2466
2467 if (mp) {
2468 *mp = m;
2469 mp = &m->m_next;
2470 so->so_rcv.sb_mb = m = m->m_next;
2471 *mp = (struct mbuf *)0;
2472 } else {
2473 if (free_list == NULL)
2474 free_list = m;
2475 else
2476 ml->m_next = m;
2477 ml = m;
2478 so->so_rcv.sb_mb = m = m->m_next;
2479 ml->m_next = 0;
2480 }
2481 if (m != NULL) {
2482 m->m_nextpkt = nextrecord;
2483 if (nextrecord == NULL)
2484 so->so_rcv.sb_lastrecord = m;
2485 } else {
2486 so->so_rcv.sb_mb = nextrecord;
2487 SB_EMPTY_FIXUP(&so->so_rcv);
2488 }
2489 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2490 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2491 }
2492 } else {
2493 if (flags & MSG_PEEK) {
2494 moff += len;
2495 } else {
2496 if (mp != NULL) {
2497 int copy_flag;
2498
2499 if (flags & MSG_DONTWAIT)
2500 copy_flag = M_DONTWAIT;
2501 else
2502 copy_flag = M_WAIT;
2503 *mp = m_copym(m, 0, len, copy_flag);
2504 if (*mp == NULL) {
2505 /*
2506 * Failed to allocate an mbuf.
2507 * Adjust uio_resid back, it was
2508 * adjusted down by len bytes which
2509 * we didn't copy over
2510 */
2511 uio_setresid(uio, (uio_resid(uio) + len));
2512 break;
2513 }
2514 }
2515 m->m_data += len;
2516 m->m_len -= len;
2517 so->so_rcv.sb_cc -= len;
2518 }
2519 }
2520 if (so->so_oobmark) {
2521 if ((flags & MSG_PEEK) == 0) {
2522 so->so_oobmark -= len;
2523 if (so->so_oobmark == 0) {
2524 so->so_state |= SS_RCVATMARK;
2525 /*
2526 * delay posting the actual event until
2527 * after any delayed copy processing
2528 * has finished
2529 */
2530 need_event = 1;
2531 break;
2532 }
2533 } else {
2534 offset += len;
2535 if (offset == so->so_oobmark)
2536 break;
2537 }
2538 }
2539 if (flags & MSG_EOR)
2540 break;
2541 /*
2542 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2543 * (for non-atomic socket), we must not quit until
2544 * "uio->uio_resid == 0" or an error termination.
2545 * If a signal/timeout occurs, return with a short
2546 * count but without error. Keep sockbuf locked
2547 * against other readers.
2548 */
2549 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2550 (uio_resid(uio) - delayed_copy_len) > 0 &&
2551 !sosendallatonce(so) && !nextrecord) {
2552 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2553 goto release;
2554
2555 /*
2556 * Depending on the protocol (e.g. TCP), the following
2557 * might cause the socket lock to be dropped and later
2558 * be reacquired, and more data could have arrived and
2559 * have been appended to the receive socket buffer by
2560 * the time it returns. Therefore, we only sleep in
2561 * sbwait() below when the socket buffer is
2562 * empty, in order to avoid a false sleep.
2563 */
2564 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2565 (((struct inpcb *)so->so_pcb)->inp_state !=
2566 INPCB_STATE_DEAD))
2567 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2568
2569 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2570 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2571
2572 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2573 error = 0;
2574 goto release;
2575 }
2576 /*
2577 * have to wait until after we get back from the sbwait
2578 * to do the copy because we will drop the lock if we
2579 * have enough data that has been delayed... by dropping
2580 * the lock we open up a window allowing the netisr
2581 * thread to process the incoming packets and to change
2582 * the state of this socket... we're issuing the sbwait
2583 * because the socket is empty and we're expecting the
2584 * netisr thread to wake us up when more packets arrive;
2585 * if we allow that processing to happen and then sbwait
2586 * we could stall forever with packets sitting in the
2587 * socket if no further packets arrive from the remote
2588 * side.
2589 *
2590 * we want to copy before we've collected all the data
2591 * to satisfy this request to allow the copy to overlap
2592 * the incoming packet processing on an MP system
2593 */
2594 if (delayed_copy_len > sorecvmincopy &&
2595 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2596 error = sodelayed_copy(so, uio,
2597 &free_list, &delayed_copy_len);
2598
2599 if (error)
2600 goto release;
2601 }
2602 m = so->so_rcv.sb_mb;
2603 if (m) {
2604 nextrecord = m->m_nextpkt;
2605 }
2606 }
2607 }
2608 #ifdef MORE_LOCKING_DEBUG
2609 if (so->so_usecount <= 1)
2610 panic("soreceive: after big while so=%p ref=%d on socket\n",
2611 so, so->so_usecount);
2612 #endif
2613
2614 if (m && pr->pr_flags & PR_ATOMIC) {
2615 #ifdef __APPLE__
2616 if (so->so_options & SO_DONTTRUNC) {
2617 flags |= MSG_RCVMORE;
2618 } else {
2619 #endif
2620 flags |= MSG_TRUNC;
2621 if ((flags & MSG_PEEK) == 0)
2622 (void) sbdroprecord(&so->so_rcv);
2623 #ifdef __APPLE__
2624 }
2625 #endif
2626 }
2627
2628 /*
2629 * pru_rcvd below (for TCP) may cause more data to be received
2630 * if the socket lock is dropped prior to sending the ACK; some
2631 * legacy OpenTransport applications don't handle this well
2632 * (if it receives less data than requested while MSG_HAVEMORE
2633 * is set), and so we set the flag now based on what we know
2634 * prior to calling pru_rcvd.
2635 */
2636 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2637 flags |= MSG_HAVEMORE;
2638
2639 if ((flags & MSG_PEEK) == 0) {
2640 if (m == 0) {
2641 so->so_rcv.sb_mb = nextrecord;
2642 /*
2643 * First part is an inline SB_EMPTY_FIXUP(). Second
2644 * part makes sure sb_lastrecord is up-to-date if
2645 * there is still data in the socket buffer.
2646 */
2647 if (so->so_rcv.sb_mb == NULL) {
2648 so->so_rcv.sb_mbtail = NULL;
2649 so->so_rcv.sb_lastrecord = NULL;
2650 } else if (nextrecord->m_nextpkt == NULL) {
2651 so->so_rcv.sb_lastrecord = nextrecord;
2652 }
2653 }
2654 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2655 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2656 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2657 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2658 }
2659 #ifdef __APPLE__
2660 if (delayed_copy_len) {
2661 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2662
2663 if (error)
2664 goto release;
2665 }
2666 if (free_list) {
2667 m_freem_list((struct mbuf *)free_list);
2668 free_list = (struct mbuf *)0;
2669 }
2670 if (need_event)
2671 postevent(so, 0, EV_OOB);
2672 #endif
2673 if (orig_resid == uio_resid(uio) && orig_resid &&
2674 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2675 sbunlock(&so->so_rcv, 1);
2676 goto restart;
2677 }
2678
2679 if (flagsp)
2680 *flagsp |= flags;
2681 release:
2682 #ifdef MORE_LOCKING_DEBUG
2683 if (so->so_usecount <= 1)
2684 panic("soreceive: release so=%p ref=%d on socket\n",
2685 so, so->so_usecount);
2686 #endif
2687 if (delayed_copy_len) {
2688 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2689 }
2690 if (free_list) {
2691 m_freem_list((struct mbuf *)free_list);
2692 }
2693 sbunlock(&so->so_rcv, 0); /* will unlock socket */
2694
2695 // LP64todo - fix this!
2696 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2697 so->so_rcv.sb_cc, 0, error);
2698
2699 return (error);
2700 }
2701
2702 /*
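 * sodelayed_copy
 *
 * Copy the data in the accumulated <free_list> mbuf chain out to the
 * caller's uio with the socket unlocked, then free the chain and reset
 * the delayed-copy byte count.
 *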
2703 * Returns: 0 Success
2704 * uiomove:EFAULT
2705 */
2706 static int
2707 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2708 int *resid)
2709 {
2710 int error = 0;
2711 struct mbuf *m;
2712
2713 m = *free_list;
2714
2715 socket_unlock(so, 0);
2716
2717 while (m && error == 0) {
2718
2719 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2720
2721 m = m->m_next;
2722 }
2723 m_freem_list(*free_list);
2724
2725 *free_list = (struct mbuf *)NULL;
2726 *resid = 0;
2727
2728 socket_lock(so, 0);
2729
2730 return (error);
2731 }
2732
2733
2734 /*
2735 * Returns: 0 Success
2736 * EINVAL
2737 * ENOTCONN
2738 * <pru_shutdown>:EINVAL
2739 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2740 * <pru_shutdown>:ENOBUFS[TCP]
2741 * <pru_shutdown>:EMSGSIZE[TCP]
2742 * <pru_shutdown>:EHOSTUNREACH[TCP]
2743 * <pru_shutdown>:ENETUNREACH[TCP]
2744 * <pru_shutdown>:ENETDOWN[TCP]
2745 * <pru_shutdown>:ENOMEM[TCP]
2746 * <pru_shutdown>:EACCES[TCP]
2747 * <pru_shutdown>:EMSGSIZE[TCP]
2748 * <pru_shutdown>:ENOBUFS[TCP]
2749 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2750 * <pru_shutdown>:??? [other protocol families]
2751 */
2752 int
2753 soshutdown(struct socket *so, int how)
2754 {
2755 int error;
2756
2757 switch (how) {
2758 case SHUT_RD:
2759 case SHUT_WR:
2760 case SHUT_RDWR:
2761 socket_lock(so, 1);
2762 if ((so->so_state &
2763 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2764 error = ENOTCONN;
2765 } else {
2766 error = soshutdownlock(so, how);
2767 }
2768 socket_unlock(so, 1);
2769 break;
2770 default:
2771 error = EINVAL;
2772 break;
2773 }
2774
2775 return (error);
2776 }
2777
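/*
 * Like soshutdown(), but assumes the socket is already locked by the
 * caller.  Notifies any attached socket filters, then shuts down the
 * read side (flushing pending data) and/or the write side (via the
 * protocol's pru_shutdown) as requested by <how>.
 */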
2778 int
2779 soshutdownlock(struct socket *so, int how)
2780 {
2781 struct protosw *pr = so->so_proto;
2782 int error = 0;
2783
2784 sflt_notify(so, sock_evt_shutdown, &how);
2785
2786 if (how != SHUT_WR) {
2787 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2788 /* read already shut down */
2789 error = ENOTCONN;
2790 goto done;
2791 }
2792 sorflush(so);
2793 postevent(so, 0, EV_RCLOSED);
2794 }
2795 if (how != SHUT_RD) {
2796 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2797 /* write already shut down */
2798 error = ENOTCONN;
2799 goto done;
2800 }
2801 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2802 postevent(so, 0, EV_WCLOSED);
2803 }
2804 done:
2805 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2806 return (error);
2807 }
2808
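/*
 * Discard anything queued on the receive side of the socket: mark it
 * unable to receive more data, reset the receive sockbuf (preserving the
 * knote list and the SB_DROP/SB_UNIX flags), let the domain dispose of
 * any in-flight rights, and release the old mbufs.
 */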
2809 void
2810 sorflush(struct socket *so)
2811 {
2812 register struct sockbuf *sb = &so->so_rcv;
2813 register struct protosw *pr = so->so_proto;
2814 struct sockbuf asb;
2815
2816 #ifdef MORE_LOCKING_DEBUG
2817 lck_mtx_t *mutex_held;
2818
2819 if (so->so_proto->pr_getlock != NULL)
2820 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2821 else
2822 mutex_held = so->so_proto->pr_domain->dom_mtx;
2823 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2824 #endif
2825
2826 sflt_notify(so, sock_evt_flush_read, NULL);
2827
2828 sb->sb_flags |= SB_NOINTR;
2829 (void) sblock(sb, M_WAIT);
2830 socantrcvmore(so);
2831 sbunlock(sb, 1);
2832 #ifdef __APPLE__
2833 selthreadclear(&sb->sb_sel);
2834 #endif
2835 asb = *sb;
2836 bzero((caddr_t)sb, sizeof (*sb));
2837 sb->sb_so = so; /* reestablish link to socket */
2838 if (asb.sb_flags & SB_KNOTE) {
2839 sb->sb_sel.si_note = asb.sb_sel.si_note;
2840 sb->sb_flags = SB_KNOTE;
2841 }
2842 if (asb.sb_flags & SB_DROP)
2843 sb->sb_flags |= SB_DROP;
2844 if (asb.sb_flags & SB_UNIX)
2845 sb->sb_flags |= SB_UNIX;
2846 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2847 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2848 }
2849 sbrelease(&asb);
2850 }
2851
2852 /*
2853 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2854 * an additional variant to handle the case where the option value needs
2855 * to be some kind of integer, but not a specific size.
2856 * In addition to their use here, these functions are also called by the
2857 * protocol-level pr_ctloutput() routines.
2858 *
2859 * Returns: 0 Success
2860 * EINVAL
2861 * copyin:EFAULT
2862 */
2863 int
2864 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2865 {
2866 size_t valsize;
2867
2868 /*
2869 * If the user gives us more than we wanted, we ignore it,
2870 * but if we don't get the minimum length the caller
2871 * wants, we return EINVAL. On success, sopt->sopt_valsize
2872 * is set to however much we actually retrieved.
2873 */
2874 if ((valsize = sopt->sopt_valsize) < minlen)
2875 return (EINVAL);
2876 if (valsize > len)
2877 sopt->sopt_valsize = valsize = len;
2878
2879 if (sopt->sopt_p != kernproc)
2880 return (copyin(sopt->sopt_val, buf, valsize));
2881
2882 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2883 return (0);
2884 }
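
/*
 * Illustrative use (a minimal sketch, not code from this file): a
 * protocol's pr_ctloutput() handler for an integer-valued option would
 * typically do
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *	if (error == 0)
 *		... act on optval ...
 *
 * which is the same pattern sosetopt() uses below for SOL_SOCKET options.
 */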
2885
2886 /*
2887 * sooptcopyin_timeval
2888 * Copy in a timeval value into tv_p, and take into account whether the
2889 * calling process is 64-bit or 32-bit. Moved the sanity checking
2890 * code here so that we can verify the 64-bit tv_sec value before we lose
2891 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
2892 */
2893 static int
2894 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2895 {
2896 int error;
2897
2898 if (proc_is64bit(sopt->sopt_p)) {
2899 struct user64_timeval tv64;
2900
2901 if (sopt->sopt_valsize < sizeof(tv64)) {
2902 return (EINVAL);
2903 }
2904 sopt->sopt_valsize = sizeof(tv64);
2905 if (sopt->sopt_p != kernproc) {
2906 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2907 if (error != 0)
2908 return (error);
2909 } else {
2910 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
2911 sizeof(tv64));
2912 }
2913 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2914 || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2915 return (EDOM);
2916 }
2917 tv_p->tv_sec = tv64.tv_sec;
2918 tv_p->tv_usec = tv64.tv_usec;
2919 } else {
2920 struct user32_timeval tv32;
2921
2922 if (sopt->sopt_valsize < sizeof(tv32)) {
2923 return (EINVAL);
2924 }
2925 sopt->sopt_valsize = sizeof(tv32);
2926 if (sopt->sopt_p != kernproc) {
2927 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2928 if (error != 0) {
2929 return (error);
2930 }
2931 } else {
2932 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
2933 sizeof(tv32));
2934 }
2935 #ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
2936 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
2937 || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
2938 return (EDOM);
2939 }
2940 #endif
2941 tv_p->tv_sec = tv32.tv_sec;
2942 tv_p->tv_usec = tv32.tv_usec;
2943 }
2944 return (0);
2945 }
2946
2947 /*
2948 * Returns: 0 Success
2949 * EINVAL
2950 * ENOPROTOOPT
2951 * ENOBUFS
2952 * EDOM
2953 * sooptcopyin:EINVAL
2954 * sooptcopyin:EFAULT
2955 * sooptcopyin_timeval:EINVAL
2956 * sooptcopyin_timeval:EFAULT
2957 * sooptcopyin_timeval:EDOM
2958 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2959 * <pr_ctloutput>:???
2960 * sflt_attach_private:??? [whatever a filter author chooses]
2961 * <sf_setoption>:??? [whatever a filter author chooses]
2962 *
2963 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
2964 * <sf_setoption> returns depend on what the filter author causes
2965 * their filter to return.
2966 */
2967 int
2968 sosetopt(struct socket *so, struct sockopt *sopt)
2969 {
2970 int error, optval;
2971 struct linger l;
2972 struct timeval tv;
2973 #if CONFIG_MACF_SOCKET
2974 struct mac extmac;
2975 #endif /* MAC_SOCKET */
2976
2977 socket_lock(so, 1);
2978 so_update_last_owner_locked(so, NULL);
2979
2980 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2981 == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
2982 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2983 /* the socket has been shutdown, no more sockopt's */
2984 error = EINVAL;
2985 goto bad;
2986 }
2987
2988 if (sopt->sopt_dir != SOPT_SET) {
2989 sopt->sopt_dir = SOPT_SET;
2990 }
2991
2992 error = sflt_setsockopt(so, sopt);
2993 if (error) {
2994 if (error == EJUSTRETURN)
2995 error = 0;
2996 goto bad;
2997 }
2998
2999 error = 0;
3000 if (sopt->sopt_level != SOL_SOCKET) {
3001 if (so->so_proto && so->so_proto->pr_ctloutput) {
3002 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3003 socket_unlock(so, 1);
3004 return (error);
3005 }
3006 error = ENOPROTOOPT;
3007 } else {
3008 switch (sopt->sopt_name) {
3009 case SO_LINGER:
3010 case SO_LINGER_SEC:
3011 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3012 if (error)
3013 goto bad;
3014
3015 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3016 l.l_linger : l.l_linger * hz;
3017 if (l.l_onoff)
3018 so->so_options |= SO_LINGER;
3019 else
3020 so->so_options &= ~SO_LINGER;
3021 break;
3022
3023 case SO_DEBUG:
3024 case SO_KEEPALIVE:
3025 case SO_DONTROUTE:
3026 case SO_USELOOPBACK:
3027 case SO_BROADCAST:
3028 case SO_REUSEADDR:
3029 case SO_REUSEPORT:
3030 case SO_OOBINLINE:
3031 case SO_TIMESTAMP:
3032 case SO_TIMESTAMP_MONOTONIC:
3033 #ifdef __APPLE__
3034 case SO_DONTTRUNC:
3035 case SO_WANTMORE:
3036 case SO_WANTOOBFLAG:
3037 #endif
3038 error = sooptcopyin(sopt, &optval, sizeof (optval),
3039 sizeof (optval));
3040 if (error)
3041 goto bad;
3042 if (optval)
3043 so->so_options |= sopt->sopt_name;
3044 else
3045 so->so_options &= ~sopt->sopt_name;
3046 break;
3047
3048 case SO_SNDBUF:
3049 case SO_RCVBUF:
3050 case SO_SNDLOWAT:
3051 case SO_RCVLOWAT:
3052 error = sooptcopyin(sopt, &optval, sizeof (optval),
3053 sizeof (optval));
3054 if (error)
3055 goto bad;
3056
3057 /*
3058 * Values < 1 make no sense for any of these
3059 * options, so disallow them.
3060 */
3061 if (optval < 1) {
3062 error = EINVAL;
3063 goto bad;
3064 }
3065
3066 switch (sopt->sopt_name) {
3067 case SO_SNDBUF:
3068 case SO_RCVBUF:
3069 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
3070 &so->so_snd : &so->so_rcv,
3071 (u_int32_t) optval) == 0) {
3072 error = ENOBUFS;
3073 goto bad;
3074 }
3075 if (sopt->sopt_name == SO_SNDBUF)
3076 so->so_snd.sb_flags |= SB_USRSIZE;
3077 else
3078 so->so_rcv.sb_flags |= SB_USRSIZE;
3079 break;
3080
3081 /*
3082 * Make sure the low-water is never greater than
3083 * the high-water.
3084 */
3085 case SO_SNDLOWAT:
3086 so->so_snd.sb_lowat =
3087 (optval > so->so_snd.sb_hiwat) ?
3088 so->so_snd.sb_hiwat : optval;
3089 break;
3090 case SO_RCVLOWAT:
3091 so->so_rcv.sb_lowat =
3092 (optval > so->so_rcv.sb_hiwat) ?
3093 so->so_rcv.sb_hiwat : optval;
3094 break;
3095 }
3096 break;
3097
3098 case SO_SNDTIMEO:
3099 case SO_RCVTIMEO:
3100 error = sooptcopyin_timeval(sopt, &tv);
3101 if (error)
3102 goto bad;
3103
3104 switch (sopt->sopt_name) {
3105 case SO_SNDTIMEO:
3106 so->so_snd.sb_timeo = tv;
3107 break;
3108 case SO_RCVTIMEO:
3109 so->so_rcv.sb_timeo = tv;
3110 break;
3111 }
3112 break;
3113
3114 case SO_NKE:
3115 {
3116 struct so_nke nke;
3117
3118 error = sooptcopyin(sopt, &nke, sizeof (nke),
3119 sizeof (nke));
3120 if (error)
3121 goto bad;
3122
3123 error = sflt_attach_internal(so, nke.nke_handle);
3124 break;
3125 }
3126
3127 case SO_NOSIGPIPE:
3128 error = sooptcopyin(sopt, &optval, sizeof (optval),
3129 sizeof (optval));
3130 if (error)
3131 goto bad;
3132 if (optval)
3133 so->so_flags |= SOF_NOSIGPIPE;
3134 else
3135 so->so_flags &= ~SOF_NOSIGPIPE;
3136
3137 break;
3138
3139 case SO_NOADDRERR:
3140 error = sooptcopyin(sopt, &optval, sizeof (optval),
3141 sizeof (optval));
3142 if (error)
3143 goto bad;
3144 if (optval)
3145 so->so_flags |= SOF_NOADDRAVAIL;
3146 else
3147 so->so_flags &= ~SOF_NOADDRAVAIL;
3148
3149 break;
3150
3151 case SO_REUSESHAREUID:
3152 error = sooptcopyin(sopt, &optval, sizeof (optval),
3153 sizeof (optval));
3154 if (error)
3155 goto bad;
3156 if (optval)
3157 so->so_flags |= SOF_REUSESHAREUID;
3158 else
3159 so->so_flags &= ~SOF_REUSESHAREUID;
3160 break;
3161 #ifdef __APPLE_API_PRIVATE
3162 case SO_NOTIFYCONFLICT:
3163 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3164 error = EPERM;
3165 goto bad;
3166 }
3167 error = sooptcopyin(sopt, &optval, sizeof (optval),
3168 sizeof (optval));
3169 if (error)
3170 goto bad;
3171 if (optval)
3172 so->so_flags |= SOF_NOTIFYCONFLICT;
3173 else
3174 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3175 break;
3176 #endif
3177 case SO_RESTRICTIONS:
3178 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3179 error = EPERM;
3180 goto bad;
3181 }
3182 error = sooptcopyin(sopt, &optval, sizeof (optval),
3183 sizeof (optval));
3184 if (error)
3185 goto bad;
3186 so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3187 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3188 break;
3189
3190 case SO_LABEL:
3191 #if CONFIG_MACF_SOCKET
3192 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3193 sizeof (extmac))) != 0)
3194 goto bad;
3195
3196 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3197 so, &extmac);
3198 #else
3199 error = EOPNOTSUPP;
3200 #endif /* MAC_SOCKET */
3201 break;
3202
3203 #ifdef __APPLE_API_PRIVATE
3204 case SO_UPCALLCLOSEWAIT:
3205 error = sooptcopyin(sopt, &optval, sizeof (optval),
3206 sizeof (optval));
3207 if (error)
3208 goto bad;
3209 if (optval)
3210 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3211 else
3212 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3213 break;
3214 #endif
3215
3216 case SO_RANDOMPORT:
3217 error = sooptcopyin(sopt, &optval, sizeof (optval),
3218 sizeof (optval));
3219 if (error)
3220 goto bad;
3221 if (optval)
3222 so->so_flags |= SOF_BINDRANDOMPORT;
3223 else
3224 so->so_flags &= ~SOF_BINDRANDOMPORT;
3225 break;
3226
3227 case SO_NP_EXTENSIONS: {
3228 struct so_np_extensions sonpx;
3229
3230 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
3231 if (error)
3232 goto bad;
3233 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3234 error = EINVAL;
3235 goto bad;
3236 }
3237 /*
3238 * Only one bit defined for now
3239 */
3240 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3241 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3242 so->so_flags |= SOF_NPX_SETOPTSHUT;
3243 else
3244 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3245 }
3246 break;
3247 }
3248
3249 case SO_TRAFFIC_CLASS: {
3250 error = sooptcopyin(sopt, &optval, sizeof (optval),
3251 sizeof (optval));
3252 if (error)
3253 goto bad;
3254 error = so_set_traffic_class(so, optval);
3255 if (error)
3256 goto bad;
3257 break;
3258 }
3259
3260 case SO_RECV_TRAFFIC_CLASS: {
3261 error = sooptcopyin(sopt, &optval, sizeof (optval),
3262 sizeof (optval));
3263 if (error)
3264 goto bad;
3265 if (optval == 0)
3266 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3267 else
3268 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3269 break;
3270 }
3271
3272 case SO_TRAFFIC_CLASS_DBG: {
3273 struct so_tcdbg so_tcdbg;
3274
3275 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
3276 sizeof (struct so_tcdbg));
3277 if (error)
3278 goto bad;
3279 error = so_set_tcdbg(so, &so_tcdbg);
3280 if (error)
3281 goto bad;
3282 break;
3283 }
3284
3285 case SO_DEFUNCTOK:
3286 error = sooptcopyin(sopt, &optval, sizeof (optval),
3287 sizeof (optval));
3288 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3289 if (error == 0)
3290 error = EBADF;
3291 goto bad;
3292 }
3293 /*
3294 * Any process can set SO_DEFUNCTOK (clear
3295 * SOF_NODEFUNCT), but only root can clear
3296 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3297 */
3298 if (optval == 0 &&
3299 kauth_cred_issuser(kauth_cred_get()) == 0) {
3300 error = EPERM;
3301 goto bad;
3302 }
3303 if (optval)
3304 so->so_flags &= ~SOF_NODEFUNCT;
3305 else
3306 so->so_flags |= SOF_NODEFUNCT;
3307
3308 SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as "
3309 "%seligible for defunct\n", __func__,
3310 proc_selfpid(), so, INP_SOCKAF(so),
3311 INP_SOCKTYPE(so),
3312 (so->so_flags & SOF_NODEFUNCT) ? "not " : ""));
3313 break;
3314
3315 case SO_ISDEFUNCT:
3316 /* This option is not settable */
3317 error = EINVAL;
3318 break;
3319
3320 default:
3321 error = ENOPROTOOPT;
3322 break;
3323 }
3324 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3325 (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3326 }
3327 }
3328 bad:
3329 socket_unlock(so, 1);
3330 return (error);
3331 }
3332
3333 /* Helper routines for getsockopt */
3334 int
3335 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3336 {
3337 int error;
3338 size_t valsize;
3339
3340 error = 0;
3341
3342 /*
3343 * Documented get behavior is that we always return a value,
3344 * possibly truncated to fit in the user's buffer.
3345 * Traditional behavior is that we always tell the user
3346 * precisely how much we copied, rather than something useful
3347 * like the total amount we had available for her.
3348 * Note that this interface is not idempotent; the entire answer must
3349 * be generated ahead of time.
3350 */
3351 valsize = min(len, sopt->sopt_valsize);
3352 sopt->sopt_valsize = valsize;
3353 if (sopt->sopt_val != USER_ADDR_NULL) {
3354 if (sopt->sopt_p != kernproc)
3355 error = copyout(buf, sopt->sopt_val, valsize);
3356 else
3357 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3358 }
3359 return (error);
3360 }
3361
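/*
 * sooptcopyout_timeval
 * Copy a timeval out to the option buffer using the user32/user64 layout
 * that matches the bitness of the calling process.
 */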
3362 static int
3363 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3364 {
3365 int error;
3366 size_t len;
3367 struct user64_timeval tv64;
3368 struct user32_timeval tv32;
3369 const void * val;
3370 size_t valsize;
3371
3372 error = 0;
3373 if (proc_is64bit(sopt->sopt_p)) {
3374 len = sizeof(tv64);
3375 tv64.tv_sec = tv_p->tv_sec;
3376 tv64.tv_usec = tv_p->tv_usec;
3377 val = &tv64;
3378 } else {
3379 len = sizeof(tv32);
3380 tv32.tv_sec = tv_p->tv_sec;
3381 tv32.tv_usec = tv_p->tv_usec;
3382 val = &tv32;
3383 }
3384 valsize = min(len, sopt->sopt_valsize);
3385 sopt->sopt_valsize = valsize;
3386 if (sopt->sopt_val != USER_ADDR_NULL) {
3387 if (sopt->sopt_p != kernproc)
3388 error = copyout(val, sopt->sopt_val, valsize);
3389 else
3390 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3391 }
3392 return (error);
3393 }
3394
3395 /*
3396 * Returns: 0 Success
3397 * ENOPROTOOPT
3398 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3399 * <pr_ctloutput>:???
3400 * <sf_getoption>:???
3401 */
3402 int
3403 sogetopt(struct socket *so, struct sockopt *sopt)
3404 {
3405 int error, optval;
3406 struct linger l;
3407 struct timeval tv;
3408 #if CONFIG_MACF_SOCKET
3409 struct mac extmac;
3410 #endif /* MAC_SOCKET */
3411
3412 if (sopt->sopt_dir != SOPT_GET) {
3413 sopt->sopt_dir = SOPT_GET;
3414 }
3415
3416 socket_lock(so, 1);
3417 so_update_last_owner_locked(so, NULL);
3418
3419 error = sflt_getsockopt(so, sopt);
3420 if (error) {
3421 if (error == EJUSTRETURN)
3422 error = 0;
3423 socket_unlock(so, 1);
3424 return (error);
3425 }
3426
3427 error = 0;
3428 if (sopt->sopt_level != SOL_SOCKET) {
3429 if (so->so_proto && so->so_proto->pr_ctloutput) {
3430 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3431 socket_unlock(so, 1);
3432 return (error);
3433 } else {
3434 socket_unlock(so, 1);
3435 return (ENOPROTOOPT);
3436 }
3437 } else {
3438 switch (sopt->sopt_name) {
3439 case SO_LINGER:
3440 case SO_LINGER_SEC:
3441 l.l_onoff = so->so_options & SO_LINGER;
3442 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3443 so->so_linger : so->so_linger / hz;
3444 error = sooptcopyout(sopt, &l, sizeof (l));
3445 break;
3446
3447 case SO_USELOOPBACK:
3448 case SO_DONTROUTE:
3449 case SO_DEBUG:
3450 case SO_KEEPALIVE:
3451 case SO_REUSEADDR:
3452 case SO_REUSEPORT:
3453 case SO_BROADCAST:
3454 case SO_OOBINLINE:
3455 case SO_TIMESTAMP:
3456 case SO_TIMESTAMP_MONOTONIC:
3457 #ifdef __APPLE__
3458 case SO_DONTTRUNC:
3459 case SO_WANTMORE:
3460 case SO_WANTOOBFLAG:
3461 #endif
3462 optval = so->so_options & sopt->sopt_name;
3463 integer:
3464 error = sooptcopyout(sopt, &optval, sizeof (optval));
3465 break;
3466
3467 case SO_TYPE:
3468 optval = so->so_type;
3469 goto integer;
3470
3471 #ifdef __APPLE__
3472 case SO_NREAD:
3473 if (so->so_proto->pr_flags & PR_ATOMIC) {
3474 int pkt_total;
3475 struct mbuf *m1;
3476
3477 pkt_total = 0;
3478 m1 = so->so_rcv.sb_mb;
3479 while (m1) {
3480 if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3481 m1->m_type == MT_OOBDATA)
3482 pkt_total += m1->m_len;
3483 m1 = m1->m_next;
3484 }
3485 optval = pkt_total;
3486 } else {
3487 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3488 }
3489 goto integer;
3490
3491 case SO_NWRITE:
3492 optval = so->so_snd.sb_cc;
3493 goto integer;
3494 #endif
3495 case SO_ERROR:
3496 optval = so->so_error;
3497 so->so_error = 0;
3498 goto integer;
3499
3500 case SO_SNDBUF:
3501 optval = so->so_snd.sb_hiwat;
3502 goto integer;
3503
3504 case SO_RCVBUF:
3505 optval = so->so_rcv.sb_hiwat;
3506 goto integer;
3507
3508 case SO_SNDLOWAT:
3509 optval = so->so_snd.sb_lowat;
3510 goto integer;
3511
3512 case SO_RCVLOWAT:
3513 optval = so->so_rcv.sb_lowat;
3514 goto integer;
3515
3516 case SO_SNDTIMEO:
3517 case SO_RCVTIMEO:
3518 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3519 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3520
3521 error = sooptcopyout_timeval(sopt, &tv);
3522 break;
3523
3524 case SO_NOSIGPIPE:
3525 optval = (so->so_flags & SOF_NOSIGPIPE);
3526 goto integer;
3527
3528 case SO_NOADDRERR:
3529 optval = (so->so_flags & SOF_NOADDRAVAIL);
3530 goto integer;
3531
3532 case SO_REUSESHAREUID:
3533 optval = (so->so_flags & SOF_REUSESHAREUID);
3534 goto integer;
3535
3536 #ifdef __APPLE_API_PRIVATE
3537 case SO_NOTIFYCONFLICT:
3538 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3539 goto integer;
3540 #endif
3541 case SO_RESTRICTIONS:
3542 optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3543 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3544 goto integer;
3545
3546 case SO_LABEL:
3547 #if CONFIG_MACF_SOCKET
3548 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3549 sizeof (extmac))) != 0 ||
3550 (error = mac_socket_label_get(proc_ucred(
3551 sopt->sopt_p), so, &extmac)) != 0)
3552 break;
3553
3554 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3555 #else
3556 error = EOPNOTSUPP;
3557 #endif /* MAC_SOCKET */
3558 break;
3559
3560 case SO_PEERLABEL:
3561 #if CONFIG_MACF_SOCKET
3562 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3563 sizeof (extmac))) != 0 ||
3564 (error = mac_socketpeer_label_get(proc_ucred(
3565 sopt->sopt_p), so, &extmac)) != 0)
3566 break;
3567
3568 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3569 #else
3570 error = EOPNOTSUPP;
3571 #endif /* MAC_SOCKET */
3572 break;
3573
3574 #ifdef __APPLE_API_PRIVATE
3575 case SO_UPCALLCLOSEWAIT:
3576 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
3577 goto integer;
3578 #endif
3579 case SO_RANDOMPORT:
3580 optval = (so->so_flags & SOF_BINDRANDOMPORT);
3581 goto integer;
3582
3583 case SO_NP_EXTENSIONS: {
3584 struct so_np_extensions sonpx;
3585
3586 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0;
3587 sonpx.npx_mask = SONPX_MASK_VALID;
3588
3589 error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
3590 break;
3591 }
3592
3593 case SO_TRAFFIC_CLASS:
3594 optval = so->so_traffic_class;
3595 goto integer;
3596
3597 case SO_RECV_TRAFFIC_CLASS:
3598 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
3599 goto integer;
3600
3601 case SO_TRAFFIC_CLASS_STATS:
3602 error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats));
break;
3603
3604 case SO_TRAFFIC_CLASS_DBG:
3605 error = sogetopt_tcdbg(so, sopt);
3606 break;
3607
3608 case SO_DEFUNCTOK:
3609 optval = !(so->so_flags & SOF_NODEFUNCT);
3610 goto integer;
3611
3612 case SO_ISDEFUNCT:
3613 optval = (so->so_flags & SOF_DEFUNCT);
3614 goto integer;
3615
3616 default:
3617 error = ENOPROTOOPT;
3618 break;
3619 }
3620 socket_unlock(so, 1);
3621 return (error);
3622 }
3623 }
3624 /* The size limit on our soopt_getm() is different from that on FreeBSD.
3625 * We limit the size of options to MCLBYTES. This will have to change
3626 * if we need to define options that need more space than MCLBYTES.
3627 */
3628 int
3629 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3630 {
3631 struct mbuf *m, *m_prev;
3632 int sopt_size = sopt->sopt_valsize;
3633 int how;
3634
3635 if (sopt_size <= 0 || sopt_size > MCLBYTES)
3636 return (EMSGSIZE);
3637
3638 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
3639 MGET(m, how, MT_DATA);
3640 if (m == 0)
3641 return (ENOBUFS);
3642 if (sopt_size > MLEN) {
3643 MCLGET(m, how);
3644 if ((m->m_flags & M_EXT) == 0) {
3645 m_free(m);
3646 return (ENOBUFS);
3647 }
3648 m->m_len = min(MCLBYTES, sopt_size);
3649 } else {
3650 m->m_len = min(MLEN, sopt_size);
3651 }
3652 sopt_size -= m->m_len;
3653 *mp = m;
3654 m_prev = m;
3655
3656 while (sopt_size > 0) {
3657 MGET(m, how, MT_DATA);
3658 if (m == 0) {
3659 m_freem(*mp);
3660 return (ENOBUFS);
3661 }
3662 if (sopt_size > MLEN) {
3663 MCLGET(m, how);
3664 if ((m->m_flags & M_EXT) == 0) {
3665 m_freem(*mp);
3666 m_freem(m);
3667 return (ENOBUFS);
3668 }
3669 m->m_len = min(MCLBYTES, sopt_size);
3670 } else {
3671 m->m_len = min(MLEN, sopt_size);
3672 }
3673 sopt_size -= m->m_len;
3674 m_prev->m_next = m;
3675 m_prev = m;
3676 }
3677 return (0);
3678 }
3679
3680 /* copyin sopt data into mbuf chain */
3681 int
3682 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3683 {
3684 struct mbuf *m0 = m;
3685
3686 if (sopt->sopt_val == USER_ADDR_NULL)
3687 return (0);
3688 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3689 if (sopt->sopt_p != kernproc) {
3690 int error;
3691
3692 error = copyin(sopt->sopt_val, mtod(m, char *),
3693 m->m_len);
3694 if (error != 0) {
3695 m_freem(m0);
3696 return (error);
3697 }
3698 } else {
3699 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3700 mtod(m, char *), m->m_len);
3701 }
3702 sopt->sopt_valsize -= m->m_len;
3703 sopt->sopt_val += m->m_len;
3704 m = m->m_next;
3705 }
3706 if (m != NULL) /* enough space should have been allocated by ip6_sooptmcopyin() */
3707 panic("soopt_mcopyin");
3708 return (0);
3709 }
3710
3711 /* copyout mbuf chain data into soopt */
3712 int
3713 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3714 {
3715 struct mbuf *m0 = m;
3716 size_t valsize = 0;
3717
3718 if (sopt->sopt_val == USER_ADDR_NULL)
3719 return (0);
3720 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3721 if (sopt->sopt_p != kernproc) {
3722 int error;
3723
3724 error = copyout(mtod(m, char *), sopt->sopt_val,
3725 m->m_len);
3726 if (error != 0) {
3727 m_freem(m0);
3728 return (error);
3729 }
3730 } else {
3731 bcopy(mtod(m, char *),
3732 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3733 }
3734 sopt->sopt_valsize -= m->m_len;
3735 sopt->sopt_val += m->m_len;
3736 valsize += m->m_len;
3737 m = m->m_next;
3738 }
3739 if (m != NULL) {
3740 /* a large enough soopt buffer should be provided from user-land */
3741 m_freem(m0);
3742 return (EINVAL);
3743 }
3744 sopt->sopt_valsize = valsize;
3745 return (0);
3746 }
3747
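/*
 * Notify the socket's owning process or process group (via SIGURG) and
 * wake up any select()/poll() waiters when out-of-band data arrives.
 */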
3748 void
3749 sohasoutofband(struct socket *so)
3750 {
3751
3752 if (so->so_pgid < 0)
3753 gsignal(-so->so_pgid, SIGURG);
3754 else if (so->so_pgid > 0)
3755 proc_signal(so->so_pgid, SIGURG);
3756 selwakeup(&so->so_rcv.sb_sel);
3757 }
3758
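/*
 * poll(2)/select(2) support: report which of the requested events are
 * currently true for the socket; if none are, record the calling thread
 * in the appropriate sockbuf's selinfo so it will be woken on a change.
 */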
3759 int
3760 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3761 {
3762 struct proc *p = current_proc();
3763 int revents = 0;
3764
3765 socket_lock(so, 1);
3766 so_update_last_owner_locked(so, p);
3767
3768 if (events & (POLLIN | POLLRDNORM))
3769 if (soreadable(so))
3770 revents |= events & (POLLIN | POLLRDNORM);
3771
3772 if (events & (POLLOUT | POLLWRNORM))
3773 if (sowriteable(so))
3774 revents |= events & (POLLOUT | POLLWRNORM);
3775
3776 if (events & (POLLPRI | POLLRDBAND))
3777 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3778 revents |= events & (POLLPRI | POLLRDBAND);
3779
3780 if (revents == 0) {
3781 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3782 /*
3783 * Darwin sets the flag first,
3784 * BSD calls selrecord first
3785 */
3786 so->so_rcv.sb_flags |= SB_SEL;
3787 selrecord(p, &so->so_rcv.sb_sel, wql);
3788 }
3789
3790 if (events & (POLLOUT | POLLWRNORM)) {
3791 /*
3792 * Darwin sets the flag first,
3793 * BSD calls selrecord first
3794 */
3795 so->so_snd.sb_flags |= SB_SEL;
3796 selrecord(p, &so->so_snd.sb_sel, wql);
3797 }
3798 }
3799
3800 socket_unlock(so, 1);
3801 return (revents);
3802 }
3803
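/*
 * Attach a knote to the socket's receive or send sockbuf according to the
 * requested filter (EVFILT_READ or EVFILT_WRITE), after an optional MAC
 * policy check.
 */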
3804 int
3805 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3806 __unused struct proc *p)
3807 {
3808 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3809 struct sockbuf *sb;
3810
3811 socket_lock(so, 1);
3812
3813 #if CONFIG_MACF_SOCKET
3814 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3815 socket_unlock(so, 1);
3816 return (1);
3817 }
3818 #endif /* MAC_SOCKET */
3819
3820 switch (kn->kn_filter) {
3821 case EVFILT_READ:
3822 kn->kn_fop = &soread_filtops;
3823 sb = &so->so_rcv;
3824 break;
3825 case EVFILT_WRITE:
3826 kn->kn_fop = &sowrite_filtops;
3827 sb = &so->so_snd;
3828 break;
3829 default:
3830 socket_unlock(so, 1);
3831 return (1);
3832 }
3833
3834 if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
3835 sb->sb_flags |= SB_KNOTE;
3836 socket_unlock(so, 1);
3837 return (0);
3838 }
3839
3840 static void
3841 filt_sordetach(struct knote *kn)
3842 {
3843 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3844
3845 socket_lock(so, 1);
3846 if (so->so_rcv.sb_flags & SB_KNOTE)
3847 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3848 so->so_rcv.sb_flags &= ~SB_KNOTE;
3849 socket_unlock(so, 1);
3850 }
3851
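/*
 * kqueue EVFILT_READ filter: returns non-zero when the socket is readable.
 * For listening sockets this reports pending connections; otherwise it
 * reports available receive data, honoring NOTE_LOWAT and out-of-band state.
 */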
3852 /*ARGSUSED*/
3853 static int
3854 filt_soread(struct knote *kn, long hint)
3855 {
3856 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3857
3858 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3859 socket_lock(so, 1);
3860
3861 if (so->so_options & SO_ACCEPTCONN) {
3862 int isempty;
3863
3864 /* Radar 6615193: handle the listen case dynamically
3865 * for the kqueue read filter. This allows listen() to be called after registering
3866 * the kqueue EVFILT_READ.
3867 */
3868
3869 kn->kn_data = so->so_qlen;
3870 isempty = ! TAILQ_EMPTY(&so->so_comp);
3871
3872 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3873 socket_unlock(so, 1);
3874
3875 return (isempty);
3876 }
3877
3878 /* socket isn't a listener */
3879
3880 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3881
3882 if (so->so_oobmark) {
3883 if (kn->kn_flags & EV_OOBAND) {
3884 kn->kn_data -= so->so_oobmark;
3885 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3886 socket_unlock(so, 1);
3887 return (1);
3888 }
3889 kn->kn_data = so->so_oobmark;
3890 kn->kn_flags |= EV_OOBAND;
3891 } else {
3892 if (so->so_state & SS_CANTRCVMORE) {
3893 kn->kn_flags |= EV_EOF;
3894 kn->kn_fflags = so->so_error;
3895 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3896 socket_unlock(so, 1);
3897 return (1);
3898 }
3899 }
3900
3901 if (so->so_state & SS_RCVATMARK) {
3902 if (kn->kn_flags & EV_OOBAND) {
3903 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3904 socket_unlock(so, 1);
3905 return (1);
3906 }
3907 kn->kn_flags |= EV_OOBAND;
3908 } else if (kn->kn_flags & EV_OOBAND) {
3909 kn->kn_data = 0;
3910 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3911 socket_unlock(so, 1);
3912 return (0);
3913 }
3914
3915 if (so->so_error) { /* temporary udp error */
3916 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3917 socket_unlock(so, 1);
3918 return (1);
3919 }
3920
3921 int64_t lowwat = so->so_rcv.sb_lowat;
3922 if (kn->kn_sfflags & NOTE_LOWAT) {
3924 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
3925 lowwat = so->so_rcv.sb_hiwat;
3926 else if (kn->kn_sdata > lowwat)
3927 lowwat = kn->kn_sdata;
3928 }
3929
3930 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3931 socket_unlock(so, 1);
3932
3933 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
3934 }
3935
3936 static void
3937 filt_sowdetach(struct knote *kn)
3938 {
3939 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3940 socket_lock(so, 1);
3941
3942 if (so->so_snd.sb_flags & SB_KNOTE)
3943 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
3944 so->so_snd.sb_flags &= ~SB_KNOTE;
3945 socket_unlock(so, 1);
3946 }
3947
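/*
 * kqueue EVFILT_WRITE filter: returns non-zero when the send buffer has at
 * least the low-water mark (or the NOTE_LOWAT amount) of space available,
 * or when the socket can no longer send (EV_EOF) or has a pending error.
 */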
3948 /*ARGSUSED*/
3949 static int
3950 filt_sowrite(struct knote *kn, long hint)
3951 {
3952 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3953
3954 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3955 socket_lock(so, 1);
3956
3957 kn->kn_data = sbspace(&so->so_snd);
3958 if (so->so_state & SS_CANTSENDMORE) {
3959 kn->kn_flags |= EV_EOF;
3960 kn->kn_fflags = so->so_error;
3961 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3962 socket_unlock(so, 1);
3963 return (1);
3964 }
3965 if (so->so_error) { /* temporary udp error */
3966 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3967 socket_unlock(so, 1);
3968 return (1);
3969 }
3970 if (((so->so_state & SS_ISCONNECTED) == 0) &&
3971 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3972 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3973 socket_unlock(so, 1);
3974 return (0);
3975 }
3976 int64_t lowwat = so->so_snd.sb_lowat;
3977 if (kn->kn_sfflags & NOTE_LOWAT) {
3979 if (kn->kn_sdata > so->so_snd.sb_hiwat)
3980 lowwat = so->so_snd.sb_hiwat;
3981 else if (kn->kn_sdata > lowwat)
3982 lowwat = kn->kn_sdata;
3983 }
3984 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3985 socket_unlock(so, 1);
3986 return (kn->kn_data >= lowwat);
3987 }
3988
3989 #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1)
3990
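/*
 * Format the socket's recorded lock/unlock caller return addresses into a
 * static string; used by panic messages to help debug locking problems.
 */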
3991 __private_extern__ const char * solockhistory_nr(struct socket *so)
3992 {
3993 size_t n = 0;
3994 int i;
3995 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
3996
3997 bzero(lock_history_str, sizeof(lock_history_str));
3998 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
3999 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
4000 (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4001 (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4002 }
4003 return lock_history_str;
4004 }
4005
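/*
 * Lock the socket, either through the protocol's pr_lock hook or by taking
 * the domain mutex directly; <refcount> indicates whether a use-count
 * reference should also be taken.  The caller's return address is recorded
 * for lock debugging.
 */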
4006 int
4007 socket_lock(struct socket *so, int refcount)
4008 {
4009 int error = 0;
4010 void *lr_saved;
4011
4012 lr_saved = __builtin_return_address(0);
4013
4014 if (so->so_proto->pr_lock) {
4015 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4016 } else {
4017 #ifdef MORE_LOCKING_DEBUG
4018 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4019 LCK_MTX_ASSERT_NOTOWNED);
4020 #endif
4021 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4022 if (refcount)
4023 so->so_usecount++;
4024 so->lock_lr[so->next_lock_lr] = lr_saved;
4025 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4026 }
4027
4028 return (error);
4029 }
4030
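/*
 * Unlock the socket, either through the protocol's pr_unlock hook or by
 * releasing the domain mutex; when <refcount> is non-zero the use count is
 * dropped, and dropping the last reference frees the socket via
 * sofreelastref().
 */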
4031 int
4032 socket_unlock(struct socket *so, int refcount)
4033 {
4034 int error = 0;
4035 void *lr_saved;
4036 lck_mtx_t *mutex_held;
4037
4038 lr_saved = __builtin_return_address(0);
4039
4040 if (so->so_proto == NULL)
4041 panic("socket_unlock null so_proto so=%p\n", so);
4042
4043 if (so && so->so_proto->pr_unlock) {
4044 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4045 } else {
4046 mutex_held = so->so_proto->pr_domain->dom_mtx;
4047 #ifdef MORE_LOCKING_DEBUG
4048 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4049 #endif
4050 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4051 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4052
4053 if (refcount) {
4054 if (so->so_usecount <= 0)
4055 panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
4056 so->so_usecount, so, so->so_proto->pr_domain->dom_family,
4057 so->so_type, so->so_proto->pr_protocol,
4058 solockhistory_nr(so));
4059
4060 so->so_usecount--;
4061 if (so->so_usecount == 0) {
4062 sofreelastref(so, 1);
4063 }
4064 }
4065 lck_mtx_unlock(mutex_held);
4066 }
4067
4068 return (error);
4069 }
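
/*
 * Minimal usage sketch (hypothetical caller): the common pattern is to
 * take the lock together with a use count, operate on the socket, and
 * then release both in one call.
 *
 *	socket_lock(so, 1);	// lock and take a use count
 *	... examine or modify the socket ...
 *	socket_unlock(so, 1);	// drop the use count and unlock;
 *				// frees the socket on the last reference
 */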
4070
4071 /* Called with socket locked, will unlock socket */
4072 void
4073 sofree(struct socket *so)
4074 {
4075
4076 lck_mtx_t *mutex_held;
4077 if (so->so_proto->pr_getlock != NULL)
4078 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4079 else
4080 mutex_held = so->so_proto->pr_domain->dom_mtx;
4081 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4082
4083 sofreelastref(so, 0);
4084 }
4085
4086 void
4087 soreference(struct socket *so)
4088 {
4089 socket_lock(so, 1); /* locks and takes one reference on the socket */
4090 socket_unlock(so, 0); /* unlocks only; the reference is kept */
4091 }
4092
4093 void
4094 sodereference(struct socket *so)
4095 {
4096 socket_lock(so, 0); /* locks only */
4097 socket_unlock(so, 1); /* unlocks and drops one reference on the socket */
4098 }
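
/*
 * Illustrative pattern (hypothetical): soreference()/sodereference() let a
 * caller hold the socket across a region where the socket lock must not be
 * held.
 *
 *	soreference(so);	// take a use count (socket ends up unlocked)
 *	... call out without the socket lock held ...
 *	sodereference(so);	// drop the use count; frees on last reference
 */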
4099
4100 /*
4101 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4102 * possibility of using jumbo clusters. The caller must hold
4103 * the socket lock.
4104 */
4105 void
4106 somultipages(struct socket *so, boolean_t set)
4107 {
4108 if (set)
4109 so->so_flags |= SOF_MULTIPAGES;
4110 else
4111 so->so_flags &= ~SOF_MULTIPAGES;
4112 }
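
/*
 * Minimal usage sketch (hypothetical caller): per the comment above, the
 * socket lock must be held while toggling the flag.
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		// permit multi-page (jumbo) clusters
 *	socket_unlock(so, 1);
 */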
4113
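/*
 * Return non-zero if the socket's foreign (destination) address is local
 * to this host. Only AF_INET and AF_INET6 sockets are considered; any
 * other family reports the destination as non-local.
 */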
4114 int
4115 so_isdstlocal(struct socket *so)
4116 {
4117 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4118
4119 if (so->so_proto->pr_domain->dom_family == AF_INET) {
4120 return inaddr_local(inp->inp_faddr);
4121 } else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
4122 return in6addr_local(&inp->in6p_faddr);
4123 }
4124 return 0;
4125 }
4126
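/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT and set SB_DROP
 * on both socket buffers so no further data is queued. Sockets flagged
 * SOF_NODEFUNCT are skipped with EOPNOTSUPP unless noforce is FALSE, in
 * which case the flag is cleared and the socket is defuncted by force.
 * The actual teardown is performed later by sodefunct().
 */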
4127 int
4128 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4129 {
4130 int err = 0, defunct;
4131
4132 defunct = (so->so_flags & SOF_DEFUNCT);
4133 if (defunct) {
4134 if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP))
4135 panic("%s: SB_DROP not set", __func__);
4136 goto done;
4137 }
4138
4139 if (so->so_flags & SOF_NODEFUNCT) {
4140 if (noforce) {
4141 err = EOPNOTSUPP;
4142 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p "
4143 "[%d,%d] is not eligible for defunct (%d)\n",
4144 __func__, proc_selfpid(), proc_pid(p), level, so,
4145 INP_SOCKAF(so), INP_SOCKTYPE(so), err));
4146 return (err);
4147 }
4148 so->so_flags &= ~SOF_NODEFUNCT;
4149 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] "
4150 "defunct by force\n", __func__, proc_selfpid(), proc_pid(p),
4151 level, so, INP_SOCKAF(so), INP_SOCKTYPE(so)));
4152 }
4153
4154 so->so_flags |= SOF_DEFUNCT;
4155 /* Prevent further data from being appended to the socket buffers */
4156 so->so_snd.sb_flags |= SB_DROP;
4157 so->so_rcv.sb_flags |= SB_DROP;
4158
4159 done:
4160 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
4161 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
4162 INP_SOCKAF(so), INP_SOCKTYPE(so),
4163 defunct ? "is already" : "marked as"));
4164
4165 return (err);
4166 }
4167
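/*
 * Second phase of defuncting: wake any threads blocked in sbwait() or on
 * the socket buffer locks, shut down both data directions, disconnect,
 * discard whatever is left in the socket buffers, set so_error to EBADF
 * and mark the socket SS_DEFUNCT. Must be called with SOF_DEFUNCT already
 * set (see sosetdefunct() above); panics otherwise.
 */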
4168 int
4169 sodefunct(struct proc *p, struct socket *so, int level)
4170 {
4171 struct sockbuf *rcv, *snd;
4172
4173 if (!(so->so_flags & SOF_DEFUNCT))
4174 panic("%s improperly called", __func__);
4175
4176 if (so->so_state & SS_DEFUNCT)
4177 goto done;
4178
4179 rcv = &so->so_rcv;
4180 snd = &so->so_snd;
4181
4182 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
4183 "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
4184 __func__, proc_selfpid(), proc_pid(p), level, so,
4185 INP_SOCKAF(so), INP_SOCKTYPE(so),
4186 (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
4187 (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
4188
4189 /*
4190 * Unwedge threads blocked on sbwait() and sb_lock().
4191 */
4192 sbwakeup(rcv);
4193 sbwakeup(snd);
4194
4195 if (rcv->sb_flags & SB_LOCK)
4196 sbunlock(rcv, 1);
4197 if (snd->sb_flags & SB_LOCK)
4198 sbunlock(snd, 1);
4199
4200 /*
4201 * Flush the buffers and disconnect. We explicitly call shutdown
4202 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4203 * states are set for the socket. This also flushes out any data
4204 * hanging off the receive list of this socket.
4205 */
4206 (void) soshutdownlock(so, SHUT_RD);
4207 (void) soshutdownlock(so, SHUT_WR);
4208 (void) sodisconnectlocked(so);
4209
4210 /*
4211 * Explicitly handle connectionless-protocol disconnection
4212 * and release any remaining data in the socket buffers.
4213 */
4214 if (!(so->so_state & SS_ISDISCONNECTED))
4215 (void) soisdisconnected(so);
4216
4217 if (so->so_error == 0)
4218 so->so_error = EBADF;
4219
4220 if (rcv->sb_cc != 0)
4221 sbrelease(rcv);
4222 if (snd->sb_cc != 0)
4223 sbrelease(snd);
4224
4225 so->so_state |= SS_DEFUNCT;
4226
4227 done:
4228 return (0);
4229 }