apple/xnu xnu-2050.24.15: bsd/kern/uipc_socket.c
1 /*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
63 */
64 /*
65 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
66 * support for mandatory and extensible security protections. This notice
67 * is included in support of clause 2.2 (b) of the Apple Public License,
68 * Version 2.0.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/filedesc.h>
74 #include <sys/proc.h>
75 #include <sys/proc_internal.h>
76 #include <sys/kauth.h>
77 #include <sys/file_internal.h>
78 #include <sys/fcntl.h>
79 #include <sys/malloc.h>
80 #include <sys/mbuf.h>
81 #include <sys/domain.h>
82 #include <sys/kernel.h>
83 #include <sys/event.h>
84 #include <sys/poll.h>
85 #include <sys/protosw.h>
86 #include <sys/socket.h>
87 #include <sys/socketvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/signalvar.h>
90 #include <sys/sysctl.h>
91 #include <sys/uio.h>
92 #include <sys/ev.h>
93 #include <sys/kdebug.h>
94 #include <sys/un.h>
95 #include <sys/user.h>
96 #include <sys/priv.h>
97 #include <net/route.h>
98 #include <net/ntstat.h>
99 #include <netinet/in.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/ip6.h>
102 #include <netinet6/ip6_var.h>
103 #include <kern/zalloc.h>
104 #include <kern/locks.h>
105 #include <machine/limits.h>
106 #include <libkern/OSAtomic.h>
107 #include <pexpert/pexpert.h>
108 #include <kern/assert.h>
109 #include <kern/task.h>
110 #include <sys/kpi_mbuf.h>
111 #include <sys/mcache.h>
112
113 #if CONFIG_MACF
114 #include <security/mac.h>
115 #include <security/mac_framework.h>
116 #endif /* MAC */
117
118
119 int so_cache_hw = 0;
120 int so_cache_timeouts = 0;
121 int so_cache_max_freed = 0;
122 int cached_sock_count = 0;
123 __private_extern__ int max_cached_sock_count = MAX_CACHED_SOCKETS;
124 struct socket *socket_cache_head = 0;
125 struct socket *socket_cache_tail = 0;
126 u_int32_t so_cache_time = 0;
127 int so_cache_init_done = 0;
128 struct zone *so_cache_zone;
129
130 static lck_grp_t *so_cache_mtx_grp;
131 static lck_attr_t *so_cache_mtx_attr;
132 static lck_grp_attr_t *so_cache_mtx_grp_attr;
133 lck_mtx_t *so_cache_mtx;
134
135 #include <machine/limits.h>
136
137 static void filt_sordetach(struct knote *kn);
138 static int filt_soread(struct knote *kn, long hint);
139 static void filt_sowdetach(struct knote *kn);
140 static int filt_sowrite(struct knote *kn, long hint);
141 static void filt_sockdetach(struct knote *kn);
142 static int filt_sockev(struct knote *kn, long hint);
143
144 static int
145 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
146
147 static int
148 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p);
149
150 static struct filterops soread_filtops = {
151 .f_isfd = 1,
152 .f_detach = filt_sordetach,
153 .f_event = filt_soread,
154 };
155 static struct filterops sowrite_filtops = {
156 .f_isfd = 1,
157 .f_detach = filt_sowdetach,
158 .f_event = filt_sowrite,
159 };
160 static struct filterops sock_filtops = {
161 .f_isfd = 1,
162 .f_detach = filt_sockdetach,
163 .f_event = filt_sockev,
164 };
165
166 #define EVEN_MORE_LOCKING_DEBUG 0
167 int socket_debug = 0;
168 int socket_zone = M_SOCKET;
169 so_gen_t so_gencnt; /* generation count for sockets */
170
171 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
172 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
173
174 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
175 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
176 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
177 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
178 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
179 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
180 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
181
182 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
183
184
185 SYSCTL_DECL(_kern_ipc);
186
187 int somaxconn = SOMAXCONN;
188 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
189
190 /* Should we get a maximum also ??? */
191 static int sosendmaxchain = 65536;
192 static int sosendminchain = 16384;
193 static int sorecvmincopy = 16384;
194 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain,
195 0, "");
196 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy,
197 0, "");
198
199 /*
200 * Set to enable jumbo clusters (if available) for large writes when
201 * the socket is marked with SOF_MULTIPAGES; see below.
202 */
203 int sosendjcl = 1;
204 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
205
206 /*
207 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
208 * writes on the socket for all protocols on any network interfaces,
209 * depending upon sosendjcl above. Be extra careful when setting this
210 * to 1, because sending packets that cross physical pages down to
211 * broken drivers (those that falsely assume that the physical pages
212 * are contiguous) might lead to system panics or silent data corruption.
213 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
214 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
215 * capable. Set this to 1 only for testing/debugging purposes.
216 */
217 int sosendjcl_ignore_capab = 0;
218 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED,
219 &sosendjcl_ignore_capab, 0, "");
220
221 int sodefunctlog = 0;
222 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
223 &sodefunctlog, 0, "");
224
225 int sothrottlelog = 0;
226 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
227 &sothrottlelog, 0, "");
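/*
 * All of the tunables above are published under the kern.ipc sysctl
 * node (kern.ipc.somaxconn, kern.ipc.sosendminchain, kern.ipc.sorecvmincopy,
 * kern.ipc.sosendjcl, kern.ipc.sosendjcl_ignore_capab, kern.ipc.sodefunctlog,
 * kern.ipc.sothrottlelog) and, being CTLFLAG_RW, can be changed at run
 * time, e.g. sysctl -w kern.ipc.sodefunctlog=1.
 */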
228
229 /*
230 * Socket operation routines.
231 * These routines are called by the routines in
232 * sys_socket.c or from a system process, and
233 * implement the semantics of socket operations by
234 * switching out to the protocol specific routines.
235 */
236
237 /* sys_generic.c */
238 extern void postevent(struct socket *, struct sockbuf *, int);
239 extern void evsofree(struct socket *);
240 extern int tcp_notsent_lowat_check(struct socket *so);
241
242 /* TODO: these should be in header file */
243 extern int get_inpcb_str_size(void);
244 extern int get_tcp_str_size(void);
245 extern struct domain *pffinddomain(int);
246 extern struct protosw *pffindprotonotype(int, int);
247 extern int soclose_locked(struct socket *);
248 extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
249
250 #ifdef __APPLE__
251
252 vm_size_t so_cache_zone_element_size;
253
254 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
255 static void cached_sock_alloc(struct socket **, int);
256 static void cached_sock_free(struct socket *);
257 static void so_cache_timer(void *);
258
259 void soclose_wait_locked(struct socket *so);
260 int so_isdstlocal(struct socket *so);
261
262 /*
263 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
264 * setting the DSCP code on the packet based on the service class; see
265 * <rdar://problem/11277343> for details.
266 */
267 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
269 &sotcdb, 0, "");
270
271 void
272 socketinit(void)
273 {
274 vm_size_t str_size;
275
276 if (so_cache_init_done) {
277 printf("socketinit: already called...\n");
278 return;
279 }
280
281 PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));
282
283 /*
284 * allocate lock group attribute and group for socket cache mutex
285 */
286 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
287
288 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
289 so_cache_mtx_grp_attr);
290
291 /*
292 * allocate the lock attribute for socket cache mutex
293 */
294 so_cache_mtx_attr = lck_attr_alloc_init();
295
296 so_cache_init_done = 1;
297
298 /* cached sockets mutex */
299 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
300
301 if (so_cache_mtx == NULL)
302 return; /* we're hosed... */
303
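/*
 * Each cache element is sized to hold a socket followed by an inpcb
 * and a tcpcb; the extra 4 bytes of slop per structure leave room for
 * the longword alignment performed later in cached_sock_alloc().
 */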
304 str_size = (vm_size_t)(sizeof (struct socket) + 4 +
305 get_inpcb_str_size() + 4 + get_tcp_str_size());
306
307 so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
308 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
309 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
310 #if TEMPDEBUG
311 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
312 #endif
313 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
314
315 so_cache_zone_element_size = str_size;
316
317 sflt_init();
318
319 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
320
321 socket_tclass_init();
322
323 socket_flowadv_init();
324 }
325
326 static void
327 cached_sock_alloc(struct socket **so, int waitok)
328 {
329 caddr_t temp;
330 register uintptr_t offset;
331
332 lck_mtx_lock(so_cache_mtx);
333
334 if (cached_sock_count) {
335 cached_sock_count--;
336 *so = socket_cache_head;
337 if (*so == 0)
338 panic("cached_sock_alloc: cached sock is null");
339
340 socket_cache_head = socket_cache_head->cache_next;
341 if (socket_cache_head)
342 socket_cache_head->cache_prev = 0;
343 else
344 socket_cache_tail = 0;
345
346 lck_mtx_unlock(so_cache_mtx);
347
348 temp = (*so)->so_saved_pcb;
349 bzero((caddr_t)*so, sizeof (struct socket));
350 #if TEMPDEBUG
351 kprintf("cached_sock_alloc - retreiving cached sock %p - "
352 "count == %d\n", *so, cached_sock_count);
353 #endif
354 (*so)->so_saved_pcb = temp;
355 (*so)->cached_in_sock_layer = 1;
356 } else {
357 #if TEMPDEBUG
358 kprintf("Allocating cached sock %p from memory\n", *so);
359 #endif
360
361 lck_mtx_unlock(so_cache_mtx);
362
363 if (waitok)
364 *so = (struct socket *)zalloc(so_cache_zone);
365 else
366 *so = (struct socket *)zalloc_noblock(so_cache_zone);
367
368 if (*so == 0)
369 return;
370
371 bzero((caddr_t)*so, sizeof (struct socket));
372
373 /*
374 * Define offsets for extra structures into our single block of
375 * memory. Align extra structures on longword boundaries.
376 */
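/*
 * The resulting single allocation is laid out roughly as:
 *
 *   [ struct socket ][ pad ][ inpcb ][ pad ][ tcpcb ]
 *                            ^                ^
 *                       so_saved_pcb     inp_saved_ppcb
 */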
377
378 offset = (uintptr_t) *so;
379 offset += sizeof (struct socket);
380
381 offset = ALIGN(offset);
382
383 (*so)->so_saved_pcb = (caddr_t)offset;
384 offset += get_inpcb_str_size();
385
386 offset = ALIGN(offset);
387
388 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
389 (caddr_t)offset;
390 #if TEMPDEBUG
391 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
392 *so, (*so)->so_saved_pcb,
393 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
394 #endif
395 }
396
397 (*so)->cached_in_sock_layer = 1;
398 }
399
400 static void
401 cached_sock_free(struct socket *so)
402 {
403
404 lck_mtx_lock(so_cache_mtx);
405
406 if (++cached_sock_count > max_cached_sock_count) {
407 --cached_sock_count;
408 lck_mtx_unlock(so_cache_mtx);
409 #if TEMPDEBUG
410 kprintf("Freeing overflowed cached socket %p\n", so);
411 #endif
412 zfree(so_cache_zone, so);
413 } else {
414 #if TEMPDEBUG
415 kprintf("Freeing socket %p into cache\n", so);
416 #endif
417 if (so_cache_hw < cached_sock_count)
418 so_cache_hw = cached_sock_count;
419
420 so->cache_next = socket_cache_head;
421 so->cache_prev = 0;
422 if (socket_cache_head)
423 socket_cache_head->cache_prev = so;
424 else
425 socket_cache_tail = so;
426
427 so->cache_timestamp = so_cache_time;
428 socket_cache_head = so;
429 lck_mtx_unlock(so_cache_mtx);
430 }
431
432 #if TEMPDEBUG
433 kprintf("Freed cached sock %p into cache - count is %d\n",
434 so, cached_sock_count);
435 #endif
436 }
437
438 static void
439 so_update_last_owner_locked(
440 struct socket *so,
441 proc_t self)
442 {
443 if (so->last_pid != 0)
444 {
445 if (self == NULL)
446 self = current_proc();
447
448 if (self)
449 {
450 so->last_upid = proc_uniqueid(self);
451 so->last_pid = proc_pid(self);
452 }
453 }
454 }
455
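/*
 * Timer callback that ages the socket cache: entries that have sat in
 * the cache for at least SO_CACHE_TIME_LIMIT intervals are freed back
 * to the zone, at most SO_CACHE_MAX_FREE_BATCH of them per pass, and
 * the timer then re-arms itself for SO_CACHE_FLUSH_INTERVAL seconds.
 */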
456 static void
457 so_cache_timer(__unused void *dummy)
458 {
459 register struct socket *p;
460 register int n_freed = 0;
461
462 lck_mtx_lock(so_cache_mtx);
463
464 ++so_cache_time;
465
466 while ((p = socket_cache_tail)) {
467 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
468 break;
469
470 so_cache_timeouts++;
471
472 if ((socket_cache_tail = p->cache_prev))
473 p->cache_prev->cache_next = 0;
474 if (--cached_sock_count == 0)
475 socket_cache_head = 0;
476
477 zfree(so_cache_zone, p);
478
479 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
480 so_cache_max_freed++;
481 break;
482 }
483 }
484 lck_mtx_unlock(so_cache_mtx);
485
486 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
487 }
488 #endif /* __APPLE__ */
489
490 /*
491 * Get a socket structure from our zone, and initialize it.
492 * We don't implement `waitok' yet (see comments in uipc_domain.c).
493 * Note that it would probably be better to allocate socket
494 * and PCB at the same time, but I'm not convinced that all
495 * the protocols can be easily modified to do this.
496 */
497 struct socket *
498 soalloc(int waitok, int dom, int type)
499 {
500 struct socket *so;
501
502 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
503 cached_sock_alloc(&so, waitok);
504 } else {
505 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
506 M_WAITOK);
507 if (so != NULL)
508 bzero(so, sizeof (*so));
509 }
510 /* XXX race condition for reentrant kernel */
511 //###LD Atomic add for so_gencnt
512 if (so != NULL) {
513 so->so_gencnt = ++so_gencnt;
514 so->so_zone = socket_zone;
515 #if CONFIG_MACF_SOCKET
516 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
517 if (mac_socket_label_init(so, !waitok) != 0) {
518 sodealloc(so);
519 return (NULL);
520 }
521 #endif /* MAC_SOCKET */
522 }
523
524 return (so);
525 }
526
527 /*
528 * Returns: 0 Success
529 * EAFNOSUPPORT
530 * EPROTOTYPE
531 * EPROTONOSUPPORT
532 * ENOBUFS
533 * <pru_attach>:ENOBUFS[AF_UNIX]
534 * <pru_attach>:ENOBUFS[TCP]
535 * <pru_attach>:ENOMEM[TCP]
536 * <pru_attach>:EISCONN[TCP]
537 * <pru_attach>:??? [other protocol families, IPSEC]
538 */
539 int
540 socreate(int dom, struct socket **aso, int type, int proto)
541 {
542 struct proc *p = current_proc();
543 register struct protosw *prp;
544 register struct socket *so;
545 register int error = 0;
546
547 #if TCPDEBUG
548 extern int tcpconsdebug;
549 #endif
550 if (proto)
551 prp = pffindproto(dom, proto, type);
552 else
553 prp = pffindtype(dom, type);
554
555 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
556 if (pffinddomain(dom) == NULL) {
557 return (EAFNOSUPPORT);
558 }
559 if (proto != 0) {
560 if (pffindprotonotype(dom, proto) != NULL) {
561 return (EPROTOTYPE);
562 }
563 }
564 return (EPROTONOSUPPORT);
565 }
566 if (prp->pr_type != type)
567 return (EPROTOTYPE);
568 so = soalloc(1, dom, type);
569 if (so == 0)
570 return (ENOBUFS);
571
572 TAILQ_INIT(&so->so_incomp);
573 TAILQ_INIT(&so->so_comp);
574 so->so_type = type;
575 so->last_upid = proc_uniqueid(p);
576 so->last_pid = proc_pid(p);
577
578 so->so_cred = kauth_cred_proc_ref(p);
579 if (!suser(kauth_cred_get(), NULL))
580 so->so_state = SS_PRIV;
581
582 so->so_proto = prp;
583 #ifdef __APPLE__
584 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
585 so->so_rcv.sb_so = so->so_snd.sb_so = so;
586 #endif
587 so->next_lock_lr = 0;
588 so->next_unlock_lr = 0;
589
590 #if CONFIG_MACF_SOCKET
591 mac_socket_label_associate(kauth_cred_get(), so);
592 #endif /* MAC_SOCKET */
593
594 //### Attachment will create the per-pcb lock if necessary and increase the refcount
595 /*
596 * For creation, make sure it's done before
597 * the socket is inserted in lists
598 */
599 so->so_usecount++;
600
601 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
602 if (error) {
603 /*
604 * Warning:
605 * If so_pcb is not zero, the socket will be leaked,
606 * so the protocol attachment handler must be coded carefully
607 */
608 so->so_state |= SS_NOFDREF;
609 so->so_usecount--;
610 sofreelastref(so, 1); /* will deallocate the socket */
611 return (error);
612 }
613 #ifdef __APPLE__
614 prp->pr_domain->dom_refs++;
615 TAILQ_INIT(&so->so_evlist);
616
617 /* Attach socket filters for this protocol */
618 sflt_initsock(so);
619 #if TCPDEBUG
620 if (tcpconsdebug == 2)
621 so->so_options |= SO_DEBUG;
622 #endif
623 #endif
624 so_set_default_traffic_class(so);
625 /*
626 * If this is a background thread/task, mark the socket as such.
627 */
628 if (proc_get_self_isbackground() != 0) {
629 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
630 so->so_background_thread = current_thread();
631 }
632
633 switch (dom) {
634 /*
635 * Don't mark Unix domain or system sockets as eligible for defunct by default.
636 */
637 case PF_LOCAL:
638 case PF_SYSTEM:
639 so->so_flags |= SOF_NODEFUNCT;
640 break;
641 default:
642 break;
643 }
644
645 *aso = so;
646 return (0);
647 }
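/*
 * Illustrative only: an in-kernel caller would typically obtain a TCP
 * socket with something like
 *
 *	struct socket *so;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * and release it later with soclose(so); the socket(2) system call and
 * the sock_socket() KPI both funnel into this routine.
 */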
648
649 /*
650 * Returns: 0 Success
651 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
652 * <pru_bind>:EAFNOSUPPORT Address family not supported
653 * <pru_bind>:EADDRNOTAVAIL Address not available.
654 * <pru_bind>:EINVAL Invalid argument
655 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
656 * <pru_bind>:EACCES Permission denied
657 * <pru_bind>:EADDRINUSE Address in use
658 * <pru_bind>:EAGAIN Resource unavailable, try again
659 * <pru_bind>:EPERM Operation not permitted
660 * <pru_bind>:???
661 * <sf_bind>:???
662 *
663 * Notes: It's not possible to fully enumerate the return codes above,
664 * since socket filter authors and protocol family authors may
665 * not choose to limit their error returns to those listed, even
666 * though this may result in some software operating incorrectly.
667 *
668 * The error codes which are enumerated above are those known to
669 * be returned by the tcp_usr_bind function supplied.
670 */
671 int
672 sobind(struct socket *so, struct sockaddr *nam)
673 {
674 struct proc *p = current_proc();
675 int error = 0;
676
677 socket_lock(so, 1);
678 VERIFY(so->so_usecount > 1);
679 so_update_last_owner_locked(so, p);
680
681 /*
682 * If this is a bind request on a socket that has been marked
683 * as inactive, reject it now before we go any further.
684 */
685 if (so->so_flags & SOF_DEFUNCT) {
686 error = EINVAL;
687 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
688 __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
689 error));
690 goto out;
691 }
692
693 /* Socket filter */
694 error = sflt_bind(so, nam);
695
696 if (error == 0)
697 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
698 out:
699 socket_unlock(so, 1);
700
701 if (error == EJUSTRETURN)
702 error = 0;
703
704 return (error);
705 }
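/*
 * Note that a socket filter's bind handler may return EJUSTRETURN to
 * claim the bind: pru_bind is then skipped and, because EJUSTRETURN is
 * mapped to 0 above, the caller still sees success.
 */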
706
707 void
708 sodealloc(struct socket *so)
709 {
710 kauth_cred_unref(&so->so_cred);
711
712 /* Remove any filters */
713 sflt_termsock(so);
714
715 so->so_gencnt = ++so_gencnt;
716
717 #if CONFIG_MACF_SOCKET
718 mac_socket_label_destroy(so);
719 #endif /* MAC_SOCKET */
720 if (so->cached_in_sock_layer == 1) {
721 cached_sock_free(so);
722 } else {
723 if (so->cached_in_sock_layer == -1)
724 panic("sodealloc: double dealloc: so=%p\n", so);
725 so->cached_in_sock_layer = -1;
726 FREE_ZONE(so, sizeof (*so), so->so_zone);
727 }
728 }
729
730 /*
731 * Returns: 0 Success
732 * EINVAL
733 * EOPNOTSUPP
734 * <pru_listen>:EINVAL[AF_UNIX]
735 * <pru_listen>:EINVAL[TCP]
736 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
737 * <pru_listen>:EINVAL[TCP] Invalid argument
738 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
739 * <pru_listen>:EACCES[TCP] Permission denied
740 * <pru_listen>:EADDRINUSE[TCP] Address in use
741 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
742 * <pru_listen>:EPERM[TCP] Operation not permitted
743 * <sf_listen>:???
744 *
745 * Notes: Other <pru_listen> returns depend on the protocol family; all
746 * <sf_listen> returns depend on what the filter author causes
747 * their filter to return.
748 */
749 int
750 solisten(struct socket *so, int backlog)
751 {
752 struct proc *p = current_proc();
753 int error = 0;
754
755 socket_lock(so, 1);
756
757 if (so->so_proto == NULL) {
758 error = EINVAL;
759 goto out;
760 }
761 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
762 error = EOPNOTSUPP;
763 goto out;
764 }
765
766 /*
767 * If the listen request is made on a socket that is not fully
768 * disconnected, or on a socket that has been marked as inactive,
769 * reject the request now.
770 */
771 if ((so->so_state &
772 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
773 (so->so_flags & SOF_DEFUNCT)) {
774 error = EINVAL;
775 if (so->so_flags & SOF_DEFUNCT) {
776 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
777 __func__, proc_pid(p), so, INP_SOCKAF(so),
778 INP_SOCKTYPE(so), error));
779 }
780 goto out;
781 }
782
783 if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
784 error = EPERM;
785 goto out;
786 }
787
788 error = sflt_listen(so);
789
790 if (error == 0) {
791 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
792 }
793
794 if (error) {
795 if (error == EJUSTRETURN)
796 error = 0;
797 goto out;
798 }
799
800 if (TAILQ_EMPTY(&so->so_comp))
801 so->so_options |= SO_ACCEPTCONN;
802 /*
803 * POSIX: The implementation may have an upper limit on the length of
804 * the listen queue, either global or per accepting socket. If backlog
805 * exceeds this limit, the length of the listen queue is set to the
806 * limit.
807 *
808 * If listen() is called with a backlog argument value that is less
809 * than 0, the function behaves as if it had been called with a backlog
810 * argument value of 0.
811 *
812 * A backlog argument of 0 may allow the socket to accept connections,
813 * in which case the length of the listen queue may be set to an
814 * implementation-defined minimum value.
815 */
816 if (backlog <= 0 || backlog > somaxconn)
817 backlog = somaxconn;
818
819 so->so_qlimit = backlog;
820 out:
821 socket_unlock(so, 1);
822 return (error);
823 }
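/*
 * For example, with the default kern.ipc.somaxconn, listen(fd, 0) and
 * listen(fd, 5000) both leave so_qlimit at somaxconn, while
 * listen(fd, 10) keeps a queue limit of 10.
 */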
824
825 void
826 sofreelastref(struct socket *so, int dealloc)
827 {
828 struct socket *head = so->so_head;
829
830 /* Assume socket is locked */
831
832 if ((!(so->so_flags & SOF_PCBCLEARING)) ||
833 ((so->so_state & SS_NOFDREF) == 0)) {
834 #ifdef __APPLE__
835 selthreadclear(&so->so_snd.sb_sel);
836 selthreadclear(&so->so_rcv.sb_sel);
837 so->so_rcv.sb_flags &= ~SB_UPCALL;
838 so->so_snd.sb_flags &= ~SB_UPCALL;
839 #endif
840 return;
841 }
842 if (head != NULL) {
843 socket_lock(head, 1);
844 if (so->so_state & SS_INCOMP) {
845 TAILQ_REMOVE(&head->so_incomp, so, so_list);
846 head->so_incqlen--;
847 } else if (so->so_state & SS_COMP) {
848 /*
849 * We must not decommission a socket that's
850 * on the accept(2) queue. If we do, then
851 * accept(2) may hang after select(2) indicated
852 * that the listening socket was ready.
853 */
854 #ifdef __APPLE__
855 selthreadclear(&so->so_snd.sb_sel);
856 selthreadclear(&so->so_rcv.sb_sel);
857 so->so_rcv.sb_flags &= ~SB_UPCALL;
858 so->so_snd.sb_flags &= ~SB_UPCALL;
859 #endif
860 socket_unlock(head, 1);
861 return;
862 } else {
863 panic("sofree: not queued");
864 }
865 head->so_qlen--;
866 so->so_state &= ~SS_INCOMP;
867 so->so_head = NULL;
868 socket_unlock(head, 1);
869 }
870 #ifdef __APPLE__
871 selthreadclear(&so->so_snd.sb_sel);
872 sbrelease(&so->so_snd);
873 #endif
874 sorflush(so);
875
876 /* 3932268: disable upcall */
877 so->so_rcv.sb_flags &= ~SB_UPCALL;
878 so->so_snd.sb_flags &= ~SB_UPCALL;
879
880 if (dealloc)
881 sodealloc(so);
882 }
883
884 void
885 soclose_wait_locked(struct socket *so)
886 {
887 lck_mtx_t *mutex_held;
888
889 if (so->so_proto->pr_getlock != NULL)
890 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
891 else
892 mutex_held = so->so_proto->pr_domain->dom_mtx;
893 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
894
895 /*
896 * Double check here and return if there's no outstanding upcall;
897 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
898 */
899 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
900 return;
901 so->so_rcv.sb_flags &= ~SB_UPCALL;
902 so->so_snd.sb_flags &= ~SB_UPCALL;
903 so->so_flags |= SOF_CLOSEWAIT;
904 (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
905 "soclose_wait_locked", NULL);
906 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
907 so->so_flags &= ~SOF_CLOSEWAIT;
908 }
909
910 /*
911 * Close a socket on last file table reference removal.
912 * Initiate disconnect if connected.
913 * Free socket when disconnect complete.
914 */
915 int
916 soclose_locked(struct socket *so)
917 {
918 int error = 0;
919 lck_mtx_t *mutex_held;
920 struct timespec ts;
921
922 if (so->so_usecount == 0) {
923 panic("soclose: so=%p refcount=0\n", so);
924 }
925
926 sflt_notify(so, sock_evt_closing, NULL);
927
928 if ((so->so_options & SO_ACCEPTCONN)) {
929 struct socket *sp, *sonext;
930 int socklock = 0;
931
932 /*
933 * We do not want new connections to be added
934 * to the connection queues
935 */
936 so->so_options &= ~SO_ACCEPTCONN;
937
938 for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
939 sonext = TAILQ_NEXT(sp, so_list);
940
941 /* Radar 5350314
942 * Skip sockets thrown away by tcpdropdropblreq;
943 * they will get cleaned up by the garbage collection.
944 * Otherwise, remove the incomplete socket from the queue
945 * and let soabort trigger the appropriate cleanup.
946 */
947 if (sp->so_flags & SOF_OVERFLOW)
948 continue;
949
950 if (so->so_proto->pr_getlock != NULL) {
951 /* For lock ordering consistency with the rest of the stack,
952 * we lock the socket first and then grab the head.
953 */
954 socket_unlock(so, 0);
955 socket_lock(sp, 1);
956 socket_lock(so, 0);
957 socklock = 1;
958 }
959
960 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
961 so->so_incqlen--;
962
963 if (sp->so_state & SS_INCOMP) {
964 sp->so_state &= ~SS_INCOMP;
965 sp->so_head = NULL;
966
967 (void) soabort(sp);
968 }
969
970 if (socklock)
971 socket_unlock(sp, 1);
972 }
973
974 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
975 /* Dequeue from so_comp since sofree() won't do it */
976 TAILQ_REMOVE(&so->so_comp, sp, so_list);
977 so->so_qlen--;
978
979 if (so->so_proto->pr_getlock != NULL) {
980 socket_unlock(so, 0);
981 socket_lock(sp, 1);
982 }
983
984 if (sp->so_state & SS_COMP) {
985 sp->so_state &= ~SS_COMP;
986 sp->so_head = NULL;
987
988 (void) soabort(sp);
989 }
990
991 if (so->so_proto->pr_getlock != NULL) {
992 socket_unlock(sp, 1);
993 socket_lock(so, 0);
994 }
995 }
996 }
997 if (so->so_pcb == 0) {
998 /* 3915887: mark the socket as ready for dealloc */
999 so->so_flags |= SOF_PCBCLEARING;
1000 goto discard;
1001 }
1002 if (so->so_state & SS_ISCONNECTED) {
1003 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1004 error = sodisconnectlocked(so);
1005 if (error)
1006 goto drop;
1007 }
1008 if (so->so_options & SO_LINGER) {
1009 if ((so->so_state & SS_ISDISCONNECTING) &&
1010 (so->so_state & SS_NBIO))
1011 goto drop;
1012 if (so->so_proto->pr_getlock != NULL)
1013 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1014 else
1015 mutex_held = so->so_proto->pr_domain->dom_mtx;
1016 while (so->so_state & SS_ISCONNECTED) {
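/*
 * so_linger is kept in clock ticks here; assuming the usual 100
 * ticks per second, this converts it to whole seconds plus the
 * residual ticks at 10 ms (10 * 1000 * NSEC_PER_USEC ns) each.
 */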
1017 ts.tv_sec = (so->so_linger/100);
1018 ts.tv_nsec = (so->so_linger % 100) *
1019 NSEC_PER_USEC * 1000 * 10;
1020 error = msleep((caddr_t)&so->so_timeo,
1021 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1022 if (error) {
1023 /*
1024 * It's OK when the timer fires;
1025 * don't report an error
1026 */
1027 if (error == EWOULDBLOCK)
1028 error = 0;
1029 break;
1030 }
1031 }
1032 }
1033 }
1034 drop:
1035 if (so->so_usecount == 0)
1036 panic("soclose: usecount is zero so=%p\n", so);
1037 if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
1038 /*
1039 * Let NetworkStatistics know this PCB is going away
1040 * before we detach it.
1041 */
1042 if (nstat_collect &&
1043 (so->so_proto->pr_domain->dom_family == AF_INET ||
1044 so->so_proto->pr_domain->dom_family == AF_INET6))
1045 nstat_pcb_detach(so->so_pcb);
1046
1047 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1048 if (error == 0)
1049 error = error2;
1050 }
1051 if (so->so_usecount <= 0)
1052 panic("soclose: usecount is zero so=%p\n", so);
1053 discard:
1054 if (so->so_pcb && so->so_state & SS_NOFDREF)
1055 panic("soclose: NOFDREF");
1056 so->so_state |= SS_NOFDREF;
1057
1058 if ((so->so_flags & SOF_KNOTE) != 0)
1059 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1060 #ifdef __APPLE__
1061 so->so_proto->pr_domain->dom_refs--;
1062 evsofree(so);
1063 #endif
1064 so->so_usecount--;
1065 sofree(so);
1066 return (error);
1067 }
1068
1069 int
1070 soclose(struct socket *so)
1071 {
1072 int error = 0;
1073 socket_lock(so, 1);
1074
1075 if (so->so_upcallusecount)
1076 soclose_wait_locked(so);
1077
1078 if (so->so_retaincnt == 0) {
1079 error = soclose_locked(so);
1080 } else {
1081 /*
1082 * If the FD is going away but the socket is
1083 * retained in the kernel, remove its reference
1084 */
1085 so->so_usecount--;
1086 if (so->so_usecount < 2)
1087 panic("soclose: retaincnt non null and so=%p "
1088 "usecount=%d\n", so, so->so_usecount);
1089 }
1090 socket_unlock(so, 1);
1091 return (error);
1092 }
1093
1094 /*
1095 * Must be called at splnet...
1096 */
1097 /* Should already be locked */
1098 int
1099 soabort(struct socket *so)
1100 {
1101 int error;
1102
1103 #ifdef MORE_LOCKING_DEBUG
1104 lck_mtx_t *mutex_held;
1105
1106 if (so->so_proto->pr_getlock != NULL)
1107 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1108 else
1109 mutex_held = so->so_proto->pr_domain->dom_mtx;
1110 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1111 #endif
1112
1113 if ((so->so_flags & SOF_ABORTED) == 0) {
1114 so->so_flags |= SOF_ABORTED;
1115 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1116 if (error) {
1117 sofree(so);
1118 return (error);
1119 }
1120 }
1121 return (0);
1122 }
1123
1124 int
1125 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1126 {
1127 int error;
1128
1129 if (dolock)
1130 socket_lock(so, 1);
1131
1132 if ((so->so_state & SS_NOFDREF) == 0)
1133 panic("soaccept: !NOFDREF");
1134 so->so_state &= ~SS_NOFDREF;
1135 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1136
1137 if (dolock)
1138 socket_unlock(so, 1);
1139 return (error);
1140 }
1141
1142 int
1143 soaccept(struct socket *so, struct sockaddr **nam)
1144 {
1145 return (soacceptlock(so, nam, 1));
1146 }
1147
1148 int
1149 soacceptfilter(struct socket *so)
1150 {
1151 struct sockaddr *local = NULL, *remote = NULL;
1152 int error = 0;
1153 struct socket *head = so->so_head;
1154
1155 /*
1156 * Hold the lock even if this socket
1157 * has not been made visible to the filter(s).
1158 * For sockets with global locks, this protects against the
1159 * head or peer going away
1160 */
1161 socket_lock(so, 1);
1162 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1163 sogetaddr_locked(so, &local, 0) != 0) {
1164 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1165 so->so_head = NULL;
1166 socket_unlock(so, 1);
1167 soclose(so);
1168 /* Out of resources; try it again next time */
1169 error = ECONNABORTED;
1170 goto done;
1171 }
1172
1173 error = sflt_accept(head, so, local, remote);
1174
1175 /*
1176 * If we get EJUSTRETURN from one of the filters, mark this socket
1177 * as inactive and return it anyway. This newly accepted socket
1178 * will be disconnected later before we hand it off to the caller.
1179 */
1180 if (error == EJUSTRETURN) {
1181 error = 0;
1182 (void) sosetdefunct(current_proc(), so,
1183 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1184 }
1185
1186 if (error != 0) {
1187 /*
1188 * This may seem like a duplication of the error handling
1189 * above where we return ECONNABORTED, except
1190 * the following is done while holding the lock since
1191 * the socket has been exposed to the filter(s) earlier.
1192 */
1193 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1194 so->so_head = NULL;
1195 socket_unlock(so, 1);
1196 soclose(so);
1197 /* Propagate socket filter's error code to the caller */
1198 } else {
1199 socket_unlock(so, 1);
1200 }
1201 done:
1202 /* Callee checks for NULL pointer */
1203 sock_freeaddr(remote);
1204 sock_freeaddr(local);
1205 return (error);
1206 }
1207
1208 /*
1209 * Returns: 0 Success
1210 * EOPNOTSUPP Operation not supported on socket
1211 * EISCONN Socket is connected
1212 * <pru_connect>:EADDRNOTAVAIL Address not available.
1213 * <pru_connect>:EINVAL Invalid argument
1214 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1215 * <pru_connect>:EACCES Permission denied
1216 * <pru_connect>:EADDRINUSE Address in use
1217 * <pru_connect>:EAGAIN Resource unavailable, try again
1218 * <pru_connect>:EPERM Operation not permitted
1219 * <sf_connect_out>:??? [anything a filter writer might set]
1220 */
1221 int
1222 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1223 {
1224 int error;
1225 struct proc *p = current_proc();
1226
1227 if (dolock)
1228 socket_lock(so, 1);
1229
1230 /*
1231 * If this is a listening socket or if this is a previously-accepted
1232 * socket that has been marked as inactive, reject the connect request.
1233 */
1234 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1235 error = EOPNOTSUPP;
1236 if (so->so_flags & SOF_DEFUNCT) {
1237 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
1238 __func__, proc_pid(p), so, INP_SOCKAF(so),
1239 INP_SOCKTYPE(so), error));
1240 }
1241 if (dolock)
1242 socket_unlock(so, 1);
1243 return (error);
1244 }
1245
1246 if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
1247 if (dolock)
1248 socket_unlock(so, 1);
1249 return (EPERM);
1250 }
1251
1252 /*
1253 * If protocol is connection-based, can only connect once.
1254 * Otherwise, if connected, try to disconnect first.
1255 * This allows user to disconnect by connecting to, e.g.,
1256 * a null address.
1257 */
1258 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1259 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1260 (error = sodisconnectlocked(so)))) {
1261 error = EISCONN;
1262 } else {
1263 /*
1264 * Run connect filter before calling protocol:
1265 * - non-blocking connect returns before completion;
1266 */
1267 error = sflt_connectout(so, nam);
1268
1269 if (error) {
1270 if (error == EJUSTRETURN)
1271 error = 0;
1272 } else {
1273 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
1274 }
1275 }
1276 if (dolock)
1277 socket_unlock(so, 1);
1278 return (error);
1279 }
1280
1281 int
1282 soconnect(struct socket *so, struct sockaddr *nam)
1283 {
1284 return (soconnectlock(so, nam, 1));
1285 }
1286
1287 /*
1288 * Returns: 0 Success
1289 * <pru_connect2>:EINVAL[AF_UNIX]
1290 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1291 * <pru_connect2>:??? [other protocol families]
1292 *
1293 * Notes: <pru_connect2> is not supported by [TCP].
1294 */
1295 int
1296 soconnect2(struct socket *so1, struct socket *so2)
1297 {
1298 int error;
1299
1300 socket_lock(so1, 1);
1301 if (so2->so_proto->pr_lock)
1302 socket_lock(so2, 1);
1303
1304 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1305
1306 socket_unlock(so1, 1);
1307 if (so2->so_proto->pr_lock)
1308 socket_unlock(so2, 1);
1309 return (error);
1310 }
1311
1312 int
1313 sodisconnectlocked(struct socket *so)
1314 {
1315 int error;
1316
1317 if ((so->so_state & SS_ISCONNECTED) == 0) {
1318 error = ENOTCONN;
1319 goto bad;
1320 }
1321 if (so->so_state & SS_ISDISCONNECTING) {
1322 error = EALREADY;
1323 goto bad;
1324 }
1325
1326 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1327
1328 if (error == 0) {
1329 sflt_notify(so, sock_evt_disconnected, NULL);
1330 }
1331 bad:
1332 return (error);
1333 }
1334
1335 /* Locking version */
1336 int
1337 sodisconnect(struct socket *so)
1338 {
1339 int error;
1340
1341 socket_lock(so, 1);
1342 error = sodisconnectlocked(so);
1343 socket_unlock(so, 1);
1344 return (error);
1345 }
1346
1347 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1348
1349 /*
1350 * sosendcheck will lock the socket buffer if it isn't locked and
1351 * verify that there is space for the data being inserted.
1352 *
1353 * Returns: 0 Success
1354 * EPIPE
1355 * sblock:EWOULDBLOCK
1356 * sblock:EINTR
1357 * sbwait:EBADF
1358 * sbwait:EINTR
1359 * [so_error]:???
1360 */
1361 static int
1362 sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
1363 int32_t atomic, int flags, int *sblocked)
1364 {
1365 int error = 0;
1366 int32_t space;
1367 int assumelock = 0;
1368
1369 restart:
1370 if (*sblocked == 0) {
1371 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1372 so->so_send_filt_thread != 0 &&
1373 so->so_send_filt_thread == current_thread()) {
1374 /*
1375 * We're being called recursively from a filter,
1376 * allow this to continue. Radar 4150520.
1377 * Don't set sblocked because we don't want
1378 * to perform an unlock later.
1379 */
1380 assumelock = 1;
1381 } else {
1382 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1383 if (error) {
1384 if (so->so_flags & SOF_DEFUNCT)
1385 goto defunct;
1386 return (error);
1387 }
1388 *sblocked = 1;
1389 }
1390 }
1391
1392 /*
1393 * If a send attempt is made on a socket that has been marked
1394 * as inactive (disconnected), reject the request.
1395 */
1396 if (so->so_flags & SOF_DEFUNCT) {
1397 defunct:
1398 error = EPIPE;
1399 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1400 proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
1401 error));
1402 return (error);
1403 }
1404
1405 if (so->so_state & SS_CANTSENDMORE)
1406 return (EPIPE);
1407
1408 if (so->so_error) {
1409 error = so->so_error;
1410 so->so_error = 0;
1411 return (error);
1412 }
1413
1414 if ((so->so_state & SS_ISCONNECTED) == 0) {
1415 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1416 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1417 !(resid == 0 && clen != 0))
1418 return (ENOTCONN);
1419 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1420 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1421 ENOTCONN : EDESTADDRREQ);
1422 }
1423 }
1424 space = sbspace(&so->so_snd);
1425 if (flags & MSG_OOB)
1426 space += 1024;
1427 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1428 clen > so->so_snd.sb_hiwat)
1429 return (EMSGSIZE);
1430 if ((space < resid + clen &&
1431 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1432 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1433 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1434 assumelock) {
1435 return (EWOULDBLOCK);
1436 }
1437 sbunlock(&so->so_snd, 1);
1438 *sblocked = 0;
1439 error = sbwait(&so->so_snd);
1440 if (error) {
1441 if (so->so_flags & SOF_DEFUNCT)
1442 goto defunct;
1443 return (error);
1444 }
1445 goto restart;
1446 }
1447
1448 return (0);
1449 }
1450
1451 /*
1452 * Send on a socket.
1453 * If send must go all at once and message is larger than
1454 * send buffering, then hard error.
1455 * Lock against other senders.
1456 * If must go all at once and not enough room now, then
1457 * inform user that this would block and do nothing.
1458 * Otherwise, if nonblocking, send as much as possible.
1459 * The data to be sent is described by "uio" if nonzero,
1460 * otherwise by the mbuf chain "top" (which must be null
1461 * if uio is not). Data provided in mbuf chain must be small
1462 * enough to send all at once.
1463 *
1464 * Returns nonzero on error, timeout or signal; callers
1465 * must check for short counts if EINTR/ERESTART are returned.
1466 * Data and control buffers are freed on return.
1467 * Experiment:
1468 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1469 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1470 * point at the mbuf chain being constructed and go from there.
1471 *
1472 * Returns: 0 Success
1473 * EOPNOTSUPP
1474 * EINVAL
1475 * ENOBUFS
1476 * uiomove:EFAULT
1477 * sosendcheck:EPIPE
1478 * sosendcheck:EWOULDBLOCK
1479 * sosendcheck:EINTR
1480 * sosendcheck:EBADF
1481 * sosendcheck:EINTR
1482 * sosendcheck:??? [value from so_error]
1483 * <pru_send>:ECONNRESET[TCP]
1484 * <pru_send>:EINVAL[TCP]
1485 * <pru_send>:ENOBUFS[TCP]
1486 * <pru_send>:EADDRINUSE[TCP]
1487 * <pru_send>:EADDRNOTAVAIL[TCP]
1488 * <pru_send>:EAFNOSUPPORT[TCP]
1489 * <pru_send>:EACCES[TCP]
1490 * <pru_send>:EAGAIN[TCP]
1491 * <pru_send>:EPERM[TCP]
1492 * <pru_send>:EMSGSIZE[TCP]
1493 * <pru_send>:EHOSTUNREACH[TCP]
1494 * <pru_send>:ENETUNREACH[TCP]
1495 * <pru_send>:ENETDOWN[TCP]
1496 * <pru_send>:ENOMEM[TCP]
1497 * <pru_send>:ENOBUFS[TCP]
1498 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1499 * <pru_send>:EINVAL[AF_UNIX]
1500 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1501 * <pru_send>:EPIPE[AF_UNIX]
1502 * <pru_send>:ENOTCONN[AF_UNIX]
1503 * <pru_send>:EISCONN[AF_UNIX]
1504 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1505 * <sf_data_out>:??? [whatever a filter author chooses]
1506 *
1507 * Notes: Other <pru_send> returns depend on the protocol family; all
1508 * <sf_data_out> returns depend on what the filter author causes
1509 * their filter to return.
1510 */
1511 int
1512 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1513 struct mbuf *top, struct mbuf *control, int flags)
1514 {
1515 struct mbuf **mp;
1516 register struct mbuf *m, *freelist = NULL;
1517 register int32_t space, len, resid;
1518 int clen = 0, error, dontroute, mlen, sendflags;
1519 int atomic = sosendallatonce(so) || top;
1520 int sblocked = 0;
1521 struct proc *p = current_proc();
1522
1523 if (uio) {
1524 // LP64todo - fix this!
1525 resid = uio_resid(uio);
1526 } else {
1527 resid = top->m_pkthdr.len;
1528 }
1529 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1530 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1531
1532 socket_lock(so, 1);
1533 so_update_last_owner_locked(so, p);
1534
1535 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1536 error = EOPNOTSUPP;
1537 socket_unlock(so, 1);
1538 goto out;
1539 }
1540
1541 /*
1542 * In theory resid should be unsigned.
1543 * However, space must be signed, as it might be less than 0
1544 * if we over-committed, and we must use a signed comparison
1545 * of space and resid. On the other hand, a negative resid
1546 * causes us to loop sending 0-length segments to the protocol.
1547 *
1548 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1549 * type sockets since that's an error.
1550 */
1551 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1552 error = EINVAL;
1553 socket_unlock(so, 1);
1554 goto out;
1555 }
1556
1557 dontroute =
1558 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1559 (so->so_proto->pr_flags & PR_ATOMIC);
1560 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1561 if (control)
1562 clen = control->m_len;
1563
1564 do {
1565 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1566 &sblocked);
1567 if (error) {
1568 goto release;
1569 }
1570 mp = &top;
1571 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
1572 1024 : 0);
1573
1574 do {
1575 if (uio == NULL) {
1576 /*
1577 * Data is prepackaged in "top".
1578 */
1579 resid = 0;
1580 if (flags & MSG_EOR)
1581 top->m_flags |= M_EOR;
1582 } else {
1583 int chainlength;
1584 int bytes_to_copy;
1585 boolean_t jumbocl;
1586
1587 bytes_to_copy = imin(resid, space);
1588
1589 if (sosendminchain > 0) {
1590 chainlength = 0;
1591 } else {
1592 chainlength = sosendmaxchain;
1593 }
1594
1595 /*
1596 * Attempt to use larger than system page-size
1597 * clusters for large writes only if there is
1598 * a jumbo cluster pool and if the socket is
1599 * marked accordingly.
1600 */
1601 jumbocl = sosendjcl && njcl > 0 &&
1602 ((so->so_flags & SOF_MULTIPAGES) ||
1603 sosendjcl_ignore_capab);
1604
1605 socket_unlock(so, 0);
1606
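/*
 * The loop below builds the mbuf chain for this pass using the
 * largest buffers available: 16KB jumbo clusters when permitted,
 * then 4KB big clusters, then 2KB clusters, and finally a plain
 * mbuf as the last resort.
 */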
1607 do {
1608 int num_needed;
1609 int hdrs_needed = (top == 0) ? 1 : 0;
1610
1611 /*
1612 * Try to maintain a local cache of mbuf
1613 * clusters needed to complete this
1614 * write; the list is further limited to
1615 * the number that are currently needed
1616 * to fill the socket.  This mechanism
1617 * allows a large number of mbufs/
1618 * clusters to be grabbed under a single
1619 * mbuf lock... if we can't get any
1620 * clusters, then fall back to trying
1621 * for mbufs.  If we fail early (or
1622 * miscalculate the number needed), make
1623 * sure to release any clusters we
1624 * haven't yet consumed.
1625 */
1626 if (freelist == NULL &&
1627 bytes_to_copy > MBIGCLBYTES &&
1628 jumbocl) {
1629 num_needed =
1630 bytes_to_copy / M16KCLBYTES;
1631
1632 if ((bytes_to_copy -
1633 (num_needed * M16KCLBYTES))
1634 >= MINCLSIZE)
1635 num_needed++;
1636
1637 freelist =
1638 m_getpackets_internal(
1639 (unsigned int *)&num_needed,
1640 hdrs_needed, M_WAIT, 0,
1641 M16KCLBYTES);
1642 /*
1643 * Fall back to 4K cluster size
1644 * if allocation failed
1645 */
1646 }
1647
1648 if (freelist == NULL &&
1649 bytes_to_copy > MCLBYTES) {
1650 num_needed =
1651 bytes_to_copy / MBIGCLBYTES;
1652
1653 if ((bytes_to_copy -
1654 (num_needed * MBIGCLBYTES)) >=
1655 MINCLSIZE)
1656 num_needed++;
1657
1658 freelist =
1659 m_getpackets_internal(
1660 (unsigned int *)&num_needed,
1661 hdrs_needed, M_WAIT, 0,
1662 MBIGCLBYTES);
1663 /*
1664 * Fall back to cluster size
1665 * if allocation failed
1666 */
1667 }
1668
1669 if (freelist == NULL &&
1670 bytes_to_copy > MINCLSIZE) {
1671 num_needed =
1672 bytes_to_copy / MCLBYTES;
1673
1674 if ((bytes_to_copy -
1675 (num_needed * MCLBYTES)) >=
1676 MINCLSIZE)
1677 num_needed++;
1678
1679 freelist =
1680 m_getpackets_internal(
1681 (unsigned int *)&num_needed,
1682 hdrs_needed, M_WAIT, 0,
1683 MCLBYTES);
1684 /*
1685 * Fall back to a single mbuf
1686 * if allocation failed
1687 */
1688 }
1689
1690 if (freelist == NULL) {
1691 if (top == 0)
1692 MGETHDR(freelist,
1693 M_WAIT, MT_DATA);
1694 else
1695 MGET(freelist,
1696 M_WAIT, MT_DATA);
1697
1698 if (freelist == NULL) {
1699 error = ENOBUFS;
1700 socket_lock(so, 0);
1701 goto release;
1702 }
1703 /*
1704 * For datagram protocols,
1705 * leave room for protocol
1706 * headers in first mbuf.
1707 */
1708 if (atomic && top == 0 &&
1709 bytes_to_copy < MHLEN) {
1710 MH_ALIGN(freelist,
1711 bytes_to_copy);
1712 }
1713 }
1714 m = freelist;
1715 freelist = m->m_next;
1716 m->m_next = NULL;
1717
1718 if ((m->m_flags & M_EXT))
1719 mlen = m->m_ext.ext_size;
1720 else if ((m->m_flags & M_PKTHDR))
1721 mlen =
1722 MHLEN - m_leadingspace(m);
1723 else
1724 mlen = MLEN;
1725 len = imin(mlen, bytes_to_copy);
1726
1727 chainlength += len;
1728
1729 space -= len;
1730
1731 error = uiomove(mtod(m, caddr_t),
1732 len, uio);
1733
1734 resid = uio_resid(uio);
1735
1736 m->m_len = len;
1737 *mp = m;
1738 top->m_pkthdr.len += len;
1739 if (error)
1740 break;
1741 mp = &m->m_next;
1742 if (resid <= 0) {
1743 if (flags & MSG_EOR)
1744 top->m_flags |= M_EOR;
1745 break;
1746 }
1747 bytes_to_copy = min(resid, space);
1748
1749 } while (space > 0 &&
1750 (chainlength < sosendmaxchain || atomic ||
1751 resid < MINCLSIZE));
1752
1753 socket_lock(so, 0);
1754
1755 if (error)
1756 goto release;
1757 }
1758
1759 if (flags & (MSG_HOLD|MSG_SEND)) {
1760 /* Enqueue for later, go away if HOLD */
1761 register struct mbuf *mb1;
1762 if (so->so_temp && (flags & MSG_FLUSH)) {
1763 m_freem(so->so_temp);
1764 so->so_temp = NULL;
1765 }
1766 if (so->so_temp)
1767 so->so_tail->m_next = top;
1768 else
1769 so->so_temp = top;
1770 mb1 = top;
1771 while (mb1->m_next)
1772 mb1 = mb1->m_next;
1773 so->so_tail = mb1;
1774 if (flags & MSG_HOLD) {
1775 top = NULL;
1776 goto release;
1777 }
1778 top = so->so_temp;
1779 }
1780 if (dontroute)
1781 so->so_options |= SO_DONTROUTE;
1782
1783 /* Compute flags here, for pru_send and NKEs */
1784 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1785 /*
1786 * If the user set MSG_EOF, the protocol
1787 * understands this flag, and there is nothing left to
1788 * send, then use PRU_SEND_EOF instead of PRU_SEND.
1789 */
1790 ((flags & MSG_EOF) &&
1791 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1792 (resid <= 0)) ?
1793 PRUS_EOF :
1794 /* If there is more to send set PRUS_MORETOCOME */
1795 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1796
1797 /*
1798 * Socket filter processing
1799 */
1800 error = sflt_data_out(so, addr, &top, &control,
1801 (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0);
1802 if (error) {
1803 if (error == EJUSTRETURN) {
1804 error = 0;
1805 clen = 0;
1806 control = 0;
1807 top = 0;
1808 }
1809
1810 goto release;
1811 }
1812 /*
1813 * End Socket filter processing
1814 */
1815
1816 error = (*so->so_proto->pr_usrreqs->pru_send)
1817 (so, sendflags, top, addr, control, p);
1818 #ifdef __APPLE__
1819 if (flags & MSG_SEND)
1820 so->so_temp = NULL;
1821 #endif
1822 if (dontroute)
1823 so->so_options &= ~SO_DONTROUTE;
1824
1825 clen = 0;
1826 control = 0;
1827 top = 0;
1828 mp = &top;
1829 if (error)
1830 goto release;
1831 } while (resid && space > 0);
1832 } while (resid);
1833
1834 release:
1835 if (sblocked)
1836 sbunlock(&so->so_snd, 0); /* will unlock socket */
1837 else
1838 socket_unlock(so, 1);
1839 out:
1840 if (top)
1841 m_freem(top);
1842 if (control)
1843 m_freem(control);
1844 if (freelist)
1845 m_freem_list(freelist);
1846
1847 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
1848 space, error);
1849
1850 return (error);
1851 }
1852
1853 /*
1854 * Implement receive operations on a socket.
1855 * We depend on the way that records are added to the sockbuf
1856 * by sbappend*. In particular, each record (mbufs linked through m_next)
1857 * must begin with an address if the protocol so specifies,
1858 * followed by an optional mbuf or mbufs containing ancillary data,
1859 * and then zero or more mbufs of data.
1860 * In order to avoid blocking network interrupts for the entire time here,
1861 * we splx() while doing the actual copy to user space.
1862 * Although the sockbuf is locked, new data may still be appended,
1863 * and thus we must maintain consistency of the sockbuf during that time.
1864 *
1865 * The caller may receive the data as a single mbuf chain by supplying
1866 * an mbuf **mp0 for use in returning the chain. The uio is then used
1867 * only for the count in uio_resid.
1868 *
1869 * Returns: 0 Success
1870 * ENOBUFS
1871 * ENOTCONN
1872 * EWOULDBLOCK
1873 * uiomove:EFAULT
1874 * sblock:EWOULDBLOCK
1875 * sblock:EINTR
1876 * sbwait:EBADF
1877 * sbwait:EINTR
1878 * sodelayed_copy:EFAULT
1879 * <pru_rcvoob>:EINVAL[TCP]
1880 * <pru_rcvoob>:EWOULDBLOCK[TCP]
1881 * <pru_rcvoob>:???
1882 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1883 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1884 * <pr_domain->dom_externalize>:???
1885 *
1886 * Notes: Additional return values from calls through <pru_rcvoob> and
1887 * <pr_domain->dom_externalize> depend on protocols other than
1888 * TCP or AF_UNIX, which are documented above.
1889 */
1890 int
1891 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1892 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1893 {
1894 register struct mbuf *m, **mp, *ml = NULL;
1895 register int flags, len, error, offset;
1896 struct protosw *pr = so->so_proto;
1897 struct mbuf *nextrecord;
1898 int moff, type = 0;
1899 int orig_resid = uio_resid(uio);
1900 struct mbuf *free_list;
1901 int delayed_copy_len;
1902 int can_delay;
1903 int need_event;
1904 struct proc *p = current_proc();
1905
1906 // LP64todo - fix this!
1907 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1908 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1909
1910 socket_lock(so, 1);
1911 so_update_last_owner_locked(so, p);
1912
1913 #ifdef MORE_LOCKING_DEBUG
1914 if (so->so_usecount == 1)
1915 panic("soreceive: so=%x no other reference on socket\n", so);
1916 #endif
1917 mp = mp0;
1918 if (psa)
1919 *psa = 0;
1920 if (controlp)
1921 *controlp = 0;
1922 if (flagsp)
1923 flags = *flagsp &~ MSG_EOR;
1924 else
1925 flags = 0;
1926
1927 /*
1928 * If a recv attempt is made on a previously-accepted socket
1929 * that has been marked as inactive (disconnected), reject
1930 * the request.
1931 */
1932 if (so->so_flags & SOF_DEFUNCT) {
1933 struct sockbuf *sb = &so->so_rcv;
1934
1935 error = ENOTCONN;
1936 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1937 proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error));
1938 /*
1939 * This socket should have been disconnected and flushed
1940 * prior to being returned from sodefunct(); there should
1941 * be no data on its receive list, so panic otherwise.
1942 */
1943 if (so->so_state & SS_DEFUNCT)
1944 sb_empty_assert(sb, __func__);
1945 socket_unlock(so, 1);
1946 return (error);
1947 }
1948
1949 /*
1950 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1951 * regardless of the flags argument. Here is the case where
1952 * out-of-band data is not inline.
1953 */
1954 if ((flags & MSG_OOB) ||
1955 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1956 (so->so_options & SO_OOBINLINE) == 0 &&
1957 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1958 m = m_get(M_WAIT, MT_DATA);
1959 if (m == NULL) {
1960 socket_unlock(so, 1);
1961 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1962 ENOBUFS, 0, 0, 0, 0);
1963 return (ENOBUFS);
1964 }
1965 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1966 if (error)
1967 goto bad;
1968 socket_unlock(so, 0);
1969 do {
1970 error = uiomove(mtod(m, caddr_t),
1971 imin(uio_resid(uio), m->m_len), uio);
1972 m = m_free(m);
1973 } while (uio_resid(uio) && error == 0 && m);
1974 socket_lock(so, 0);
1975 bad:
1976 if (m)
1977 m_freem(m);
1978 #ifdef __APPLE__
1979 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1980 if (error == EWOULDBLOCK || error == EINVAL) {
1981 /*
1982 * Let's try to get normal data:
1983 * EWOULDBLOCK: out-of-band data not
1984 * received yet. EINVAL: out-of-band data
1985 * already read.
1986 */
1987 error = 0;
1988 goto nooob;
1989 } else if (error == 0 && flagsp) {
1990 *flagsp |= MSG_OOB;
1991 }
1992 }
1993 socket_unlock(so, 1);
1994 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
1995 0, 0, 0, 0);
1996 #endif
1997 return (error);
1998 }
1999 nooob:
2000 if (mp)
2001 *mp = (struct mbuf *)0;
2002 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2003 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2004
2005
2006 free_list = (struct mbuf *)0;
2007 delayed_copy_len = 0;
2008 restart:
2009 #ifdef MORE_LOCKING_DEBUG
2010 if (so->so_usecount <= 1)
2011 printf("soreceive: sblock so=%p ref=%d on socket\n",
2012 so, so->so_usecount);
2013 #endif
2014 /*
2015 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2016 * and if so just return to the caller. This could happen when
2017 * soreceive() is called by a socket upcall function during the
2018 * time the socket is freed. The socket buffer would have been
2019 * locked across the upcall; therefore we cannot put this thread
2020 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2021 * we may livelock), because the lock on the socket buffer will
2022 * only be released when the upcall routine returns to its caller.
2023 * Because the socket has been officially closed, there can be
2024 * no further read on it.
2025 */
2026 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2027 (SS_NOFDREF | SS_CANTRCVMORE)) {
2028 socket_unlock(so, 1);
2029 return (0);
2030 }
2031
2032 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2033 if (error) {
2034 socket_unlock(so, 1);
2035 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2036 0, 0, 0, 0);
2037 return (error);
2038 }
2039
2040 m = so->so_rcv.sb_mb;
2041 /*
2042 * If we have less data than requested, block awaiting more
2043 * (subject to any timeout) if:
2044 * 1. MSG_DONTWAIT is not set, and either
2045 * 2. the current count is less than the low water mark, or
2046 * 3. MSG_WAITALL is set, and it is possible to do the entire
2047 * receive operation at once if we block (resid <= hiwat).
2048 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2049 * we have to do the receive in sections, and thus risk returning
2050 * a short count if a timeout or signal occurs after we start.
2051 */
2052 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2053 so->so_rcv.sb_cc < uio_resid(uio)) &&
2054 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2055 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2056 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2057 /*
2058 * Panic if we notice inconsistencies in the socket's
2059 * receive list; both sb_mb and sb_cc should correctly
2060 * reflect the contents of the list, otherwise we may
2061 * end up with false positives during select() or poll()
2062 * which could put the application in a bad state.
2063 */
2064 SB_MB_CHECK(&so->so_rcv);
2065
2066 if (so->so_error) {
2067 if (m)
2068 goto dontblock;
2069 error = so->so_error;
2070 if ((flags & MSG_PEEK) == 0)
2071 so->so_error = 0;
2072 goto release;
2073 }
2074 if (so->so_state & SS_CANTRCVMORE) {
2075 if (m)
2076 goto dontblock;
2077 else
2078 goto release;
2079 }
2080 for (; m; m = m->m_next)
2081 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2082 m = so->so_rcv.sb_mb;
2083 goto dontblock;
2084 }
2085 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2086 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2087 error = ENOTCONN;
2088 goto release;
2089 }
2090 if (uio_resid(uio) == 0)
2091 goto release;
2092 if ((so->so_state & SS_NBIO) ||
2093 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2094 error = EWOULDBLOCK;
2095 goto release;
2096 }
2097 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2098 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2099 sbunlock(&so->so_rcv, 1);
2100 #if EVEN_MORE_LOCKING_DEBUG
2101 if (socket_debug)
2102 printf("Waiting for socket data\n");
2103 #endif
2104
2105 error = sbwait(&so->so_rcv);
2106 #if EVEN_MORE_LOCKING_DEBUG
2107 if (socket_debug)
2108 printf("SORECEIVE - sbwait returned %d\n", error);
2109 #endif
2110 if (so->so_usecount < 1)
2111 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2112 "socket\n", so, so->so_usecount);
2113 if (error) {
2114 socket_unlock(so, 1);
2115 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2116 0, 0, 0, 0);
2117 return (error);
2118 }
2119 goto restart;
2120 }
2121 dontblock:
2122 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2123 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2124 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2125 nextrecord = m->m_nextpkt;
2126 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2127 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2128 #if CONFIG_MACF_SOCKET_SUBSET
2129 /*
2130 * Call the MAC framework for policy checking if we're in
2131 * the user process context and the socket isn't connected.
2132 */
2133 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2134 struct mbuf *m0 = m;
2135 /*
2136 * Dequeue this record (temporarily) from the receive
2137 * list since we're about to drop the socket's lock
2138 * where a new record may arrive and be appended to
2139 * the list. Upon MAC policy failure, the record
2140 * will be freed. Otherwise, we'll add it back to
2141 * the head of the list. We cannot rely on SB_LOCK
2142 * because the append operation uses the socket's lock.
2143 */
2144 do {
2145 m->m_nextpkt = NULL;
2146 sbfree(&so->so_rcv, m);
2147 m = m->m_next;
2148 } while (m != NULL);
2149 m = m0;
2150 so->so_rcv.sb_mb = nextrecord;
2151 SB_EMPTY_FIXUP(&so->so_rcv);
2152 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2153 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2154 socket_unlock(so, 0);
2155 if (mac_socket_check_received(proc_ucred(p), so,
2156 mtod(m, struct sockaddr *)) != 0) {
2157 /*
2158 * MAC policy failure; free this record and
2159 * process the next record (or block until
2160 * one is available). We have adjusted sb_cc
2161 * and sb_mbcnt above so there is no need to
2162 * call sbfree() again.
2163 */
2164 do {
2165 m = m_free(m);
2166 } while (m != NULL);
2167 /*
2168 * Clear SB_LOCK but don't unlock the socket.
2169 * Process the next record or wait for one.
2170 */
2171 socket_lock(so, 0);
2172 sbunlock(&so->so_rcv, 1);
2173 goto restart;
2174 }
2175 socket_lock(so, 0);
2176 /*
2177 * If the socket has been defunct'd, drop it.
2178 */
2179 if (so->so_flags & SOF_DEFUNCT) {
2180 m_freem(m);
2181 error = ENOTCONN;
2182 goto release;
2183 }
2184 /*
2185 * Re-adjust the socket receive list and re-enqueue
2186 * the record in front of any packets which may have
2187 * been appended while we dropped the lock.
2188 */
2189 for (m = m0; m->m_next != NULL; m = m->m_next)
2190 sballoc(&so->so_rcv, m);
2191 sballoc(&so->so_rcv, m);
2192 if (so->so_rcv.sb_mb == NULL) {
2193 so->so_rcv.sb_lastrecord = m0;
2194 so->so_rcv.sb_mbtail = m;
2195 }
2196 m = m0;
2197 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2198 so->so_rcv.sb_mb = m;
2199 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2200 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2201 }
2202 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2203 orig_resid = 0;
2204 if (psa) {
2205 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2206 mp0 == 0);
2207 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2208 error = EWOULDBLOCK;
2209 goto release;
2210 }
2211 }
2212 if (flags & MSG_PEEK) {
2213 m = m->m_next;
2214 } else {
2215 sbfree(&so->so_rcv, m);
2216 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2217 panic("soreceive: about to create invalid "
2218 "socketbuf");
2219 MFREE(m, so->so_rcv.sb_mb);
2220 m = so->so_rcv.sb_mb;
2221 if (m != NULL) {
2222 m->m_nextpkt = nextrecord;
2223 } else {
2224 so->so_rcv.sb_mb = nextrecord;
2225 SB_EMPTY_FIXUP(&so->so_rcv);
2226 }
2227 }
2228 }
2229
2230 /*
2231 * Process one or more MT_CONTROL mbufs present before any data mbufs
2232 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2233 * just copy the data; if !MSG_PEEK, we call into the protocol to
2234 * perform externalization.
2235 */
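/*
 * User-space sketch of the non-MSG_PEEK case for AF_UNIX descriptor
 * passing (the SCM_RIGHTS externalization performed below); `s` is a
 * hypothetical AF_UNIX socket and the iovec is omitted for brevity:
 *
 *	char cbuf[CMSG_SPACE(sizeof (int))];
 *	struct msghdr msg;
 *	int fd;
 *
 *	bzero(&msg, sizeof (msg));
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0) {
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *		if (cmsg != NULL && cmsg->cmsg_type == SCM_RIGHTS)
 *			fd = *(int *)CMSG_DATA(cmsg);
 *	}
 */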
2236 if (m != NULL && m->m_type == MT_CONTROL) {
2237 struct mbuf *cm = NULL, *cmn;
2238 struct mbuf **cme = &cm;
2239 struct sockbuf *sb_rcv = &so->so_rcv;
2240 struct mbuf **msgpcm = NULL;
2241
2242 /*
2243 * Externalizing the control messages would require us to
2244 * drop the socket's lock below. Once we re-acquire the
2245 * lock, the mbuf chain might change. In order to preserve
2246 * consistency, we unlink all control messages from the
2247 * first mbuf chain in one shot and link them separately
2248 * onto a different chain.
2249 */
2250 do {
2251 if (flags & MSG_PEEK) {
2252 if (controlp != NULL) {
2253 if (*controlp == NULL) {
2254 msgpcm = controlp;
2255 }
2256 *controlp = m_copy(m, 0, m->m_len);
2257
2258 /* If we failed to allocate an mbuf,
2259 * release any previously allocated
2260 * mbufs for control data. Return
2261 * an error. Keep the mbufs in the
2262 * socket as this is using
2263 * MSG_PEEK flag.
2264 */
2265 if (*controlp == NULL) {
2266 m_freem(*msgpcm);
2267 error = ENOBUFS;
2268 goto release;
2269 }
2270 controlp = &(*controlp)->m_next;
2271 }
2272 m = m->m_next;
2273 } else {
2274 m->m_nextpkt = NULL;
2275 sbfree(sb_rcv, m);
2276 sb_rcv->sb_mb = m->m_next;
2277 m->m_next = NULL;
2278 *cme = m;
2279 cme = &(*cme)->m_next;
2280 m = sb_rcv->sb_mb;
2281 }
2282 } while (m != NULL && m->m_type == MT_CONTROL);
2283
2284 if (!(flags & MSG_PEEK)) {
2285 if (sb_rcv->sb_mb != NULL) {
2286 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2287 } else {
2288 sb_rcv->sb_mb = nextrecord;
2289 SB_EMPTY_FIXUP(sb_rcv);
2290 }
2291 if (nextrecord == NULL)
2292 sb_rcv->sb_lastrecord = m;
2293 }
2294
2295 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2296 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2297
2298 while (cm != NULL) {
2299 int cmsg_type;
2300
2301 cmn = cm->m_next;
2302 cm->m_next = NULL;
2303 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2304
2305 /*
2306 * Call the protocol to externalize SCM_RIGHTS message
2307 * and return the modified message to the caller upon
2308 * success. Otherwise, all other control messages are
2309 * returned unmodified to the caller. Note that we
2310 * only get into this loop if MSG_PEEK is not set.
2311 */
2312 if (pr->pr_domain->dom_externalize != NULL &&
2313 cmsg_type == SCM_RIGHTS) {
2314 /*
2315 * Release socket lock: see 3903171. This
2316 * would also allow more records to be appended
2317 * to the socket buffer. We still have SB_LOCK
2318 * set on it, so we can be sure that the head
2319 * of the mbuf chain won't change.
2320 */
2321 socket_unlock(so, 0);
2322 error = (*pr->pr_domain->dom_externalize)(cm);
2323 socket_lock(so, 0);
2324 } else {
2325 error = 0;
2326 }
2327
2328 if (controlp != NULL && error == 0) {
2329 *controlp = cm;
2330 controlp = &(*controlp)->m_next;
2331 orig_resid = 0;
2332 } else {
2333 (void) m_free(cm);
2334 }
2335 cm = cmn;
2336 }
2337 /*
2338 * Update the value of nextrecord in case we received new
2339 * records when the socket was unlocked above for
2340 * externalizing SCM_RIGHTS.
2341 */
2342 if (m != NULL)
2343 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2344 else
2345 nextrecord = sb_rcv->sb_mb;
2346 orig_resid = 0;
2347 }
2348
2349 if (m != NULL) {
2350 if (!(flags & MSG_PEEK)) {
2351 /*
2352 * We get here because m points to an mbuf following
2353 * any MT_SONAME or MT_CONTROL mbufs which have been
2354 * processed above. In any case, m should be pointing
2355 * to the head of the mbuf chain, and the nextrecord
2356 * should be either NULL or equal to m->m_nextpkt.
2357 * See comments above about SB_LOCK.
2358 */
2359 if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2360 panic("soreceive: post-control !sync so=%p "
2361 "m=%p nextrecord=%p\n", so, m, nextrecord);
2362
2363 if (nextrecord == NULL)
2364 so->so_rcv.sb_lastrecord = m;
2365 }
2366 type = m->m_type;
2367 if (type == MT_OOBDATA)
2368 flags |= MSG_OOB;
2369 } else {
2370 if (!(flags & MSG_PEEK)) {
2371 SB_EMPTY_FIXUP(&so->so_rcv);
2372 }
2373 }
2374 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2375 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2376
2377 moff = 0;
2378 offset = 0;
2379
2380 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2381 can_delay = 1;
2382 else
2383 can_delay = 0;
2384
2385 need_event = 0;
2386
2387 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2388 if (m->m_type == MT_OOBDATA) {
2389 if (type != MT_OOBDATA)
2390 break;
2391 } else if (type == MT_OOBDATA) {
2392 break;
2393 }
2394 /*
2395 * Make sure to always set the MSG_OOB flag when getting
2396 * out-of-band data inline.
2397 */
2398 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2399 (so->so_options & SO_OOBINLINE) != 0 &&
2400 (so->so_state & SS_RCVATMARK) != 0) {
2401 flags |= MSG_OOB;
2402 }
2403 so->so_state &= ~SS_RCVATMARK;
2404 len = uio_resid(uio) - delayed_copy_len;
2405 if (so->so_oobmark && len > so->so_oobmark - offset)
2406 len = so->so_oobmark - offset;
2407 if (len > m->m_len - moff)
2408 len = m->m_len - moff;
2409 /*
2410 * If mp is set, just pass back the mbufs.
2411 * Otherwise copy them out via the uio, then free.
2412 * Sockbuf must be consistent here (sb_mb points to the current
2413 * mbuf, which in turn points to the next record) when we drop priority;
2414 * we must note any additions to the sockbuf when we
2415 * block interrupts again.
2416 */
2417 if (mp == 0) {
2418 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2419 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2420 if (can_delay && len == m->m_len) {
2421 /*
2422 * Only delay the copy if we're consuming the
2423 * mbuf and we're NOT in MSG_PEEK mode
2424 * and we have enough data to make it worthwhile
2425 * to drop and retake the lock... can_delay
2426 * reflects the state of the two latter
2427 * constraints; moff should always be zero
2428 * in these cases.
2429 */
2430 delayed_copy_len += len;
2431 } else {
2432 if (delayed_copy_len) {
2433 error = sodelayed_copy(so, uio,
2434 &free_list, &delayed_copy_len);
2435
2436 if (error) {
2437 goto release;
2438 }
2439 /*
2440 * We can only get here if MSG_PEEK is not
2441 * set; therefore, m should point at the
2442 * head of the rcv queue. If it doesn't,
2443 * something drastically changed while we
2444 * were out from behind the lock in
2445 * sodelayed_copy, perhaps a RST on the
2446 * stream. In any event, the stream has
2447 * been interrupted; it's probably best
2448 * just to return whatever data we've
2449 * moved and let the caller sort it
2450 * out...
2451 */
2452 if (m != so->so_rcv.sb_mb) {
2453 break;
2454 }
2455 }
2456 socket_unlock(so, 0);
2457 error = uiomove(mtod(m, caddr_t) + moff,
2458 (int)len, uio);
2459 socket_lock(so, 0);
2460
2461 if (error)
2462 goto release;
2463 }
2464 } else {
2465 uio_setresid(uio, (uio_resid(uio) - len));
2466 }
2467 if (len == m->m_len - moff) {
2468 if (m->m_flags & M_EOR)
2469 flags |= MSG_EOR;
2470 if (flags & MSG_PEEK) {
2471 m = m->m_next;
2472 moff = 0;
2473 } else {
2474 nextrecord = m->m_nextpkt;
2475 sbfree(&so->so_rcv, m);
2476 m->m_nextpkt = NULL;
2477
2478 if (mp) {
2479 *mp = m;
2480 mp = &m->m_next;
2481 so->so_rcv.sb_mb = m = m->m_next;
2482 *mp = (struct mbuf *)0;
2483 } else {
2484 if (free_list == NULL)
2485 free_list = m;
2486 else
2487 ml->m_next = m;
2488 ml = m;
2489 so->so_rcv.sb_mb = m = m->m_next;
2490 ml->m_next = 0;
2491 }
2492 if (m != NULL) {
2493 m->m_nextpkt = nextrecord;
2494 if (nextrecord == NULL)
2495 so->so_rcv.sb_lastrecord = m;
2496 } else {
2497 so->so_rcv.sb_mb = nextrecord;
2498 SB_EMPTY_FIXUP(&so->so_rcv);
2499 }
2500 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2501 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2502 }
2503 } else {
2504 if (flags & MSG_PEEK) {
2505 moff += len;
2506 } else {
2507 if (mp != NULL) {
2508 int copy_flag;
2509
2510 if (flags & MSG_DONTWAIT)
2511 copy_flag = M_DONTWAIT;
2512 else
2513 copy_flag = M_WAIT;
2514 *mp = m_copym(m, 0, len, copy_flag);
2515 if (*mp == NULL) {
2516 /*
2517 * Failed to allocate an mbuf.
2518 * Adjust uio_resid back, it was
2519 * adjusted down by len bytes which
2520 * we didn't copy over
2521 */
2522 uio_setresid(uio, (uio_resid(uio) + len));
2523 break;
2524 }
2525 }
2526 m->m_data += len;
2527 m->m_len -= len;
2528 so->so_rcv.sb_cc -= len;
2529 }
2530 }
2531 if (so->so_oobmark) {
2532 if ((flags & MSG_PEEK) == 0) {
2533 so->so_oobmark -= len;
2534 if (so->so_oobmark == 0) {
2535 so->so_state |= SS_RCVATMARK;
2536 /*
2537 * delay posting the actual event until
2538 * after any delayed copy processing
2539 * has finished
2540 */
2541 need_event = 1;
2542 break;
2543 }
2544 } else {
2545 offset += len;
2546 if (offset == so->so_oobmark)
2547 break;
2548 }
2549 }
2550 if (flags & MSG_EOR)
2551 break;
2552 /*
2553 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2554 * (for a non-atomic socket), we must not quit until
2555 * "uio->uio_resid == 0" or an error termination.
2556 * If a signal/timeout occurs, return with a short
2557 * count but without error. Keep sockbuf locked
2558 * against other readers.
2559 */
2560 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2561 (uio_resid(uio) - delayed_copy_len) > 0 &&
2562 !sosendallatonce(so) && !nextrecord) {
2563 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2564 goto release;
2565
2566 /*
2567 * Depending on the protocol (e.g. TCP), the following
2568 * might cause the socket lock to be dropped and later
2569 * be reacquired, and more data could have arrived and
2570 * have been appended to the receive socket buffer by
2571 * the time it returns. Therefore, we sleep in
2572 * sbwait() below if and only if the socket buffer is
2573 * empty, in order to avoid a false sleep.
2574 */
2575 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2576 (((struct inpcb *)so->so_pcb)->inp_state !=
2577 INPCB_STATE_DEAD))
2578 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2579
2580 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2581 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2582
2583 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2584 error = 0;
2585 goto release;
2586 }
2587 /*
2588 * We have to wait until after we get back from the sbwait
2589 * to do the copy, because we will drop the lock if we
2590 * have enough data that has been delayed. By dropping
2591 * the lock we open up a window allowing the netisr
2592 * thread to process the incoming packets and to change
2593 * the state of this socket. We're issuing the sbwait
2594 * because the socket is empty and we're expecting the
2595 * netisr thread to wake us up when more packets arrive;
2596 * if we allow that processing to happen and then sbwait,
2597 * we could stall forever with packets sitting in the
2598 * socket if no further packets arrive from the remote
2599 * side.
2600 *
2601 * We want to copy before we've collected all the data
2602 * to satisfy this request, to allow the copy to overlap
2603 * the incoming packet processing on an MP system.
2604 */
2605 if (delayed_copy_len > sorecvmincopy &&
2606 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2607 error = sodelayed_copy(so, uio,
2608 &free_list, &delayed_copy_len);
2609
2610 if (error)
2611 goto release;
2612 }
2613 m = so->so_rcv.sb_mb;
2614 if (m) {
2615 nextrecord = m->m_nextpkt;
2616 }
2617 SB_MB_CHECK(&so->so_rcv);
2618 }
2619 }
2620 #ifdef MORE_LOCKING_DEBUG
2621 if (so->so_usecount <= 1)
2622 panic("soreceive: after big while so=%p ref=%d on socket\n",
2623 so, so->so_usecount);
2624 #endif
2625
2626 if (m && pr->pr_flags & PR_ATOMIC) {
2627 #ifdef __APPLE__
2628 if (so->so_options & SO_DONTTRUNC) {
2629 flags |= MSG_RCVMORE;
2630 } else {
2631 #endif
2632 flags |= MSG_TRUNC;
2633 if ((flags & MSG_PEEK) == 0)
2634 (void) sbdroprecord(&so->so_rcv);
2635 #ifdef __APPLE__
2636 }
2637 #endif
2638 }
2639
2640 /*
2641 * pru_rcvd below (for TCP) may cause more data to be received
2642 * if the socket lock is dropped prior to sending the ACK; some
2643 * legacy OpenTransport applications don't handle this well
2644 * (if they receive less data than requested while MSG_HAVEMORE
2645 * is set), and so we set the flag now based on what we know
2646 * prior to calling pru_rcvd.
2647 */
2648 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2649 flags |= MSG_HAVEMORE;
2650
2651 if ((flags & MSG_PEEK) == 0) {
2652 if (m == 0) {
2653 so->so_rcv.sb_mb = nextrecord;
2654 /*
2655 * First part is an inline SB_EMPTY_FIXUP(). Second
2656 * part makes sure sb_lastrecord is up-to-date if
2657 * there is still data in the socket buffer.
2658 */
2659 if (so->so_rcv.sb_mb == NULL) {
2660 so->so_rcv.sb_mbtail = NULL;
2661 so->so_rcv.sb_lastrecord = NULL;
2662 } else if (nextrecord->m_nextpkt == NULL) {
2663 so->so_rcv.sb_lastrecord = nextrecord;
2664 }
2665 SB_MB_CHECK(&so->so_rcv);
2666 }
2667 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2668 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2669 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2670 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2671 }
2672 #ifdef __APPLE__
2673 if (delayed_copy_len) {
2674 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2675
2676 if (error)
2677 goto release;
2678 }
2679 if (free_list) {
2680 m_freem_list((struct mbuf *)free_list);
2681 free_list = (struct mbuf *)0;
2682 }
2683 if (need_event)
2684 postevent(so, 0, EV_OOB);
2685 #endif
2686 if (orig_resid == uio_resid(uio) && orig_resid &&
2687 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2688 sbunlock(&so->so_rcv, 1);
2689 goto restart;
2690 }
2691
2692 if (flagsp)
2693 *flagsp |= flags;
2694 release:
2695 #ifdef MORE_LOCKING_DEBUG
2696 if (so->so_usecount <= 1)
2697 panic("soreceive: release so=%p ref=%d on socket\n",
2698 so, so->so_usecount);
2699 #endif
2700 if (delayed_copy_len) {
2701 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2702 }
2703 if (free_list) {
2704 m_freem_list((struct mbuf *)free_list);
2705 }
2706 sbunlock(&so->so_rcv, 0); /* will unlock socket */
2707
2708 // LP64todo - fix this!
2709 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2710 so->so_rcv.sb_cc, 0, error);
2711
2712 return (error);
2713 }
2714
2715 /*
2716 * Returns: 0 Success
2717 * uiomove:EFAULT
2718 */
2719 static int
2720 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2721 int *resid)
2722 {
2723 int error = 0;
2724 struct mbuf *m;
2725
2726 m = *free_list;
2727
2728 socket_unlock(so, 0);
2729
2730 while (m && error == 0) {
2731
2732 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2733
2734 m = m->m_next;
2735 }
2736 m_freem_list(*free_list);
2737
2738 *free_list = (struct mbuf *)NULL;
2739 *resid = 0;
2740
2741 socket_lock(so, 0);
2742
2743 return (error);
2744 }
2745
2746
2747 /*
2748 * Returns: 0 Success
2749 * EINVAL
2750 * ENOTCONN
2751 * <pru_shutdown>:EINVAL
2752 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2753 * <pru_shutdown>:ENOBUFS[TCP]
2754 * <pru_shutdown>:EMSGSIZE[TCP]
2755 * <pru_shutdown>:EHOSTUNREACH[TCP]
2756 * <pru_shutdown>:ENETUNREACH[TCP]
2757 * <pru_shutdown>:ENETDOWN[TCP]
2758 * <pru_shutdown>:ENOMEM[TCP]
2759 * <pru_shutdown>:EACCES[TCP]
2760 * <pru_shutdown>:EMSGSIZE[TCP]
2761 * <pru_shutdown>:ENOBUFS[TCP]
2762 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2763 * <pru_shutdown>:??? [other protocol families]
2764 */
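/*
 * Minimal user-space sketch (`s` is a hypothetical connected socket):
 *
 *	if (shutdown(s, SHUT_WR) == -1 && errno == ENOTCONN)
 *		perror("shutdown");
 *
 * SHUT_RD flushes pending receive data via sorflush(), SHUT_WR invokes
 * the protocol's write-side close through <pru_shutdown> (e.g. a TCP
 * FIN), SHUT_RDWR does both, and any other value yields EINVAL.
 */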
2765 int
2766 soshutdown(struct socket *so, int how)
2767 {
2768 int error;
2769
2770 switch (how) {
2771 case SHUT_RD:
2772 case SHUT_WR:
2773 case SHUT_RDWR:
2774 socket_lock(so, 1);
2775 if ((so->so_state &
2776 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2777 error = ENOTCONN;
2778 } else {
2779 error = soshutdownlock(so, how);
2780 }
2781 socket_unlock(so, 1);
2782 break;
2783 default:
2784 error = EINVAL;
2785 break;
2786 }
2787
2788 return (error);
2789 }
2790
2791 int
2792 soshutdownlock(struct socket *so, int how)
2793 {
2794 struct protosw *pr = so->so_proto;
2795 int error = 0;
2796
2797 sflt_notify(so, sock_evt_shutdown, &how);
2798
2799 if (how != SHUT_WR) {
2800 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2801 /* read already shut down */
2802 error = ENOTCONN;
2803 goto done;
2804 }
2805 sorflush(so);
2806 postevent(so, 0, EV_RCLOSED);
2807 }
2808 if (how != SHUT_RD) {
2809 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2810 /* write already shut down */
2811 error = ENOTCONN;
2812 goto done;
2813 }
2814 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2815 postevent(so, 0, EV_WCLOSED);
2816 }
2817 done:
2818 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2819 return (error);
2820 }
2821
2822 void
2823 sorflush(struct socket *so)
2824 {
2825 register struct sockbuf *sb = &so->so_rcv;
2826 register struct protosw *pr = so->so_proto;
2827 struct sockbuf asb;
2828
2829 #ifdef MORE_LOCKING_DEBUG
2830 lck_mtx_t *mutex_held;
2831
2832 if (so->so_proto->pr_getlock != NULL)
2833 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2834 else
2835 mutex_held = so->so_proto->pr_domain->dom_mtx;
2836 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2837 #endif
2838
2839 sflt_notify(so, sock_evt_flush_read, NULL);
2840
2841 sb->sb_flags |= SB_NOINTR;
2842 (void) sblock(sb, M_WAIT);
2843 socantrcvmore(so);
2844 sbunlock(sb, 1);
2845 #ifdef __APPLE__
2846 selthreadclear(&sb->sb_sel);
2847 #endif
2848 asb = *sb;
2849 bzero((caddr_t)sb, sizeof (*sb));
2850 sb->sb_so = so; /* reestablish link to socket */
2851 if (asb.sb_flags & SB_KNOTE) {
2852 sb->sb_sel.si_note = asb.sb_sel.si_note;
2853 sb->sb_flags = SB_KNOTE;
2854 }
2855 if (asb.sb_flags & SB_DROP)
2856 sb->sb_flags |= SB_DROP;
2857 if (asb.sb_flags & SB_UNIX)
2858 sb->sb_flags |= SB_UNIX;
2859 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2860 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2861 }
2862 sbrelease(&asb);
2863 }
2864
2865 /*
2866 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2867 * an additional variant to handle the case where the option value needs
2868 * to be some kind of integer, but not a specific size.
2869 * In addition to their use here, these functions are also called by the
2870 * protocol-level pr_ctloutput() routines.
2871 *
2872 * Returns: 0 Success
2873 * EINVAL
2874 * copyin:EFAULT
2875 */
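/*
 * Usage sketch, mirroring the integer-valued SOL_SOCKET cases in
 * sosetopt() below; a protocol-level pr_ctloutput() handler for a
 * hypothetical int-sized option would typically do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval),
 *	    sizeof (optval));
 *	if (error)
 *		return (error);
 *	... act on optval ...
 */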
2876 int
2877 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2878 {
2879 size_t valsize;
2880
2881 /*
2882 * If the user gives us more than we wanted, we ignore it,
2883 * but if we don't get the minimum length the caller
2884 * wants, we return EINVAL. On success, sopt->sopt_valsize
2885 * is set to however much we actually retrieved.
2886 */
2887 if ((valsize = sopt->sopt_valsize) < minlen)
2888 return (EINVAL);
2889 if (valsize > len)
2890 sopt->sopt_valsize = valsize = len;
2891
2892 if (sopt->sopt_p != kernproc)
2893 return (copyin(sopt->sopt_val, buf, valsize));
2894
2895 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2896 return (0);
2897 }
2898
2899 /*
2900 * sooptcopyin_timeval
2901 * Copy in a timeval value into tv_p, taking into account whether the
2902 * calling process is 64-bit or 32-bit. Moved the sanity checking
2903 * code here so that we can verify the 64-bit tv_sec value before we lose
2904 * the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
2905 */
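/*
 * From user space this path is reached via setsockopt(2) with a
 * struct timeval, e.g. for the SO_RCVTIMEO case in sosetopt() below
 * (`s` is a hypothetical socket):
 *
 *	struct timeval tv = { 5, 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 *
 * A negative tv_sec or tv_usec, or a tv_usec of 1000000 or more, is
 * rejected with EDOM; a buffer shorter than the caller's native
 * timeval yields EINVAL.
 */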
2906 static int
2907 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2908 {
2909 int error;
2910
2911 if (proc_is64bit(sopt->sopt_p)) {
2912 struct user64_timeval tv64;
2913
2914 if (sopt->sopt_valsize < sizeof(tv64)) {
2915 return (EINVAL);
2916 }
2917 sopt->sopt_valsize = sizeof(tv64);
2918 if (sopt->sopt_p != kernproc) {
2919 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2920 if (error != 0)
2921 return (error);
2922 } else {
2923 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
2924 sizeof(tv64));
2925 }
2926 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2927 || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2928 return (EDOM);
2929 }
2930 tv_p->tv_sec = tv64.tv_sec;
2931 tv_p->tv_usec = tv64.tv_usec;
2932 } else {
2933 struct user32_timeval tv32;
2934
2935 if (sopt->sopt_valsize < sizeof(tv32)) {
2936 return (EINVAL);
2937 }
2938 sopt->sopt_valsize = sizeof(tv32);
2939 if (sopt->sopt_p != kernproc) {
2940 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2941 if (error != 0) {
2942 return (error);
2943 }
2944 } else {
2945 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
2946 sizeof(tv32));
2947 }
2948 #ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
2949 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
2950 || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
2951 return (EDOM);
2952 }
2953 #endif
2954 tv_p->tv_sec = tv32.tv_sec;
2955 tv_p->tv_usec = tv32.tv_usec;
2956 }
2957 return (0);
2958 }
2959
2960 /*
2961 * Returns: 0 Success
2962 * EINVAL
2963 * ENOPROTOOPT
2964 * ENOBUFS
2965 * EDOM
2966 * sooptcopyin:EINVAL
2967 * sooptcopyin:EFAULT
2968 * sooptcopyin_timeval:EINVAL
2969 * sooptcopyin_timeval:EFAULT
2970 * sooptcopyin_timeval:EDOM
2971 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2972 * <pr_ctloutput>:???
2973 * sflt_attach_private:??? [whatever a filter author chooses]
2974 * <sf_setoption>:??? [whatever a filter author chooses]
2975 *
2976 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
2977 * <sf_setoption> returns depend on what the filter author causes
2978 * their filter to return.
2979 */
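/*
 * Minimal user-space sketch of the path into this routine (`s` is a
 * hypothetical socket; the size is illustrative only):
 *
 *	int sz = 128 * 1024;
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof (sz)) == -1)
 *		perror("setsockopt");
 *
 * For SO_RCVBUF/SO_SNDBUF the error may be EINVAL (value < 1) or
 * ENOBUFS (sbreserve() failure); options at levels other than
 * SOL_SOCKET are handed to the protocol's pr_ctloutput() unchanged.
 */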
2980 int
2981 sosetopt(struct socket *so, struct sockopt *sopt)
2982 {
2983 int error, optval;
2984 struct linger l;
2985 struct timeval tv;
2986 #if CONFIG_MACF_SOCKET
2987 struct mac extmac;
2988 #endif /* MAC_SOCKET */
2989
2990 socket_lock(so, 1);
2991
2992 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2993 == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
2994 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2995 /* the socket has been shutdown, no more sockopt's */
2996 error = EINVAL;
2997 goto bad;
2998 }
2999
3000 if (sopt->sopt_dir != SOPT_SET) {
3001 sopt->sopt_dir = SOPT_SET;
3002 }
3003
3004 error = sflt_setsockopt(so, sopt);
3005 if (error) {
3006 if (error == EJUSTRETURN)
3007 error = 0;
3008 goto bad;
3009 }
3010
3011 error = 0;
3012 if (sopt->sopt_level != SOL_SOCKET) {
3013 if (so->so_proto && so->so_proto->pr_ctloutput) {
3014 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3015 socket_unlock(so, 1);
3016 return (error);
3017 }
3018 error = ENOPROTOOPT;
3019 } else {
3020 switch (sopt->sopt_name) {
3021 case SO_LINGER:
3022 case SO_LINGER_SEC:
3023 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3024 if (error)
3025 goto bad;
3026
3027 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3028 l.l_linger : l.l_linger * hz;
3029 if (l.l_onoff)
3030 so->so_options |= SO_LINGER;
3031 else
3032 so->so_options &= ~SO_LINGER;
3033 break;
3034
3035 case SO_DEBUG:
3036 case SO_KEEPALIVE:
3037 case SO_DONTROUTE:
3038 case SO_USELOOPBACK:
3039 case SO_BROADCAST:
3040 case SO_REUSEADDR:
3041 case SO_REUSEPORT:
3042 case SO_OOBINLINE:
3043 case SO_TIMESTAMP:
3044 case SO_TIMESTAMP_MONOTONIC:
3045 #ifdef __APPLE__
3046 case SO_DONTTRUNC:
3047 case SO_WANTMORE:
3048 case SO_WANTOOBFLAG:
3049 #endif
3050 error = sooptcopyin(sopt, &optval, sizeof (optval),
3051 sizeof (optval));
3052 if (error)
3053 goto bad;
3054 if (optval)
3055 so->so_options |= sopt->sopt_name;
3056 else
3057 so->so_options &= ~sopt->sopt_name;
3058 break;
3059
3060 case SO_SNDBUF:
3061 case SO_RCVBUF:
3062 case SO_SNDLOWAT:
3063 case SO_RCVLOWAT:
3064 error = sooptcopyin(sopt, &optval, sizeof (optval),
3065 sizeof (optval));
3066 if (error)
3067 goto bad;
3068
3069 /*
3070 * Values < 1 make no sense for any of these
3071 * options, so disallow them.
3072 */
3073 if (optval < 1) {
3074 error = EINVAL;
3075 goto bad;
3076 }
3077
3078 switch (sopt->sopt_name) {
3079 case SO_SNDBUF:
3080 case SO_RCVBUF:
3081 {
3082 struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ?
3083 &so->so_snd : &so->so_rcv;
3084 if (sbreserve(sb, (u_int32_t) optval) == 0) {
3085 error = ENOBUFS;
3086 goto bad;
3087 }
3088 sb->sb_flags |= SB_USRSIZE;
3089 sb->sb_flags &= ~SB_AUTOSIZE;
3090 sb->sb_idealsize = (u_int32_t)optval;
3091 break;
3092 }
3093
3094 /*
3095 * Make sure the low-water is never greater than
3096 * the high-water.
3097 */
3098 case SO_SNDLOWAT:
3099 so->so_snd.sb_lowat =
3100 (optval > so->so_snd.sb_hiwat) ?
3101 so->so_snd.sb_hiwat : optval;
3102 break;
3103 case SO_RCVLOWAT:
3104 so->so_rcv.sb_lowat =
3105 (optval > so->so_rcv.sb_hiwat) ?
3106 so->so_rcv.sb_hiwat : optval;
3107 break;
3108 }
3109 break;
3110
3111 case SO_SNDTIMEO:
3112 case SO_RCVTIMEO:
3113 error = sooptcopyin_timeval(sopt, &tv);
3114 if (error)
3115 goto bad;
3116
3117 switch (sopt->sopt_name) {
3118 case SO_SNDTIMEO:
3119 so->so_snd.sb_timeo = tv;
3120 break;
3121 case SO_RCVTIMEO:
3122 so->so_rcv.sb_timeo = tv;
3123 break;
3124 }
3125 break;
3126
3127 case SO_NKE:
3128 {
3129 struct so_nke nke;
3130
3131 error = sooptcopyin(sopt, &nke, sizeof (nke),
3132 sizeof (nke));
3133 if (error)
3134 goto bad;
3135
3136 error = sflt_attach_internal(so, nke.nke_handle);
3137 break;
3138 }
3139
3140 case SO_NOSIGPIPE:
3141 error = sooptcopyin(sopt, &optval, sizeof (optval),
3142 sizeof (optval));
3143 if (error)
3144 goto bad;
3145 if (optval)
3146 so->so_flags |= SOF_NOSIGPIPE;
3147 else
3148 so->so_flags &= ~SOF_NOSIGPIPE;
3149
3150 break;
3151
3152 case SO_NOADDRERR:
3153 error = sooptcopyin(sopt, &optval, sizeof (optval),
3154 sizeof (optval));
3155 if (error)
3156 goto bad;
3157 if (optval)
3158 so->so_flags |= SOF_NOADDRAVAIL;
3159 else
3160 so->so_flags &= ~SOF_NOADDRAVAIL;
3161
3162 break;
3163
3164 case SO_REUSESHAREUID:
3165 error = sooptcopyin(sopt, &optval, sizeof (optval),
3166 sizeof (optval));
3167 if (error)
3168 goto bad;
3169 if (optval)
3170 so->so_flags |= SOF_REUSESHAREUID;
3171 else
3172 so->so_flags &= ~SOF_REUSESHAREUID;
3173 break;
3174 #ifdef __APPLE_API_PRIVATE
3175 case SO_NOTIFYCONFLICT:
3176 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3177 error = EPERM;
3178 goto bad;
3179 }
3180 error = sooptcopyin(sopt, &optval, sizeof (optval),
3181 sizeof (optval));
3182 if (error)
3183 goto bad;
3184 if (optval)
3185 so->so_flags |= SOF_NOTIFYCONFLICT;
3186 else
3187 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3188 break;
3189 #endif
3190 case SO_RESTRICTIONS:
3191 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3192 error = EPERM;
3193 goto bad;
3194 }
3195 error = sooptcopyin(sopt, &optval, sizeof (optval),
3196 sizeof (optval));
3197 if (error)
3198 goto bad;
3199 so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3200 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3201 break;
3202
3203 case SO_LABEL:
3204 #if CONFIG_MACF_SOCKET
3205 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3206 sizeof (extmac))) != 0)
3207 goto bad;
3208
3209 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3210 so, &extmac);
3211 #else
3212 error = EOPNOTSUPP;
3213 #endif /* MAC_SOCKET */
3214 break;
3215
3216 #ifdef __APPLE_API_PRIVATE
3217 case SO_UPCALLCLOSEWAIT:
3218 error = sooptcopyin(sopt, &optval, sizeof (optval),
3219 sizeof (optval));
3220 if (error)
3221 goto bad;
3222 if (optval)
3223 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3224 else
3225 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3226 break;
3227 #endif
3228
3229 case SO_RANDOMPORT:
3230 error = sooptcopyin(sopt, &optval, sizeof (optval),
3231 sizeof (optval));
3232 if (error)
3233 goto bad;
3234 if (optval)
3235 so->so_flags |= SOF_BINDRANDOMPORT;
3236 else
3237 so->so_flags &= ~SOF_BINDRANDOMPORT;
3238 break;
3239
3240 case SO_NP_EXTENSIONS: {
3241 struct so_np_extensions sonpx;
3242
3243 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
3244 if (error)
3245 goto bad;
3246 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3247 error = EINVAL;
3248 goto bad;
3249 }
3250 /*
3251 * Only one bit defined for now
3252 */
3253 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3254 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3255 so->so_flags |= SOF_NPX_SETOPTSHUT;
3256 else
3257 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3258 }
3259 break;
3260 }
3261
3262 case SO_TRAFFIC_CLASS: {
3263 error = sooptcopyin(sopt, &optval, sizeof (optval),
3264 sizeof (optval));
3265 if (error)
3266 goto bad;
3267 error = so_set_traffic_class(so, optval);
3268 if (error)
3269 goto bad;
3270 break;
3271 }
3272
3273 case SO_RECV_TRAFFIC_CLASS: {
3274 error = sooptcopyin(sopt, &optval, sizeof (optval),
3275 sizeof (optval));
3276 if (error)
3277 goto bad;
3278 if (optval == 0)
3279 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3280 else
3281 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3282 break;
3283 }
3284
3285 case SO_TRAFFIC_CLASS_DBG: {
3286 struct so_tcdbg so_tcdbg;
3287
3288 error = sooptcopyin(sopt, &so_tcdbg,
3289 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
3290 if (error)
3291 goto bad;
3292 error = so_set_tcdbg(so, &so_tcdbg);
3293 if (error)
3294 goto bad;
3295 break;
3296 }
3297
3298 case SO_PRIVILEGED_TRAFFIC_CLASS:
3299 error = priv_check_cred(kauth_cred_get(),
3300 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
3301 if (error)
3302 goto bad;
3303 error = sooptcopyin(sopt, &optval, sizeof (optval),
3304 sizeof (optval));
3305 if (error)
3306 goto bad;
3307 if (optval == 0)
3308 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3309 else
3310 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3311 break;
3312
3313 case SO_DEFUNCTOK:
3314 error = sooptcopyin(sopt, &optval, sizeof (optval),
3315 sizeof (optval));
3316 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3317 if (error == 0)
3318 error = EBADF;
3319 goto bad;
3320 }
3321 /*
3322 * Any process can set SO_DEFUNCTOK (clear
3323 * SOF_NODEFUNCT), but only root can clear
3324 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3325 */
3326 if (optval == 0 &&
3327 kauth_cred_issuser(kauth_cred_get()) == 0) {
3328 error = EPERM;
3329 goto bad;
3330 }
3331 if (optval)
3332 so->so_flags &= ~SOF_NODEFUNCT;
3333 else
3334 so->so_flags |= SOF_NODEFUNCT;
3335
3336 SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as "
3337 "%seligible for defunct\n", __func__,
3338 proc_selfpid(), so, INP_SOCKAF(so),
3339 INP_SOCKTYPE(so),
3340 (so->so_flags & SOF_NODEFUNCT) ? "not " : ""));
3341 break;
3342
3343 case SO_ISDEFUNCT:
3344 /* This option is not settable */
3345 error = EINVAL;
3346 break;
3347
3348 case SO_OPPORTUNISTIC:
3349 error = sooptcopyin(sopt, &optval, sizeof (optval),
3350 sizeof (optval));
3351 if (error == 0)
3352 error = so_set_opportunistic(so, optval);
3353 break;
3354
3355 case SO_FLUSH:
3356 /* This option is handled by lower layer(s) */
3357 error = 0;
3358 break;
3359
3360 case SO_RECV_ANYIF:
3361 error = sooptcopyin(sopt, &optval, sizeof (optval),
3362 sizeof (optval));
3363 if (error == 0)
3364 error = so_set_recv_anyif(so, optval);
3365 break;
3366
3367 default:
3368 error = ENOPROTOOPT;
3369 break;
3370 }
3371 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3372 (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3373 }
3374 }
3375 bad:
3376 socket_unlock(so, 1);
3377 return (error);
3378 }
3379
3380 /* Helper routines for getsockopt */
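/*
 * Usage sketch: a get-side handler copies a local value out with
 * sooptcopyout(), which truncates to the user's buffer; the integer
 * cases in sogetopt() below reduce to the equivalent of:
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 */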
3381 int
3382 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3383 {
3384 int error;
3385 size_t valsize;
3386
3387 error = 0;
3388
3389 /*
3390 * Documented get behavior is that we always return a value,
3391 * possibly truncated to fit in the user's buffer.
3392 * Traditional behavior is that we always tell the user
3393 * precisely how much we copied, rather than something useful
3394 * like the total amount we had available for her.
3395 * Note that this interface is not idempotent; the entire answer must be
3396 * generated ahead of time.
3397 */
3398 valsize = min(len, sopt->sopt_valsize);
3399 sopt->sopt_valsize = valsize;
3400 if (sopt->sopt_val != USER_ADDR_NULL) {
3401 if (sopt->sopt_p != kernproc)
3402 error = copyout(buf, sopt->sopt_val, valsize);
3403 else
3404 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3405 }
3406 return (error);
3407 }
3408
3409 static int
3410 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3411 {
3412 int error;
3413 size_t len;
3414 struct user64_timeval tv64;
3415 struct user32_timeval tv32;
3416 const void * val;
3417 size_t valsize;
3418
3419 error = 0;
3420 if (proc_is64bit(sopt->sopt_p)) {
3421 len = sizeof(tv64);
3422 tv64.tv_sec = tv_p->tv_sec;
3423 tv64.tv_usec = tv_p->tv_usec;
3424 val = &tv64;
3425 } else {
3426 len = sizeof(tv32);
3427 tv32.tv_sec = tv_p->tv_sec;
3428 tv32.tv_usec = tv_p->tv_usec;
3429 val = &tv32;
3430 }
3431 valsize = min(len, sopt->sopt_valsize);
3432 sopt->sopt_valsize = valsize;
3433 if (sopt->sopt_val != USER_ADDR_NULL) {
3434 if (sopt->sopt_p != kernproc)
3435 error = copyout(val, sopt->sopt_val, valsize);
3436 else
3437 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3438 }
3439 return (error);
3440 }
3441
3442 /*
3443 * Returns: 0 Success
3444 * ENOPROTOOPT
3445 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3446 * <pr_ctloutput>:???
3447 * <sf_getoption>:???
3448 */
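/*
 * Minimal user-space sketch (`s` is a hypothetical socket): SO_NREAD
 * below reports the bytes currently available to read, and SO_ERROR
 * returns and clears any pending error.
 *
 *	int nread = 0;
 *	socklen_t len = sizeof (nread);
 *	getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len);
 */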
3449 int
3450 sogetopt(struct socket *so, struct sockopt *sopt)
3451 {
3452 int error, optval;
3453 struct linger l;
3454 struct timeval tv;
3455 #if CONFIG_MACF_SOCKET
3456 struct mac extmac;
3457 #endif /* MAC_SOCKET */
3458
3459 if (sopt->sopt_dir != SOPT_GET) {
3460 sopt->sopt_dir = SOPT_GET;
3461 }
3462
3463 socket_lock(so, 1);
3464
3465 error = sflt_getsockopt(so, sopt);
3466 if (error) {
3467 if (error == EJUSTRETURN)
3468 error = 0;
3469 socket_unlock(so, 1);
3470 return (error);
3471 }
3472
3473 error = 0;
3474 if (sopt->sopt_level != SOL_SOCKET) {
3475 if (so->so_proto && so->so_proto->pr_ctloutput) {
3476 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3477 socket_unlock(so, 1);
3478 return (error);
3479 } else {
3480 socket_unlock(so, 1);
3481 return (ENOPROTOOPT);
3482 }
3483 } else {
3484 switch (sopt->sopt_name) {
3485 case SO_LINGER:
3486 case SO_LINGER_SEC:
3487 l.l_onoff = so->so_options & SO_LINGER;
3488 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3489 so->so_linger : so->so_linger / hz;
3490 error = sooptcopyout(sopt, &l, sizeof (l));
3491 break;
3492
3493 case SO_USELOOPBACK:
3494 case SO_DONTROUTE:
3495 case SO_DEBUG:
3496 case SO_KEEPALIVE:
3497 case SO_REUSEADDR:
3498 case SO_REUSEPORT:
3499 case SO_BROADCAST:
3500 case SO_OOBINLINE:
3501 case SO_TIMESTAMP:
3502 case SO_TIMESTAMP_MONOTONIC:
3503 #ifdef __APPLE__
3504 case SO_DONTTRUNC:
3505 case SO_WANTMORE:
3506 case SO_WANTOOBFLAG:
3507 #endif
3508 optval = so->so_options & sopt->sopt_name;
3509 integer:
3510 error = sooptcopyout(sopt, &optval, sizeof (optval));
3511 break;
3512
3513 case SO_TYPE:
3514 optval = so->so_type;
3515 goto integer;
3516
3517 #ifdef __APPLE__
3518 case SO_NREAD:
3519 if (so->so_proto->pr_flags & PR_ATOMIC) {
3520 int pkt_total;
3521 struct mbuf *m1;
3522
3523 pkt_total = 0;
3524 m1 = so->so_rcv.sb_mb;
3525 while (m1) {
3526 if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3527 m1->m_type == MT_OOBDATA)
3528 pkt_total += m1->m_len;
3529 m1 = m1->m_next;
3530 }
3531 optval = pkt_total;
3532 } else {
3533 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3534 }
3535 goto integer;
3536
3537 case SO_NWRITE:
3538 optval = so->so_snd.sb_cc;
3539 goto integer;
3540 #endif
3541 case SO_ERROR:
3542 optval = so->so_error;
3543 so->so_error = 0;
3544 goto integer;
3545
3546 case SO_SNDBUF:
3547 optval = so->so_snd.sb_hiwat;
3548 goto integer;
3549
3550 case SO_RCVBUF:
3551 optval = so->so_rcv.sb_hiwat;
3552 goto integer;
3553
3554 case SO_SNDLOWAT:
3555 optval = so->so_snd.sb_lowat;
3556 goto integer;
3557
3558 case SO_RCVLOWAT:
3559 optval = so->so_rcv.sb_lowat;
3560 goto integer;
3561
3562 case SO_SNDTIMEO:
3563 case SO_RCVTIMEO:
3564 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3565 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3566
3567 error = sooptcopyout_timeval(sopt, &tv);
3568 break;
3569
3570 case SO_NOSIGPIPE:
3571 optval = (so->so_flags & SOF_NOSIGPIPE);
3572 goto integer;
3573
3574 case SO_NOADDRERR:
3575 optval = (so->so_flags & SOF_NOADDRAVAIL);
3576 goto integer;
3577
3578 case SO_REUSESHAREUID:
3579 optval = (so->so_flags & SOF_REUSESHAREUID);
3580 goto integer;
3581
3582 #ifdef __APPLE_API_PRIVATE
3583 case SO_NOTIFYCONFLICT:
3584 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3585 goto integer;
3586 #endif
3587 case SO_RESTRICTIONS:
3588 optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3589 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3590 goto integer;
3591
3592 case SO_LABEL:
3593 #if CONFIG_MACF_SOCKET
3594 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3595 sizeof (extmac))) != 0 ||
3596 (error = mac_socket_label_get(proc_ucred(
3597 sopt->sopt_p), so, &extmac)) != 0)
3598 break;
3599
3600 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3601 #else
3602 error = EOPNOTSUPP;
3603 #endif /* MAC_SOCKET */
3604 break;
3605
3606 case SO_PEERLABEL:
3607 #if CONFIG_MACF_SOCKET
3608 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3609 sizeof (extmac))) != 0 ||
3610 (error = mac_socketpeer_label_get(proc_ucred(
3611 sopt->sopt_p), so, &extmac)) != 0)
3612 break;
3613
3614 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3615 #else
3616 error = EOPNOTSUPP;
3617 #endif /* MAC_SOCKET */
3618 break;
3619
3620 #ifdef __APPLE_API_PRIVATE
3621 case SO_UPCALLCLOSEWAIT:
3622 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
3623 goto integer;
3624 #endif
3625 case SO_RANDOMPORT:
3626 optval = (so->so_flags & SOF_BINDRANDOMPORT);
3627 goto integer;
3628
3629 case SO_NP_EXTENSIONS: {
3630 struct so_np_extensions sonpx;
3631
3632 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0;
3633 sonpx.npx_mask = SONPX_MASK_VALID;
3634
3635 error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
3636 break;
3637 }
3638
3639 case SO_TRAFFIC_CLASS:
3640 optval = so->so_traffic_class;
3641 goto integer;
3642
3643 case SO_RECV_TRAFFIC_CLASS:
3644 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
3645 goto integer;
3646
3647 case SO_TRAFFIC_CLASS_STATS:
3648 error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats));
3649 break;
3650
3651 case SO_TRAFFIC_CLASS_DBG:
3652 error = sogetopt_tcdbg(so, sopt);
3653 break;
3654
3655 case SO_PRIVILEGED_TRAFFIC_CLASS:
3656 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
3657 goto integer;
3658
3659 case SO_DEFUNCTOK:
3660 optval = !(so->so_flags & SOF_NODEFUNCT);
3661 goto integer;
3662
3663 case SO_ISDEFUNCT:
3664 optval = (so->so_flags & SOF_DEFUNCT);
3665 goto integer;
3666
3667 case SO_OPPORTUNISTIC:
3668 optval = so_get_opportunistic(so);
3669 goto integer;
3670
3671 case SO_FLUSH:
3672 /* This option is not gettable */
3673 error = EINVAL;
3674 break;
3675
3676 case SO_RECV_ANYIF:
3677 optval = so_get_recv_anyif(so);
3678 goto integer;
3679
3680 default:
3681 error = ENOPROTOOPT;
3682 break;
3683 }
3684 socket_unlock(so, 1);
3685 return (error);
3686 }
3687 }
3688 /* The size limits on our soopt_getm are different from those on FreeBSD.
3689 * We limit the size of options to MCLBYTES. This will have to change
3690 * if we need to define options that need more space than MCLBYTES.
3691 */
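/*
 * Typical flow for callers that want the option value as an mbuf chain
 * (e.g. the IPv6 option code referenced in soopt_mcopyin() below),
 * sketched with most error handling omitted:
 *
 *	struct mbuf *m = NULL;
 *
 *	error = soopt_getm(sopt, &m);		size and allocate the chain
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	copy sopt_val into the chain
 */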
3692 int
3693 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3694 {
3695 struct mbuf *m, *m_prev;
3696 int sopt_size = sopt->sopt_valsize;
3697 int how;
3698
3699 if (sopt_size <= 0 || sopt_size > MCLBYTES)
3700 return (EMSGSIZE);
3701
3702 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
3703 MGET(m, how, MT_DATA);
3704 if (m == 0)
3705 return (ENOBUFS);
3706 if (sopt_size > MLEN) {
3707 MCLGET(m, how);
3708 if ((m->m_flags & M_EXT) == 0) {
3709 m_free(m);
3710 return (ENOBUFS);
3711 }
3712 m->m_len = min(MCLBYTES, sopt_size);
3713 } else {
3714 m->m_len = min(MLEN, sopt_size);
3715 }
3716 sopt_size -= m->m_len;
3717 *mp = m;
3718 m_prev = m;
3719
3720 while (sopt_size > 0) {
3721 MGET(m, how, MT_DATA);
3722 if (m == 0) {
3723 m_freem(*mp);
3724 return (ENOBUFS);
3725 }
3726 if (sopt_size > MLEN) {
3727 MCLGET(m, how);
3728 if ((m->m_flags & M_EXT) == 0) {
3729 m_freem(*mp);
3730 m_freem(m);
3731 return (ENOBUFS);
3732 }
3733 m->m_len = min(MCLBYTES, sopt_size);
3734 } else {
3735 m->m_len = min(MLEN, sopt_size);
3736 }
3737 sopt_size -= m->m_len;
3738 m_prev->m_next = m;
3739 m_prev = m;
3740 }
3741 return (0);
3742 }
3743
3744 /* copyin sopt data into mbuf chain */
3745 int
3746 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3747 {
3748 struct mbuf *m0 = m;
3749
3750 if (sopt->sopt_val == USER_ADDR_NULL)
3751 return (0);
3752 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3753 if (sopt->sopt_p != kernproc) {
3754 int error;
3755
3756 error = copyin(sopt->sopt_val, mtod(m, char *),
3757 m->m_len);
3758 if (error != 0) {
3759 m_freem(m0);
3760 return (error);
3761 }
3762 } else {
3763 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3764 mtod(m, char *), m->m_len);
3765 }
3766 sopt->sopt_valsize -= m->m_len;
3767 sopt->sopt_val += m->m_len;
3768 m = m->m_next;
3769 }
3770 if (m != NULL) /* enough should have been allocated at ip6_sooptmcopyin() */
3771 panic("soopt_mcopyin");
3772 return (0);
3773 }
3774
3775 /* copyout mbuf chain data into soopt */
3776 int
3777 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3778 {
3779 struct mbuf *m0 = m;
3780 size_t valsize = 0;
3781
3782 if (sopt->sopt_val == USER_ADDR_NULL)
3783 return (0);
3784 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3785 if (sopt->sopt_p != kernproc) {
3786 int error;
3787
3788 error = copyout(mtod(m, char *), sopt->sopt_val,
3789 m->m_len);
3790 if (error != 0) {
3791 m_freem(m0);
3792 return (error);
3793 }
3794 } else {
3795 bcopy(mtod(m, char *),
3796 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3797 }
3798 sopt->sopt_valsize -= m->m_len;
3799 sopt->sopt_val += m->m_len;
3800 valsize += m->m_len;
3801 m = m->m_next;
3802 }
3803 if (m != NULL) {
3804 /* a large enough soopt buffer should have been given from user-land */
3805 m_freem(m0);
3806 return (EINVAL);
3807 }
3808 sopt->sopt_valsize = valsize;
3809 return (0);
3810 }
3811
3812 void
3813 sohasoutofband(struct socket *so)
3814 {
3815
3816 if (so->so_pgid < 0)
3817 gsignal(-so->so_pgid, SIGURG);
3818 else if (so->so_pgid > 0)
3819 proc_signal(so->so_pgid, SIGURG);
3820 selwakeup(&so->so_rcv.sb_sel);
3821 }
3822
3823 int
3824 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3825 {
3826 struct proc *p = current_proc();
3827 int revents = 0;
3828
3829 socket_lock(so, 1);
3830
3831 if (events & (POLLIN | POLLRDNORM))
3832 if (soreadable(so))
3833 revents |= events & (POLLIN | POLLRDNORM);
3834
3835 if (events & (POLLOUT | POLLWRNORM))
3836 if (sowriteable(so))
3837 revents |= events & (POLLOUT | POLLWRNORM);
3838
3839 if (events & (POLLPRI | POLLRDBAND))
3840 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3841 revents |= events & (POLLPRI | POLLRDBAND);
3842
3843 if (revents == 0) {
3844 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3845 /*
3846 * Darwin sets the flag first,
3847 * BSD calls selrecord first
3848 */
3849 so->so_rcv.sb_flags |= SB_SEL;
3850 selrecord(p, &so->so_rcv.sb_sel, wql);
3851 }
3852
3853 if (events & (POLLOUT | POLLWRNORM)) {
3854 /*
3855 * Darwin sets the flag first,
3856 * BSD calls selrecord first
3857 */
3858 so->so_snd.sb_flags |= SB_SEL;
3859 selrecord(p, &so->so_snd.sb_sel, wql);
3860 }
3861 }
3862
3863 socket_unlock(so, 1);
3864 return (revents);
3865 }
3866
3867 int
3868 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3869 __unused struct proc *p)
3870 {
3871 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3872 struct klist *skl;
3873
3874 socket_lock(so, 1);
3875
3876 #if CONFIG_MACF_SOCKET
3877 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3878 socket_unlock(so, 1);
3879 return (1);
3880 }
3881 #endif /* MAC_SOCKET */
3882
3883 switch (kn->kn_filter) {
3884 case EVFILT_READ:
3885 kn->kn_fop = &soread_filtops;
3886 skl = &so->so_rcv.sb_sel.si_note;
3887 break;
3888 case EVFILT_WRITE:
3889 kn->kn_fop = &sowrite_filtops;
3890 skl = &so->so_snd.sb_sel.si_note;
3891 break;
3892 case EVFILT_SOCK:
3893 kn->kn_fop = &sock_filtops;
3894 skl = &so->so_klist;
3895 break;
3896 default:
3897 socket_unlock(so, 1);
3898 return (1);
3899 }
3900
3901 if (KNOTE_ATTACH(skl, kn)) {
3902 switch(kn->kn_filter) {
3903 case EVFILT_READ:
3904 so->so_rcv.sb_flags |= SB_KNOTE;
3905 break;
3906 case EVFILT_WRITE:
3907 so->so_snd.sb_flags |= SB_KNOTE;
3908 break;
3909 case EVFILT_SOCK:
3910 so->so_flags |= SOF_KNOTE;
3911 break;
3912 default:
3913 socket_unlock(so, 1);
3914 return (1);
3915 }
3916 }
3917 socket_unlock(so, 1);
3918 return (0);
3919 }
3920
3921 static void
3922 filt_sordetach(struct knote *kn)
3923 {
3924 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3925
3926 socket_lock(so, 1);
3927 if (so->so_rcv.sb_flags & SB_KNOTE)
3928 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3929 so->so_rcv.sb_flags &= ~SB_KNOTE;
3930 socket_unlock(so, 1);
3931 }
3932
3933 /*ARGSUSED*/
3934 static int
3935 filt_soread(struct knote *kn, long hint)
3936 {
3937 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3938
3939 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3940 socket_lock(so, 1);
3941
3942 if (so->so_options & SO_ACCEPTCONN) {
3943 int not_empty;
3944
3945 /* Radar 6615193: handle the listen case dynamically
3946 * for the kqueue read filter. This allows listen() to be called after
3947 * registering the kqueue EVFILT_READ filter.
3948 */
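/*
 * User-space sketch (`s` and `kq` are hypothetical descriptors):
 * registering EVFILT_READ before listen() works because this case
 * is re-evaluated on every filter call, and kevent() reports the
 * accept backlog length in the returned data field.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	listen(s, 16);
 */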
3949
3950 kn->kn_data = so->so_qlen;
3951 not_empty = !TAILQ_EMPTY(&so->so_comp);
3952
3953 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3954 socket_unlock(so, 1);
3955
3956 return (not_empty);
3957 }
3958
3959 /* socket isn't a listener */
3960
3961 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3962
3963 if (so->so_oobmark) {
3964 if (kn->kn_flags & EV_OOBAND) {
3965 kn->kn_data -= so->so_oobmark;
3966 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3967 socket_unlock(so, 1);
3968 return (1);
3969 }
3970 kn->kn_data = so->so_oobmark;
3971 kn->kn_flags |= EV_OOBAND;
3972 } else {
3973 if (so->so_state & SS_CANTRCVMORE) {
3974 kn->kn_flags |= EV_EOF;
3975 kn->kn_fflags = so->so_error;
3976 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3977 socket_unlock(so, 1);
3978 return (1);
3979 }
3980 }
3981
3982 if (so->so_state & SS_RCVATMARK) {
3983 if (kn->kn_flags & EV_OOBAND) {
3984 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3985 socket_unlock(so, 1);
3986 return (1);
3987 }
3988 kn->kn_flags |= EV_OOBAND;
3989 } else if (kn->kn_flags & EV_OOBAND) {
3990 kn->kn_data = 0;
3991 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3992 socket_unlock(so, 1);
3993 return (0);
3994 }
3995
3996 if (so->so_error) { /* temporary udp error */
3997 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3998 socket_unlock(so, 1);
3999 return (1);
4000 }
4001
4002 int64_t lowwat = so->so_rcv.sb_lowat;
4003 if (kn->kn_sfflags & NOTE_LOWAT)
4004 {
4005 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4006 lowwat = so->so_rcv.sb_hiwat;
4007 else if (kn->kn_sdata > lowwat)
4008 lowwat = kn->kn_sdata;
4009 }
4010
4011 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4012 socket_unlock(so, 1);
4013
4014 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
4015 }
4016
4017 static void
4018 filt_sowdetach(struct knote *kn)
4019 {
4020 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4021 socket_lock(so, 1);
4022
4023 if (so->so_snd.sb_flags & SB_KNOTE)
4024 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4025 so->so_snd.sb_flags &= ~SB_KNOTE;
4026 socket_unlock(so, 1);
4027 }
4028
4029 int
4030 so_wait_for_if_feedback(struct socket *so)
4031 {
4032 if ((so->so_proto->pr_domain->dom_family == AF_INET ||
4033 so->so_proto->pr_domain->dom_family == AF_INET6) &&
4034 (so->so_state & SS_ISCONNECTED)) {
4035 struct inpcb *inp = sotoinpcb(so);
4036 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4037 return (1);
4038 }
4039 return (0);
4040 }
4041
4042 /*ARGSUSED*/
4043 static int
4044 filt_sowrite(struct knote *kn, long hint)
4045 {
4046 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4047 int ret = 0;
4048
4049 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4050 socket_lock(so, 1);
4051
4052 kn->kn_data = sbspace(&so->so_snd);
4053 if (so->so_state & SS_CANTSENDMORE) {
4054 kn->kn_flags |= EV_EOF;
4055 kn->kn_fflags = so->so_error;
4056 ret = 1;
4057 goto out;
4058 }
4059 if (so->so_error) { /* temporary udp error */
4060 ret = 1;
4061 goto out;
4062 }
4063 if (((so->so_state & SS_ISCONNECTED) == 0) &&
4064 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4065 ret = 0;
4066 goto out;
4067 }
4068 int64_t lowwat = so->so_snd.sb_lowat;
4069 if (kn->kn_sfflags & NOTE_LOWAT)
4070 {
4071 if (kn->kn_sdata > so->so_snd.sb_hiwat)
4072 lowwat = so->so_snd.sb_hiwat;
4073 else if (kn->kn_sdata > lowwat)
4074 lowwat = kn->kn_sdata;
4075 }
4076 if (kn->kn_data >= lowwat) {
4077 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4078 ret = tcp_notsent_lowat_check(so);
4079 } else {
4080 ret = 1;
4081 }
4082 }
4083 if (so_wait_for_if_feedback(so))
4084 ret = 0;
4085 out:
4086 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4087 socket_unlock(so, 1);
4088 return (ret);
4089 }
4090
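/*
 * EVFILT_SOCK detach: remove the knote from the socket's klist and clear
 * SOF_KNOTE when no knotes remain.
 */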
4091 static void
4092 filt_sockdetach(struct knote *kn)
4093 {
4094 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4095 socket_lock(so, 1);
4096
4097 if ((so->so_flags & SOF_KNOTE) != 0)
4098 if (KNOTE_DETACH(&so->so_klist, kn))
4099 so->so_flags &= ~SOF_KNOTE;
4100 socket_unlock(so, 1);
4101 }
4102
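/*
 * EVFILT_SOCK event predicate: translate socket-layer hints (connection
 * reset, timeout, lost source address, interface denied, keepalive) and
 * current socket state (read/write shutdown, suspend/resume, pending
 * error) into the fflags the caller subscribed to; fires once any
 * requested event is pending.
 */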
4103 static int
4104 filt_sockev(struct knote *kn, long hint)
4105 {
4106 int ret = 0, locked = 0;
4107 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4108
4109 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4110 socket_lock(so, 1);
4111 locked = 1;
4112 }
4113
4114 switch (hint & SO_FILT_HINT_EV) {
4115 case SO_FILT_HINT_CONNRESET:
4116 if (kn->kn_sfflags & NOTE_CONNRESET)
4117 kn->kn_fflags |= NOTE_CONNRESET;
4118 break;
4119 case SO_FILT_HINT_TIMEOUT:
4120 if (kn->kn_sfflags & NOTE_TIMEOUT)
4121 kn->kn_fflags |= NOTE_TIMEOUT;
4122 break;
4123 case SO_FILT_HINT_NOSRCADDR:
4124 if (kn->kn_sfflags & NOTE_NOSRCADDR)
4125 kn->kn_fflags |= NOTE_NOSRCADDR;
4126 break;
4127 case SO_FILT_HINT_IFDENIED:
4128 if ((kn->kn_sfflags & NOTE_IFDENIED))
4129 kn->kn_fflags |= NOTE_IFDENIED;
4130 break;
4131 case SO_FILT_HINT_KEEPALIVE:
4132 if (kn->kn_sfflags & NOTE_KEEPALIVE)
4133 kn->kn_fflags |= NOTE_KEEPALIVE;
4134 }
4135
4136 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
4137 (so->so_state & SS_CANTRCVMORE))
4138 kn->kn_fflags |= NOTE_READCLOSED;
4139
4140 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
4141 (so->so_state & SS_CANTSENDMORE))
4142 kn->kn_fflags |= NOTE_WRITECLOSED;
4143
4144 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
4145 ((hint & SO_FILT_HINT_SUSPEND) ||
4146 (so->so_flags & SOF_SUSPENDED))) {
4147 kn->kn_fflags &=
4148 ~(NOTE_SUSPEND | NOTE_RESUME);
4149 kn->kn_fflags |= NOTE_SUSPEND;
4150 }
4151
4152 if ((kn->kn_sfflags & NOTE_RESUME) &&
4153 ((hint & SO_FILT_HINT_RESUME) ||
4154 (so->so_flags & SOF_SUSPENDED) == 0)) {
4155 kn->kn_fflags &=
4156 ~(NOTE_SUSPEND | NOTE_RESUME);
4157 kn->kn_fflags |= NOTE_RESUME;
4158 }
4159
4160 if (so->so_error != 0) {
4161 ret = 1;
4162 kn->kn_data = so->so_error;
4163 kn->kn_flags |= EV_EOF;
4164 } else {
4165 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4166 }
4167
4168 if (kn->kn_fflags != 0)
4169 ret = 1;
4170
4171 if (locked)
4172 socket_unlock(so, 1);
4173
4174 return (ret);
4175 }
4176
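/*
 * Fold the socket's connection state into the caller-supplied state word
 * (SOCKEV_CONNECTED / SOCKEV_DISCONNECTED).
 */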
4177 void
4178 get_sockev_state(struct socket *so, u_int32_t *statep) {
4179 u_int32_t state = *(statep);
4180
4181 if (so->so_state & SS_ISCONNECTED)
4182 state |= SOCKEV_CONNECTED;
4183 else
4184 state &= ~(SOCKEV_CONNECTED);
4185 state |= ((so->so_state & SS_ISDISCONNECTED) ?
4186 SOCKEV_DISCONNECTED : 0);
4187 *(statep) = state;
4188 return;
4189 }
4190
4191 #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1)
4192
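/*
 * Format the lock/unlock return addresses recorded for this socket into a
 * string for panic messages.  The result lives in a static buffer and is
 * only valid until the next call.
 */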
4193 __private_extern__ const char * solockhistory_nr(struct socket *so)
4194 {
4195 size_t n = 0;
4196 int i;
4197 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4198
4199 bzero(lock_history_str, sizeof(lock_history_str));
4200 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4201 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
4202 (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4203 (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4204 }
4205 return lock_history_str;
4206 }
4207
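/*
 * socket_lock: acquire the lock protecting this socket, using the
 * protocol's pr_lock when provided and the owning domain's mutex
 * otherwise.  The caller's return address is recorded for
 * solockhistory_nr(), and a use-count reference is taken when refcount
 * is nonzero.  Typical pairing (illustrative; see also soreference()
 * and sodereference() below):
 *
 *	socket_lock(so, 1);	lock and hold a reference
 *	...
 *	socket_unlock(so, 1);	drop the reference and unlock
 */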
4208 int
4209 socket_lock(struct socket *so, int refcount)
4210 {
4211 int error = 0;
4212 void *lr_saved;
4213
4214 lr_saved = __builtin_return_address(0);
4215
4216 if (so->so_proto->pr_lock) {
4217 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4218 } else {
4219 #ifdef MORE_LOCKING_DEBUG
4220 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4221 LCK_MTX_ASSERT_NOTOWNED);
4222 #endif
4223 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4224 if (refcount)
4225 so->so_usecount++;
4226 so->lock_lr[so->next_lock_lr] = lr_saved;
4227 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4228 }
4229
4230 return (error);
4231 }
4232
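/*
 * Release the lock taken by socket_lock().  With a nonzero refcount
 * argument the socket's use count is dropped as well; releasing the last
 * reference frees the socket via sofreelastref().
 */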
4233 int
4234 socket_unlock(struct socket *so, int refcount)
4235 {
4236 int error = 0;
4237 void *lr_saved;
4238 lck_mtx_t *mutex_held;
4239
4240 lr_saved = __builtin_return_address(0);
4241
4242 if (so == NULL || so->so_proto == NULL)
4243 panic("socket_unlock: null so_proto so=%p\n", so);
4244
4245 if (so->so_proto->pr_unlock) {
4246 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4247 } else {
4248 mutex_held = so->so_proto->pr_domain->dom_mtx;
4249 #ifdef MORE_LOCKING_DEBUG
4250 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4251 #endif
4252 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4253 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4254
4255 if (refcount) {
4256 if (so->so_usecount <= 0)
4257 panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
4258 so->so_usecount, so, so->so_proto->pr_domain->dom_family,
4259 so->so_type, so->so_proto->pr_protocol,
4260 solockhistory_nr(so));
4261
4262 so->so_usecount--;
4263 if (so->so_usecount == 0) {
4264 sofreelastref(so, 1);
4265 }
4266 }
4267 lck_mtx_unlock(mutex_held);
4268 }
4269
4270 return (error);
4271 }
4272
4273 /* Called with socket locked, will unlock socket */
4274 void
4275 sofree(struct socket *so)
4276 {
4277
4278 lck_mtx_t *mutex_held;
4279 if (so->so_proto->pr_getlock != NULL)
4280 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4281 else
4282 mutex_held = so->so_proto->pr_domain->dom_mtx;
4283 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4284
4285 sofreelastref(so, 0);
4286 }
4287
4288 void
4289 soreference(struct socket *so)
4290 {
4291 socket_lock(so, 1); /* lock and take one reference on the socket */
4292 socket_unlock(so, 0); /* unlock only */
4293 }
4294
4295 void
4296 sodereference(struct socket *so)
4297 {
4298 socket_lock(so, 0);
4299 socket_unlock(so, 1);
4300 }
4301
4302 /*
4303 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4304 * possibility of using jumbo clusters. Caller must ensure to hold
4305 * the socket lock.
4306 */
4307 void
4308 somultipages(struct socket *so, boolean_t set)
4309 {
4310 if (set)
4311 so->so_flags |= SOF_MULTIPAGES;
4312 else
4313 so->so_flags &= ~SOF_MULTIPAGES;
4314 }
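/*
 * Illustrative use only (hypothetical caller, not part of this file):
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		allow jumbo (16KB) mbuf clusters
 *	socket_unlock(so, 1);
 */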
4315
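/*
 * Return nonzero when the connected peer (foreign) address of an IPv4 or
 * IPv6 socket is a local address.
 */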
4316 int
4317 so_isdstlocal(struct socket *so) {
4318
4319 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4320
4321 if (so->so_proto->pr_domain->dom_family == AF_INET) {
4322 return inaddr_local(inp->inp_faddr);
4323 } else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
4324 return in6addr_local(&inp->in6p_faddr);
4325 }
4326 return 0;
4327 }
4328
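/*
 * First phase of defuncting: mark the socket SOF_DEFUNCT and set SB_DROP
 * on both socket buffers so no further data is queued.  A socket flagged
 * SOF_NODEFUNCT is left alone and EOPNOTSUPP is returned, unless noforce
 * is FALSE, in which case the opt-out flag is cleared and the socket is
 * defuncted anyway.
 */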
4329 int
4330 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4331 {
4332 int err = 0, defunct;
4333
4334 defunct = (so->so_flags & SOF_DEFUNCT);
4335 if (defunct) {
4336 if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP))
4337 panic("%s: SB_DROP not set", __func__);
4338 goto done;
4339 }
4340
4341 if (so->so_flags & SOF_NODEFUNCT) {
4342 if (noforce) {
4343 err = EOPNOTSUPP;
4344 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p "
4345 "[%d,%d] is not eligible for defunct (%d)\n",
4346 __func__, proc_selfpid(), proc_pid(p), level, so,
4347 INP_SOCKAF(so), INP_SOCKTYPE(so), err));
4348 return (err);
4349 }
4350 so->so_flags &= ~SOF_NODEFUNCT;
4351 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] "
4352 "defunct by force\n", __func__, proc_selfpid(), proc_pid(p),
4353 level, so, INP_SOCKAF(so), INP_SOCKTYPE(so)));
4354 }
4355
4356 so->so_flags |= SOF_DEFUNCT;
4357 /* Prevent further data from being appended to the socket buffers */
4358 so->so_snd.sb_flags |= SB_DROP;
4359 so->so_rcv.sb_flags |= SB_DROP;
4360
4361 done:
4362 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
4363 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
4364 INP_SOCKAF(so), INP_SOCKTYPE(so),
4365 defunct ? "is already" : "marked as"));
4366
4367 return (err);
4368 }
4369
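/*
 * Second phase of defuncting (requires a prior sosetdefunct()): wake any
 * threads blocked on the socket buffers, shut down both directions,
 * disconnect, set a pending error (EBADF) if none exists, release any
 * buffered data, and mark the socket SS_DEFUNCT.
 */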
4370 int
4371 sodefunct(struct proc *p, struct socket *so, int level)
4372 {
4373 struct sockbuf *rcv, *snd;
4374
4375 if (!(so->so_flags & SOF_DEFUNCT))
4376 panic("%s improperly called", __func__);
4377
4378 if (so->so_state & SS_DEFUNCT)
4379 goto done;
4380
4381 rcv = &so->so_rcv;
4382 snd = &so->so_snd;
4383
4384 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
4385 "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
4386 __func__, proc_selfpid(), proc_pid(p), level, so,
4387 INP_SOCKAF(so), INP_SOCKTYPE(so),
4388 (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
4389 (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
4390
4391 /*
4392 * Unwedge threads blocked on sbwait() and sb_lock().
4393 */
4394 sbwakeup(rcv);
4395 sbwakeup(snd);
4396
4397 if (rcv->sb_flags & SB_LOCK)
4398 sbunlock(rcv, 1);
4399 if (snd->sb_flags & SB_LOCK)
4400 sbunlock(snd, 1);
4401
4402 /*
4403 * Flush the buffers and disconnect. We explicitly call shutdown
4404 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4405 * states are set for the socket. This would also flush out data
4406 * hanging off the receive list of this socket.
4407 */
4408 (void) soshutdownlock(so, SHUT_RD);
4409 (void) soshutdownlock(so, SHUT_WR);
4410 (void) sodisconnectlocked(so);
4411
4412 /*
4413 * Explicitly handle connectionless-protocol disconnection
4414 * and release any remaining data in the socket buffers.
4415 */
4416 if (!(so->so_state & SS_ISDISCONNECTED))
4417 (void) soisdisconnected(so);
4418
4419 if (so->so_error == 0)
4420 so->so_error = EBADF;
4421
4422 if (rcv->sb_cc != 0)
4423 sbrelease(rcv);
4424 if (snd->sb_cc != 0)
4425 sbrelease(snd);
4426
4427 so->so_state |= SS_DEFUNCT;
4428
4429 done:
4430 return (0);
4431 }
4432
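/*
 * Set or clear INP_RECV_ANYIF on the socket's PCB (IPv4/IPv6 only);
 * other address families return EPROTONOSUPPORT.
 */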
4433 __private_extern__ int
4434 so_set_recv_anyif(struct socket *so, int optval)
4435 {
4436 int ret = 0;
4437
4438 #if INET6
4439 if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
4440 #else
4441 if (INP_SOCKAF(so) == AF_INET) {
4442 #endif /* !INET6 */
4443 if (optval)
4444 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4445 else
4446 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
4447 } else {
4448 ret = EPROTONOSUPPORT;
4449 }
4450
4451 return (ret);
4452 }
4453
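/*
 * Report whether INP_RECV_ANYIF is currently set on the socket's PCB.
 */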
4454 __private_extern__ int
4455 so_get_recv_anyif(struct socket *so)
4456 {
4457 int ret = 0;
4458
4459 #if INET6
4460 if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
4461 #else
4462 if (INP_SOCKAF(so) == AF_INET) {
4463 #endif /* !INET6 */
4464 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4465 }
4466
4467 return (ret);
4468 }