apple/xnu xnu-2050.24.15: bsd/kern/uipc_socket.c
1 /*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
63 */
64 /*
65 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
66 * support for mandatory and extensible security protections. This notice
67 * is included in support of clause 2.2 (b) of the Apple Public License,
68 * Version 2.0.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/filedesc.h>
74 #include <sys/proc.h>
75 #include <sys/proc_internal.h>
76 #include <sys/kauth.h>
77 #include <sys/file_internal.h>
78 #include <sys/fcntl.h>
79 #include <sys/malloc.h>
80 #include <sys/mbuf.h>
81 #include <sys/domain.h>
82 #include <sys/kernel.h>
83 #include <sys/event.h>
84 #include <sys/poll.h>
85 #include <sys/protosw.h>
86 #include <sys/socket.h>
87 #include <sys/socketvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/signalvar.h>
90 #include <sys/sysctl.h>
91 #include <sys/uio.h>
92 #include <sys/ev.h>
93 #include <sys/kdebug.h>
94 #include <sys/un.h>
95 #include <sys/user.h>
96 #include <sys/priv.h>
97 #include <net/route.h>
98 #include <net/ntstat.h>
99 #include <netinet/in.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/ip6.h>
102 #include <netinet6/ip6_var.h>
103 #include <kern/zalloc.h>
104 #include <kern/locks.h>
105 #include <machine/limits.h>
106 #include <libkern/OSAtomic.h>
107 #include <pexpert/pexpert.h>
108 #include <kern/assert.h>
109 #include <kern/task.h>
110 #include <sys/kpi_mbuf.h>
111 #include <sys/mcache.h>
112
113 #if CONFIG_MACF
114 #include <security/mac.h>
115 #include <security/mac_framework.h>
116 #endif /* MAC */
117
118
119 int so_cache_hw = 0;
120 int so_cache_timeouts = 0;
121 int so_cache_max_freed = 0;
122 int cached_sock_count = 0;
123 __private_extern__ int max_cached_sock_count = MAX_CACHED_SOCKETS;
124 struct socket *socket_cache_head = 0;
125 struct socket *socket_cache_tail = 0;
126 u_int32_t so_cache_time = 0;
127 int so_cache_init_done = 0;
128 struct zone *so_cache_zone;
129
130 static lck_grp_t *so_cache_mtx_grp;
131 static lck_attr_t *so_cache_mtx_attr;
132 static lck_grp_attr_t *so_cache_mtx_grp_attr;
133 lck_mtx_t *so_cache_mtx;
134
135 #include <machine/limits.h>
136
137 static void filt_sordetach(struct knote *kn);
138 static int filt_soread(struct knote *kn, long hint);
139 static void filt_sowdetach(struct knote *kn);
140 static int filt_sowrite(struct knote *kn, long hint);
141 static void filt_sockdetach(struct knote *kn);
142 static int filt_sockev(struct knote *kn, long hint);
143
144 static int
145 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
146
147 static int
148 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p);
149
150 static struct filterops soread_filtops = {
151 .f_isfd = 1,
152 .f_detach = filt_sordetach,
153 .f_event = filt_soread,
154 };
155 static struct filterops sowrite_filtops = {
156 .f_isfd = 1,
157 .f_detach = filt_sowdetach,
158 .f_event = filt_sowrite,
159 };
160 static struct filterops sock_filtops = {
161 .f_isfd = 1,
162 .f_detach = filt_sockdetach,
163 .f_event = filt_sockev,
164 };
165
166 #define EVEN_MORE_LOCKING_DEBUG 0
167 int socket_debug = 0;
168 int socket_zone = M_SOCKET;
169 so_gen_t so_gencnt; /* generation count for sockets */
170
171 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
172 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
173
174 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
175 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
176 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
177 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
178 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
179 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
180 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
181
182 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
183
184
185 SYSCTL_DECL(_kern_ipc);
186
187 int somaxconn = SOMAXCONN;
188 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
189
190 /* Should we get a maximum also ??? */
191 static int sosendmaxchain = 65536;
192 static int sosendminchain = 16384;
193 static int sorecvmincopy = 16384;
194 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain,
195 0, "");
196 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy,
197 0, "");
198
199 /*
200 * Set to enable jumbo clusters (if available) for large writes when
201 * the socket is marked with SOF_MULTIPAGES; see below.
202 */
203 int sosendjcl = 1;
204 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
205
206 /*
207 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
208 * writes on the socket for all protocols on any network interfaces,
209 * depending upon sosendjcl above. Be extra careful when setting this
210 * to 1, because sending packets that cross physical pages down to
211 * broken drivers (those that falsely assume that the physical pages
212 * are contiguous) might lead to system panics or silent data corruption.
213 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
214 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
215 * capable. Set this to 1 only for testing/debugging purposes.
216 */
217 int sosendjcl_ignore_capab = 0;
218 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED,
219 &sosendjcl_ignore_capab, 0, "");
220
221 int sodefunctlog = 0;
222 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
223 &sodefunctlog, 0, "");
224
225 int sothrottlelog = 0;
226 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
227 &sothrottlelog, 0, "");
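/*
 * All of the tunables above are published under the kern.ipc sysctl
 * node (kern.ipc.somaxconn, kern.ipc.sosendminchain, kern.ipc.sorecvmincopy,
 * kern.ipc.sosendjcl, kern.ipc.sosendjcl_ignore_capab, kern.ipc.sodefunctlog,
 * kern.ipc.sothrottlelog) and, being CTLFLAG_RW, can be changed at run
 * time, e.g. sysctl -w kern.ipc.sodefunctlog=1.
 */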
228
229 /*
230 * Socket operation routines.
231 * These routines are called by the routines in
232 * sys_socket.c or from a system process, and
233 * implement the semantics of socket operations by
234 * switching out to the protocol specific routines.
235 */
236
237 /* sys_generic.c */
238 extern void postevent(struct socket *, struct sockbuf *, int);
239 extern void evsofree(struct socket *);
240 extern int tcp_notsent_lowat_check(struct socket *so);
241
242 /* TODO: these should be in header file */
243 extern int get_inpcb_str_size(void);
244 extern int get_tcp_str_size(void);
245 extern struct domain *pffinddomain(int);
246 extern struct protosw *pffindprotonotype(int, int);
247 extern int soclose_locked(struct socket *);
248 extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
249
250 #ifdef __APPLE__
251
252 vm_size_t so_cache_zone_element_size;
253
254 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
255 static void cached_sock_alloc(struct socket **, int);
256 static void cached_sock_free(struct socket *);
257 static void so_cache_timer(void *);
258
259 void soclose_wait_locked(struct socket *so);
260 int so_isdstlocal(struct socket *so);
261
262 /*
263 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
264 * setting the DSCP code on the packet based on the service class; see
265 * <rdar://problem/11277343> for details.
266 */
267 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
269 &sotcdb, 0, "");
270
271 void
272 socketinit(void)
273 {
274 vm_size_t str_size;
275
276 if (so_cache_init_done) {
277 printf("socketinit: already called...\n");
278 return;
279 }
280
281 PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));
282
283 /*
284 * allocate lock group attribute and group for socket cache mutex
285 */
286 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
287
288 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
289 so_cache_mtx_grp_attr);
290
291 /*
292 * allocate the lock attribute for socket cache mutex
293 */
294 so_cache_mtx_attr = lck_attr_alloc_init();
295
296 so_cache_init_done = 1;
297
298 /* cached sockets mutex */
299 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
300
301 if (so_cache_mtx == NULL)
302 return; /* we're hosed... */
303
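/*
 * Each cache element is sized to hold a socket followed by an inpcb
 * and a tcpcb; the extra 4 bytes of slop per structure leave room for
 * the longword alignment performed later in cached_sock_alloc().
 */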
304 str_size = (vm_size_t)(sizeof (struct socket) + 4 +
305 get_inpcb_str_size() + 4 + get_tcp_str_size());
306
307 so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
308 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
309 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
310 #if TEMPDEBUG
311 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
312 #endif
313 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
314
315 so_cache_zone_element_size = str_size;
316
317 sflt_init();
318
319 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
320
321 socket_tclass_init();
322
323 socket_flowadv_init();
324 }
325
326 static void
327 cached_sock_alloc(struct socket **so, int waitok)
328 {
329 caddr_t temp;
330 register uintptr_t offset;
331
332 lck_mtx_lock(so_cache_mtx);
333
334 if (cached_sock_count) {
335 cached_sock_count--;
336 *so = socket_cache_head;
337 if (*so == 0)
338 panic("cached_sock_alloc: cached sock is null");
339
340 socket_cache_head = socket_cache_head->cache_next;
341 if (socket_cache_head)
342 socket_cache_head->cache_prev = 0;
343 else
344 socket_cache_tail = 0;
345
346 lck_mtx_unlock(so_cache_mtx);
347
348 temp = (*so)->so_saved_pcb;
349 bzero((caddr_t)*so, sizeof (struct socket));
350 #if TEMPDEBUG
351 kprintf("cached_sock_alloc - retreiving cached sock %p - "
352 "count == %d\n", *so, cached_sock_count);
353 #endif
354 (*so)->so_saved_pcb = temp;
355 (*so)->cached_in_sock_layer = 1;
356 } else {
357 #if TEMPDEBUG
358 kprintf("Allocating cached sock %p from memory\n", *so);
359 #endif
360
361 lck_mtx_unlock(so_cache_mtx);
362
363 if (waitok)
364 *so = (struct socket *)zalloc(so_cache_zone);
365 else
366 *so = (struct socket *)zalloc_noblock(so_cache_zone);
367
368 if (*so == 0)
369 return;
370
371 bzero((caddr_t)*so, sizeof (struct socket));
372
373 /*
374 * Define offsets for extra structures into our single block of
375 * memory. Align extra structures on longword boundaries.
376 */
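/*
 * The resulting single allocation is laid out roughly as:
 *
 *   [ struct socket ][ pad ][ inpcb ][ pad ][ tcpcb ]
 *                            ^                ^
 *                       so_saved_pcb     inp_saved_ppcb
 */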
377
378 offset = (uintptr_t) *so;
379 offset += sizeof (struct socket);
380
381 offset = ALIGN(offset);
382
383 (*so)->so_saved_pcb = (caddr_t)offset;
384 offset += get_inpcb_str_size();
385
386 offset = ALIGN(offset);
387
388 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
389 (caddr_t)offset;
390 #if TEMPDEBUG
391 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
392 *so, (*so)->so_saved_pcb,
393 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
394 #endif
395 }
396
397 (*so)->cached_in_sock_layer = 1;
398 }
399
400 static void
401 cached_sock_free(struct socket *so)
402 {
403
404 lck_mtx_lock(so_cache_mtx);
405
406 if (++cached_sock_count > max_cached_sock_count) {
407 --cached_sock_count;
408 lck_mtx_unlock(so_cache_mtx);
409 #if TEMPDEBUG
410 kprintf("Freeing overflowed cached socket %p\n", so);
411 #endif
412 zfree(so_cache_zone, so);
413 } else {
414 #if TEMPDEBUG
415 kprintf("Freeing socket %p into cache\n", so);
416 #endif
417 if (so_cache_hw < cached_sock_count)
418 so_cache_hw = cached_sock_count;
419
420 so->cache_next = socket_cache_head;
421 so->cache_prev = 0;
422 if (socket_cache_head)
423 socket_cache_head->cache_prev = so;
424 else
425 socket_cache_tail = so;
426
427 so->cache_timestamp = so_cache_time;
428 socket_cache_head = so;
429 lck_mtx_unlock(so_cache_mtx);
430 }
431
432 #if TEMPDEBUG
433 kprintf("Freed cached sock %p into cache - count is %d\n",
434 so, cached_sock_count);
435 #endif
436 }
437
438 static void
439 so_update_last_owner_locked(
440 struct socket *so,
441 proc_t self)
442 {
443 if (so->last_pid != 0)
444 {
445 if (self == NULL)
446 self = current_proc();
447
448 if (self)
449 {
450 so->last_upid = proc_uniqueid(self);
451 so->last_pid = proc_pid(self);
452 }
453 }
454 }
455
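/*
 * Timer callback that ages the socket cache: entries that have sat in
 * the cache for at least SO_CACHE_TIME_LIMIT intervals are freed back
 * to the zone, at most SO_CACHE_MAX_FREE_BATCH of them per pass, and
 * the timer then re-arms itself for SO_CACHE_FLUSH_INTERVAL seconds.
 */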
456 static void
457 so_cache_timer(__unused void *dummy)
458 {
459 register struct socket *p;
460 register int n_freed = 0;
461
462 lck_mtx_lock(so_cache_mtx);
463
464 ++so_cache_time;
465
466 while ((p = socket_cache_tail)) {
467 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
468 break;
469
470 so_cache_timeouts++;
471
472 if ((socket_cache_tail = p->cache_prev))
473 p->cache_prev->cache_next = 0;
474 if (--cached_sock_count == 0)
475 socket_cache_head = 0;
476
477 zfree(so_cache_zone, p);
478
479 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
480 so_cache_max_freed++;
481 break;
482 }
483 }
484 lck_mtx_unlock(so_cache_mtx);
485
486 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
487 }
488 #endif /* __APPLE__ */
489
490 /*
491 * Get a socket structure from our zone, and initialize it.
492 * We don't implement `waitok' yet (see comments in uipc_domain.c).
493 * Note that it would probably be better to allocate socket
494 * and PCB at the same time, but I'm not convinced that all
495 * the protocols can be easily modified to do this.
496 */
497 struct socket *
498 soalloc(int waitok, int dom, int type)
499 {
500 struct socket *so;
501
502 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
503 cached_sock_alloc(&so, waitok);
504 } else {
505 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
506 M_WAITOK);
507 if (so != NULL)
508 bzero(so, sizeof (*so));
509 }
510 /* XXX race condition for reentrant kernel */
511 //###LD Atomic add for so_gencnt
512 if (so != NULL) {
513 so->so_gencnt = ++so_gencnt;
514 so->so_zone = socket_zone;
515 #if CONFIG_MACF_SOCKET
516 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
517 if (mac_socket_label_init(so, !waitok) != 0) {
518 sodealloc(so);
519 return (NULL);
520 }
521 #endif /* MAC_SOCKET */
522 }
523
524 return (so);
525 }
526
527 /*
528 * Returns: 0 Success
529 * EAFNOSUPPORT
530 * EPROTOTYPE
531 * EPROTONOSUPPORT
532 * ENOBUFS
533 * <pru_attach>:ENOBUFS[AF_UNIX]
534 * <pru_attach>:ENOBUFS[TCP]
535 * <pru_attach>:ENOMEM[TCP]
536 * <pru_attach>:EISCONN[TCP]
537 * <pru_attach>:??? [other protocol families, IPSEC]
538 */
539 int
540 socreate(int dom, struct socket **aso, int type, int proto)
541 {
542 struct proc *p = current_proc();
543 register struct protosw *prp;
544 register struct socket *so;
545 register int error = 0;
546
547 #if TCPDEBUG
548 extern int tcpconsdebug;
549 #endif
550 if (proto)
551 prp = pffindproto(dom, proto, type);
552 else
553 prp = pffindtype(dom, type);
554
555 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
556 if (pffinddomain(dom) == NULL) {
557 return (EAFNOSUPPORT);
558 }
559 if (proto != 0) {
560 if (pffindprotonotype(dom, proto) != NULL) {
561 return (EPROTOTYPE);
562 }
563 }
564 return (EPROTONOSUPPORT);
565 }
566 if (prp->pr_type != type)
567 return (EPROTOTYPE);
568 so = soalloc(1, dom, type);
569 if (so == 0)
570 return (ENOBUFS);
571
572 TAILQ_INIT(&so->so_incomp);
573 TAILQ_INIT(&so->so_comp);
574 so->so_type = type;
575 so->last_upid = proc_uniqueid(p);
576 so->last_pid = proc_pid(p);
577
578 so->so_cred = kauth_cred_proc_ref(p);
579 if (!suser(kauth_cred_get(), NULL))
580 so->so_state = SS_PRIV;
581
582 so->so_proto = prp;
583 #ifdef __APPLE__
584 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
585 so->so_rcv.sb_so = so->so_snd.sb_so = so;
586 #endif
587 so->next_lock_lr = 0;
588 so->next_unlock_lr = 0;
589
590 #if CONFIG_MACF_SOCKET
591 mac_socket_label_associate(kauth_cred_get(), so);
592 #endif /* MAC_SOCKET */
593
594 //### Attachment will create the per-pcb lock if necessary and increase the refcount
595 /*
596 * For creation, make sure it's done before
597 * the socket is inserted in lists
598 */
599 so->so_usecount++;
600
601 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
602 if (error) {
603 /*
604 * Warning:
605 * If so_pcb is not zero, the socket will be leaked,
606 * so the protocol attachment handler must be coded carefully
607 */
608 so->so_state |= SS_NOFDREF;
609 so->so_usecount--;
610 sofreelastref(so, 1); /* will deallocate the socket */
611 return (error);
612 }
613 #ifdef __APPLE__
614 prp->pr_domain->dom_refs++;
615 TAILQ_INIT(&so->so_evlist);
616
617 /* Attach socket filters for this protocol */
618 sflt_initsock(so);
619 #if TCPDEBUG
620 if (tcpconsdebug == 2)
621 so->so_options |= SO_DEBUG;
622 #endif
623 #endif
624 so_set_default_traffic_class(so);
625 /*
626 * If this is a background thread/task, mark the socket as such.
627 */
628 if (proc_get_self_isbackground() != 0) {
629 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
630 so->so_background_thread = current_thread();
631 }
632
633 switch (dom) {
634 /*
635 * Don't mark Unix domain or system sockets as eligible for defunct by default.
636 */
637 case PF_LOCAL:
638 case PF_SYSTEM:
639 so->so_flags |= SOF_NODEFUNCT;
640 break;
641 default:
642 break;
643 }
644
645 *aso = so;
646 return (0);
647 }
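/*
 * Illustrative only: an in-kernel caller would typically obtain a TCP
 * socket with something like
 *
 *	struct socket *so;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * and release it later with soclose(so); the socket(2) system call and
 * the sock_socket() KPI both funnel into this routine.
 */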
648
649 /*
650 * Returns: 0 Success
651 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
652 * <pru_bind>:EAFNOSUPPORT Address family not supported
653 * <pru_bind>:EADDRNOTAVAIL Address not available.
654 * <pru_bind>:EINVAL Invalid argument
655 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
656 * <pru_bind>:EACCES Permission denied
657 * <pru_bind>:EADDRINUSE Address in use
658 * <pru_bind>:EAGAIN Resource unavailable, try again
659 * <pru_bind>:EPERM Operation not permitted
660 * <pru_bind>:???
661 * <sf_bind>:???
662 *
663 * Notes: It's not possible to fully enumerate the return codes above,
664 * since socket filter authors and protocol family authors may
665 * not choose to limit their error returns to those listed, even
666 * though this may result in some software operating incorrectly.
667 *
668 * The error codes which are enumerated above are those known to
669 * be returned by the tcp_usr_bind function supplied.
670 */
671 int
672 sobind(struct socket *so, struct sockaddr *nam)
673 {
674 struct proc *p = current_proc();
675 int error = 0;
676
677 socket_lock(so, 1);
678 VERIFY(so->so_usecount > 1);
679 so_update_last_owner_locked(so, p);
680
681 /*
682 * If this is a bind request on a socket that has been marked
683 * as inactive, reject it now before we go any further.
684 */
685 if (so->so_flags & SOF_DEFUNCT) {
686 error = EINVAL;
687 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
688 __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
689 error));
690 goto out;
691 }
692
693 /* Socket filter */
694 error = sflt_bind(so, nam);
695
696 if (error == 0)
697 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
698 out:
699 socket_unlock(so, 1);
700
701 if (error == EJUSTRETURN)
702 error = 0;
703
704 return (error);
705 }
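/*
 * Note that a socket filter's bind handler may return EJUSTRETURN to
 * claim the bind: pru_bind is then skipped and, because EJUSTRETURN is
 * mapped to 0 above, the caller still sees success.
 */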
706
707 void
708 sodealloc(struct socket *so)
709 {
710 kauth_cred_unref(&so->so_cred);
711
712 /* Remove any filters */
713 sflt_termsock(so);
714
715 so->so_gencnt = ++so_gencnt;
716
717 #if CONFIG_MACF_SOCKET
718 mac_socket_label_destroy(so);
719 #endif /* MAC_SOCKET */
720 if (so->cached_in_sock_layer == 1) {
721 cached_sock_free(so);
722 } else {
723 if (so->cached_in_sock_layer == -1)
724 panic("sodealloc: double dealloc: so=%p\n", so);
725 so->cached_in_sock_layer = -1;
726 FREE_ZONE(so, sizeof (*so), so->so_zone);
727 }
728 }
729
730 /*
731 * Returns: 0 Success
732 * EINVAL
733 * EOPNOTSUPP
734 * <pru_listen>:EINVAL[AF_UNIX]
735 * <pru_listen>:EINVAL[TCP]
736 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
737 * <pru_listen>:EINVAL[TCP] Invalid argument
738 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
739 * <pru_listen>:EACCES[TCP] Permission denied
740 * <pru_listen>:EADDRINUSE[TCP] Address in use
741 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
742 * <pru_listen>:EPERM[TCP] Operation not permitted
743 * <sf_listen>:???
744 *
745 * Notes: Other <pru_listen> returns depend on the protocol family; all
746 * <sf_listen> returns depend on what the filter author causes
747 * their filter to return.
748 */
749 int
750 solisten(struct socket *so, int backlog)
751 {
752 struct proc *p = current_proc();
753 int error = 0;
754
755 socket_lock(so, 1);
756
757 if (so->so_proto == NULL) {
758 error = EINVAL;
759 goto out;
760 }
761 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
762 error = EOPNOTSUPP;
763 goto out;
764 }
765
766 /*
767 * If the listen request is made on a socket that is not fully
768 * disconnected, or on a socket that has been marked as inactive,
769 * reject the request now.
770 */
771 if ((so->so_state &
772 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
773 (so->so_flags & SOF_DEFUNCT)) {
774 error = EINVAL;
775 if (so->so_flags & SOF_DEFUNCT) {
776 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
777 __func__, proc_pid(p), so, INP_SOCKAF(so),
778 INP_SOCKTYPE(so), error));
779 }
780 goto out;
781 }
782
783 if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
784 error = EPERM;
785 goto out;
786 }
787
788 error = sflt_listen(so);
789
790 if (error == 0) {
791 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
792 }
793
794 if (error) {
795 if (error == EJUSTRETURN)
796 error = 0;
797 goto out;
798 }
799
800 if (TAILQ_EMPTY(&so->so_comp))
801 so->so_options |= SO_ACCEPTCONN;
802 /*
803 * POSIX: The implementation may have an upper limit on the length of
804 * the listen queue, either global or per accepting socket. If backlog
805 * exceeds this limit, the length of the listen queue is set to the
806 * limit.
807 *
808 * If listen() is called with a backlog argument value that is less
809 * than 0, the function behaves as if it had been called with a backlog
810 * argument value of 0.
811 *
812 * A backlog argument of 0 may allow the socket to accept connections,
813 * in which case the length of the listen queue may be set to an
814 * implementation-defined minimum value.
815 */
816 if (backlog <= 0 || backlog > somaxconn)
817 backlog = somaxconn;
818
819 so->so_qlimit = backlog;
820 out:
821 socket_unlock(so, 1);
822 return (error);
823 }
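/*
 * For example, with the default kern.ipc.somaxconn, listen(fd, 0) and
 * listen(fd, 5000) both leave so_qlimit at somaxconn, while
 * listen(fd, 10) keeps a queue limit of 10.
 */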
824
825 void
826 sofreelastref(struct socket *so, int dealloc)
827 {
828 struct socket *head = so->so_head;
829
830 /* Assume socket is locked */
831
832 if ((!(so->so_flags & SOF_PCBCLEARING)) ||
833 ((so->so_state & SS_NOFDREF) == 0)) {
834 #ifdef __APPLE__
835 selthreadclear(&so->so_snd.sb_sel);
836 selthreadclear(&so->so_rcv.sb_sel);
837 so->so_rcv.sb_flags &= ~SB_UPCALL;
838 so->so_snd.sb_flags &= ~SB_UPCALL;
839 #endif
840 return;
841 }
842 if (head != NULL) {
843 socket_lock(head, 1);
844 if (so->so_state & SS_INCOMP) {
845 TAILQ_REMOVE(&head->so_incomp, so, so_list);
846 head->so_incqlen--;
847 } else if (so->so_state & SS_COMP) {
848 /*
849 * We must not decommission a socket that's
850 * on the accept(2) queue. If we do, then
851 * accept(2) may hang after select(2) indicated
852 * that the listening socket was ready.
853 */
854 #ifdef __APPLE__
855 selthreadclear(&so->so_snd.sb_sel);
856 selthreadclear(&so->so_rcv.sb_sel);
857 so->so_rcv.sb_flags &= ~SB_UPCALL;
858 so->so_snd.sb_flags &= ~SB_UPCALL;
859 #endif
860 socket_unlock(head, 1);
861 return;
862 } else {
863 panic("sofree: not queued");
864 }
865 head->so_qlen--;
866 so->so_state &= ~SS_INCOMP;
867 so->so_head = NULL;
868 socket_unlock(head, 1);
869 }
870 #ifdef __APPLE__
871 selthreadclear(&so->so_snd.sb_sel);
872 sbrelease(&so->so_snd);
873 #endif
874 sorflush(so);
875
876 /* 3932268: disable upcall */
877 so->so_rcv.sb_flags &= ~SB_UPCALL;
878 so->so_snd.sb_flags &= ~SB_UPCALL;
879
880 if (dealloc)
881 sodealloc(so);
882 }
883
884 void
885 soclose_wait_locked(struct socket *so)
886 {
887 lck_mtx_t *mutex_held;
888
889 if (so->so_proto->pr_getlock != NULL)
890 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
891 else
892 mutex_held = so->so_proto->pr_domain->dom_mtx;
893 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
894
895 /*
896 * Double check here and return if there's no outstanding upcall;
897 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
898 */
899 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
900 return;
901 so->so_rcv.sb_flags &= ~SB_UPCALL;
902 so->so_snd.sb_flags &= ~SB_UPCALL;
903 so->so_flags |= SOF_CLOSEWAIT;
904 (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
905 "soclose_wait_locked", NULL);
906 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
907 so->so_flags &= ~SOF_CLOSEWAIT;
908 }
909
910 /*
911 * Close a socket on last file table reference removal.
912 * Initiate disconnect if connected.
913 * Free socket when disconnect complete.
914 */
915 int
916 soclose_locked(struct socket *so)
917 {
918 int error = 0;
919 lck_mtx_t *mutex_held;
920 struct timespec ts;
921
922 if (so->so_usecount == 0) {
923 panic("soclose: so=%p refcount=0\n", so);
924 }
925
926 sflt_notify(so, sock_evt_closing, NULL);
927
928 if ((so->so_options & SO_ACCEPTCONN)) {
929 struct socket *sp, *sonext;
930 int socklock = 0;
931
932 /*
933 * We do not want new connections to be added
934 * to the connection queues
935 */
936 so->so_options &= ~SO_ACCEPTCONN;
937
938 for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
939 sonext = TAILQ_NEXT(sp, so_list);
940
941 /* Radar 5350314
942 * Skip sockets thrown away by tcpdropdropblreq;
943 * they will get cleaned up by the garbage collection.
944 * Otherwise, remove the incomplete socket from the queue
945 * and let soabort trigger the appropriate cleanup.
946 */
947 if (sp->so_flags & SOF_OVERFLOW)
948 continue;
949
950 if (so->so_proto->pr_getlock != NULL) {
951 /* For lock ordering consistency with the rest of the stack,
952 * we lock the socket first and then grab the head.
953 */
954 socket_unlock(so, 0);
955 socket_lock(sp, 1);
956 socket_lock(so, 0);
957 socklock = 1;
958 }
959
960 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
961 so->so_incqlen--;
962
963 if (sp->so_state & SS_INCOMP) {
964 sp->so_state &= ~SS_INCOMP;
965 sp->so_head = NULL;
966
967 (void) soabort(sp);
968 }
969
970 if (socklock)
971 socket_unlock(sp, 1);
972 }
973
974 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
975 /* Dequeue from so_comp since sofree() won't do it */
976 TAILQ_REMOVE(&so->so_comp, sp, so_list);
977 so->so_qlen--;
978
979 if (so->so_proto->pr_getlock != NULL) {
980 socket_unlock(so, 0);
981 socket_lock(sp, 1);
982 }
983
984 if (sp->so_state & SS_COMP) {
985 sp->so_state &= ~SS_COMP;
986 sp->so_head = NULL;
987
988 (void) soabort(sp);
989 }
990
991 if (so->so_proto->pr_getlock != NULL) {
992 socket_unlock(sp, 1);
993 socket_lock(so, 0);
994 }
995 }
996 }
997 if (so->so_pcb == 0) {
998 /* 3915887: mark the socket as ready for dealloc */
999 so->so_flags |= SOF_PCBCLEARING;
1000 goto discard;
1001 }
1002 if (so->so_state & SS_ISCONNECTED) {
1003 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1004 error = sodisconnectlocked(so);
1005 if (error)
1006 goto drop;
1007 }
1008 if (so->so_options & SO_LINGER) {
1009 if ((so->so_state & SS_ISDISCONNECTING) &&
1010 (so->so_state & SS_NBIO))
1011 goto drop;
1012 if (so->so_proto->pr_getlock != NULL)
1013 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1014 else
1015 mutex_held = so->so_proto->pr_domain->dom_mtx;
1016 while (so->so_state & SS_ISCONNECTED) {
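/*
 * so_linger is kept in clock ticks here; assuming the usual 100
 * ticks per second, this converts it to whole seconds plus the
 * residual ticks at 10 ms (10 * 1000 * NSEC_PER_USEC ns) each.
 */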
1017 ts.tv_sec = (so->so_linger/100);
1018 ts.tv_nsec = (so->so_linger % 100) *
1019 NSEC_PER_USEC * 1000 * 10;
1020 error = msleep((caddr_t)&so->so_timeo,
1021 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1022 if (error) {
1023 /*
1024 * It's OK when the timer fires;
1025 * don't report an error
1026 */
1027 if (error == EWOULDBLOCK)
1028 error = 0;
1029 break;
1030 }
1031 }
1032 }
1033 }
1034 drop:
1035 if (so->so_usecount == 0)
1036 panic("soclose: usecount is zero so=%p\n", so);
1037 if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
1038 /*
1039 * Let NetworkStatistics know this PCB is going away
1040 * before we detach it.
1041 */
1042 if (nstat_collect &&
1043 (so->so_proto->pr_domain->dom_family == AF_INET ||
1044 so->so_proto->pr_domain->dom_family == AF_INET6))
1045 nstat_pcb_detach(so->so_pcb);
1046
1047 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1048 if (error == 0)
1049 error = error2;
1050 }
1051 if (so->so_usecount <= 0)
1052 panic("soclose: usecount is zero so=%p\n", so);
1053 discard:
1054 if (so->so_pcb && so->so_state & SS_NOFDREF)
1055 panic("soclose: NOFDREF");
1056 so->so_state |= SS_NOFDREF;
1057
1058 if ((so->so_flags & SOF_KNOTE) != 0)
1059 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1060 #ifdef __APPLE__
1061 so->so_proto->pr_domain->dom_refs--;
1062 evsofree(so);
1063 #endif
1064 so->so_usecount--;
1065 sofree(so);
1066 return (error);
1067 }
1068
1069 int
1070 soclose(struct socket *so)
1071 {
1072 int error = 0;
1073 socket_lock(so, 1);
1074
1075 if (so->so_upcallusecount)
1076 soclose_wait_locked(so);
1077
1078 if (so->so_retaincnt == 0) {
1079 error = soclose_locked(so);
1080 } else {
1081 /*
1082 * If the FD is going away but the socket is
1083 * retained in the kernel, remove its reference
1084 */
1085 so->so_usecount--;
1086 if (so->so_usecount < 2)
1087 panic("soclose: retaincnt non null and so=%p "
1088 "usecount=%d\n", so, so->so_usecount);
1089 }
1090 socket_unlock(so, 1);
1091 return (error);
1092 }
1093
1094 /*
1095 * Must be called at splnet...
1096 */
1097 /* Should already be locked */
1098 int
1099 soabort(struct socket *so)
1100 {
1101 int error;
1102
1103 #ifdef MORE_LOCKING_DEBUG
1104 lck_mtx_t *mutex_held;
1105
1106 if (so->so_proto->pr_getlock != NULL)
1107 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1108 else
1109 mutex_held = so->so_proto->pr_domain->dom_mtx;
1110 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1111 #endif
1112
1113 if ((so->so_flags & SOF_ABORTED) == 0) {
1114 so->so_flags |= SOF_ABORTED;
1115 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1116 if (error) {
1117 sofree(so);
1118 return (error);
1119 }
1120 }
1121 return (0);
1122 }
1123
1124 int
1125 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1126 {
1127 int error;
1128
1129 if (dolock)
1130 socket_lock(so, 1);
1131
1132 if ((so->so_state & SS_NOFDREF) == 0)
1133 panic("soaccept: !NOFDREF");
1134 so->so_state &= ~SS_NOFDREF;
1135 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1136
1137 if (dolock)
1138 socket_unlock(so, 1);
1139 return (error);
1140 }
1141
1142 int
1143 soaccept(struct socket *so, struct sockaddr **nam)
1144 {
1145 return (soacceptlock(so, nam, 1));
1146 }
1147
1148 int
1149 soacceptfilter(struct socket *so)
1150 {
1151 struct sockaddr *local = NULL, *remote = NULL;
1152 int error = 0;
1153 struct socket *head = so->so_head;
1154
1155 /*
1156 * Hold the lock even if this socket
1157 * has not been made visible to the filter(s).
1158 * For sockets with global locks, this protects against the
1159 * head or peer going away
1160 */
1161 socket_lock(so, 1);
1162 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1163 sogetaddr_locked(so, &local, 0) != 0) {
1164 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1165 so->so_head = NULL;
1166 socket_unlock(so, 1);
1167 soclose(so);
1168 /* Out of resources; try it again next time */
1169 error = ECONNABORTED;
1170 goto done;
1171 }
1172
1173 error = sflt_accept(head, so, local, remote);
1174
1175 /*
1176 * If we get EJUSTRETURN from one of the filters, mark this socket
1177 * as inactive and return it anyway. This newly accepted socket
1178 * will be disconnected later before we hand it off to the caller.
1179 */
1180 if (error == EJUSTRETURN) {
1181 error = 0;
1182 (void) sosetdefunct(current_proc(), so,
1183 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1184 }
1185
1186 if (error != 0) {
1187 /*
1188 * This may seem like a duplication of the error handling
1189 * above where we return ECONNABORTED, except
1190 * the following is done while holding the lock since
1191 * the socket has been exposed to the filter(s) earlier.
1192 */
1193 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1194 so->so_head = NULL;
1195 socket_unlock(so, 1);
1196 soclose(so);
1197 /* Propagate socket filter's error code to the caller */
1198 } else {
1199 socket_unlock(so, 1);
1200 }
1201 done:
1202 /* Callee checks for NULL pointer */
1203 sock_freeaddr(remote);
1204 sock_freeaddr(local);
1205 return (error);
1206 }
1207
1208 /*
1209 * Returns: 0 Success
1210 * EOPNOTSUPP Operation not supported on socket
1211 * EISCONN Socket is connected
1212 * <pru_connect>:EADDRNOTAVAIL Address not available.
1213 * <pru_connect>:EINVAL Invalid argument
1214 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1215 * <pru_connect>:EACCES Permission denied
1216 * <pru_connect>:EADDRINUSE Address in use
1217 * <pru_connect>:EAGAIN Resource unavailable, try again
1218 * <pru_connect>:EPERM Operation not permitted
1219 * <sf_connect_out>:??? [anything a filter writer might set]
1220 */
1221 int
1222 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1223 {
1224 int error;
1225 struct proc *p = current_proc();
1226
1227 if (dolock)
1228 socket_lock(so, 1);
1229
1230 /*
1231 * If this is a listening socket or if this is a previously-accepted
1232 * socket that has been marked as inactive, reject the connect request.
1233 */
1234 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1235 error = EOPNOTSUPP;
1236 if (so->so_flags & SOF_DEFUNCT) {
1237 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
1238 __func__, proc_pid(p), so, INP_SOCKAF(so),
1239 INP_SOCKTYPE(so), error));
1240 }
1241 if (dolock)
1242 socket_unlock(so, 1);
1243 return (error);
1244 }
1245
1246 if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
1247 if (dolock)
1248 socket_unlock(so, 1);
1249 return (EPERM);
1250 }
1251
1252 /*
1253 * If protocol is connection-based, can only connect once.
1254 * Otherwise, if connected, try to disconnect first.
1255 * This allows user to disconnect by connecting to, e.g.,
1256 * a null address.
1257 */
1258 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1259 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1260 (error = sodisconnectlocked(so)))) {
1261 error = EISCONN;
1262 } else {
1263 /*
1264 * Run connect filter before calling protocol:
1265 * - non-blocking connect returns before completion;
1266 */
1267 error = sflt_connectout(so, nam);
1268
1269 if (error) {
1270 if (error == EJUSTRETURN)
1271 error = 0;
1272 } else {
1273 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
1274 }
1275 }
1276 if (dolock)
1277 socket_unlock(so, 1);
1278 return (error);
1279 }
1280
1281 int
1282 soconnect(struct socket *so, struct sockaddr *nam)
1283 {
1284 return (soconnectlock(so, nam, 1));
1285 }
1286
1287 /*
1288 * Returns: 0 Success
1289 * <pru_connect2>:EINVAL[AF_UNIX]
1290 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1291 * <pru_connect2>:??? [other protocol families]
1292 *
1293 * Notes: <pru_connect2> is not supported by [TCP].
1294 */
1295 int
1296 soconnect2(struct socket *so1, struct socket *so2)
1297 {
1298 int error;
1299
1300 socket_lock(so1, 1);
1301 if (so2->so_proto->pr_lock)
1302 socket_lock(so2, 1);
1303
1304 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1305
1306 socket_unlock(so1, 1);
1307 if (so2->so_proto->pr_lock)
1308 socket_unlock(so2, 1);
1309 return (error);
1310 }
1311
1312 int
1313 sodisconnectlocked(struct socket *so)
1314 {
1315 int error;
1316
1317 if ((so->so_state & SS_ISCONNECTED) == 0) {
1318 error = ENOTCONN;
1319 goto bad;
1320 }
1321 if (so->so_state & SS_ISDISCONNECTING) {
1322 error = EALREADY;
1323 goto bad;
1324 }
1325
1326 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1327
1328 if (error == 0) {
1329 sflt_notify(so, sock_evt_disconnected, NULL);
1330 }
1331 bad:
1332 return (error);
1333 }
1334
1335 /* Locking version */
1336 int
1337 sodisconnect(struct socket *so)
1338 {
1339 int error;
1340
1341 socket_lock(so, 1);
1342 error = sodisconnectlocked(so);
1343 socket_unlock(so, 1);
1344 return (error);
1345 }
1346
1347 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1348
1349 /*
1350 * sosendcheck will lock the socket buffer if it isn't locked and
1351 * verify that there is space for the data being inserted.
1352 *
1353 * Returns: 0 Success
1354 * EPIPE
1355 * sblock:EWOULDBLOCK
1356 * sblock:EINTR
1357 * sbwait:EBADF
1358 * sbwait:EINTR
1359 * [so_error]:???
1360 */
1361 static int
1362 sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
1363 int32_t atomic, int flags, int *sblocked)
1364 {
1365 int error = 0;
1366 int32_t space;
1367 int assumelock = 0;
1368
1369 restart:
1370 if (*sblocked == 0) {
1371 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1372 so->so_send_filt_thread != 0 &&
1373 so->so_send_filt_thread == current_thread()) {
1374 /*
1375 * We're being called recursively from a filter,
1376 * allow this to continue. Radar 4150520.
1377 * Don't set sblocked because we don't want
1378 * to perform an unlock later.
1379 */
1380 assumelock = 1;
1381 } else {
1382 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1383 if (error) {
1384 if (so->so_flags & SOF_DEFUNCT)
1385 goto defunct;
1386 return (error);
1387 }
1388 *sblocked = 1;
1389 }
1390 }
1391
1392 /*
1393 * If a send attempt is made on a socket that has been marked
1394 * as inactive (disconnected), reject the request.
1395 */
1396 if (so->so_flags & SOF_DEFUNCT) {
1397 defunct:
1398 error = EPIPE;
1399 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1400 proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
1401 error));
1402 return (error);
1403 }
1404
1405 if (so->so_state & SS_CANTSENDMORE)
1406 return (EPIPE);
1407
1408 if (so->so_error) {
1409 error = so->so_error;
1410 so->so_error = 0;
1411 return (error);
1412 }
1413
1414 if ((so->so_state & SS_ISCONNECTED) == 0) {
1415 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1416 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1417 !(resid == 0 && clen != 0))
1418 return (ENOTCONN);
1419 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1420 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1421 ENOTCONN : EDESTADDRREQ);
1422 }
1423 }
1424 space = sbspace(&so->so_snd);
1425 if (flags & MSG_OOB)
1426 space += 1024;
1427 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1428 clen > so->so_snd.sb_hiwat)
1429 return (EMSGSIZE);
1430 if ((space < resid + clen &&
1431 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1432 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1433 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1434 assumelock) {
1435 return (EWOULDBLOCK);
1436 }
1437 sbunlock(&so->so_snd, 1);
1438 *sblocked = 0;
1439 error = sbwait(&so->so_snd);
1440 if (error) {
1441 if (so->so_flags & SOF_DEFUNCT)
1442 goto defunct;
1443 return (error);
1444 }
1445 goto restart;
1446 }
1447
1448 return (0);
1449 }
1450
1451 /*
1452 * Send on a socket.
1453 * If send must go all at once and message is larger than
1454 * send buffering, then hard error.
1455 * Lock against other senders.
1456 * If must go all at once and not enough room now, then
1457 * inform user that this would block and do nothing.
1458 * Otherwise, if nonblocking, send as much as possible.
1459 * The data to be sent is described by "uio" if nonzero,
1460 * otherwise by the mbuf chain "top" (which must be null
1461 * if uio is not). Data provided in mbuf chain must be small
1462 * enough to send all at once.
1463 *
1464 * Returns nonzero on error, timeout or signal; callers
1465 * must check for short counts if EINTR/ERESTART are returned.
1466 * Data and control buffers are freed on return.
1467 * Experiment:
1468 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1469 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1470 * point at the mbuf chain being constructed and go from there.
1471 *
1472 * Returns: 0 Success
1473 * EOPNOTSUPP
1474 * EINVAL
1475 * ENOBUFS
1476 * uiomove:EFAULT
1477 * sosendcheck:EPIPE
1478 * sosendcheck:EWOULDBLOCK
1479 * sosendcheck:EINTR
1480 * sosendcheck:EBADF
1481 * sosendcheck:EINTR
1482 * sosendcheck:??? [value from so_error]
1483 * <pru_send>:ECONNRESET[TCP]
1484 * <pru_send>:EINVAL[TCP]
1485 * <pru_send>:ENOBUFS[TCP]
1486 * <pru_send>:EADDRINUSE[TCP]
1487 * <pru_send>:EADDRNOTAVAIL[TCP]
1488 * <pru_send>:EAFNOSUPPORT[TCP]
1489 * <pru_send>:EACCES[TCP]
1490 * <pru_send>:EAGAIN[TCP]
1491 * <pru_send>:EPERM[TCP]
1492 * <pru_send>:EMSGSIZE[TCP]
1493 * <pru_send>:EHOSTUNREACH[TCP]
1494 * <pru_send>:ENETUNREACH[TCP]
1495 * <pru_send>:ENETDOWN[TCP]
1496 * <pru_send>:ENOMEM[TCP]
1497 * <pru_send>:ENOBUFS[TCP]
1498 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1499 * <pru_send>:EINVAL[AF_UNIX]
1500 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1501 * <pru_send>:EPIPE[AF_UNIX]
1502 * <pru_send>:ENOTCONN[AF_UNIX]
1503 * <pru_send>:EISCONN[AF_UNIX]
1504 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1505 * <sf_data_out>:??? [whatever a filter author chooses]
1506 *
1507 * Notes: Other <pru_send> returns depend on the protocol family; all
1508 * <sf_data_out> returns depend on what the filter author causes
1509 * their filter to return.
1510 */
1511 int
1512 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1513 struct mbuf *top, struct mbuf *control, int flags)
1514 {
1515 struct mbuf **mp;
1516 register struct mbuf *m, *freelist = NULL;
1517 register int32_t space, len, resid;
1518 int clen = 0, error, dontroute, mlen, sendflags;
1519 int atomic = sosendallatonce(so) || top;
1520 int sblocked = 0;
1521 struct proc *p = current_proc();
1522
1523 if (uio) {
1524 // LP64todo - fix this!
1525 resid = uio_resid(uio);
1526 } else {
1527 resid = top->m_pkthdr.len;
1528 }
1529 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1530 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1531
1532 socket_lock(so, 1);
1533 so_update_last_owner_locked(so, p);
1534
1535 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1536 error = EOPNOTSUPP;
1537 socket_unlock(so, 1);
1538 goto out;
1539 }
1540
1541 /*
1542 * In theory resid should be unsigned.
1543 * However, space must be signed, as it might be less than 0
1544 * if we over-committed, and we must use a signed comparison
1545 * of space and resid. On the other hand, a negative resid
1546 * causes us to loop sending 0-length segments to the protocol.
1547 *
1548 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1549 * type sockets since that's an error.
1550 */
1551 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1552 error = EINVAL;
1553 socket_unlock(so, 1);
1554 goto out;
1555 }
1556
1557 dontroute =
1558 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1559 (so->so_proto->pr_flags & PR_ATOMIC);
1560 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1561 if (control)
1562 clen = control->m_len;
1563
1564 do {
1565 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1566 &sblocked);
1567 if (error) {
1568 goto release;
1569 }
1570 mp = &top;
1571 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
1572 1024 : 0);
1573
1574 do {
1575 if (uio == NULL) {
1576 /*
1577 * Data is prepackaged in "top".
1578 */
1579 resid = 0;
1580 if (flags & MSG_EOR)
1581 top->m_flags |= M_EOR;
1582 } else {
1583 int chainlength;
1584 int bytes_to_copy;
1585 boolean_t jumbocl;
1586
1587 bytes_to_copy = imin(resid, space);
1588
1589 if (sosendminchain > 0) {
1590 chainlength = 0;
1591 } else {
1592 chainlength = sosendmaxchain;
1593 }
1594
1595 /*
1596 * Attempt to use larger than system page-size
1597 * clusters for large writes only if there is
1598 * a jumbo cluster pool and if the socket is
1599 * marked accordingly.
1600 */
1601 jumbocl = sosendjcl && njcl > 0 &&
1602 ((so->so_flags & SOF_MULTIPAGES) ||
1603 sosendjcl_ignore_capab);
1604
1605 socket_unlock(so, 0);
1606
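/*
 * The loop below builds the mbuf chain for this pass using the
 * largest buffers available: 16KB jumbo clusters when permitted,
 * then 4KB big clusters, then 2KB clusters, and finally a plain
 * mbuf as the last resort.
 */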
1607 do {
1608 int num_needed;
1609 int hdrs_needed = (top == 0) ? 1 : 0;
1610
1611 /*
1612 * Try to maintain a local cache of mbuf
1613 * clusters needed to complete this
1614 * write; the list is further limited to
1615 * the number that are currently needed
1616 * to fill the socket.  This mechanism
1617 * allows a large number of mbufs/
1618 * clusters to be grabbed under a single
1619 * mbuf lock... if we can't get any
1620 * clusters, then fall back to trying
1621 * for mbufs.  If we fail early (or
1622 * miscalculate the number needed), make
1623 * sure to release any clusters we
1624 * haven't yet consumed.
1625 */
1626 if (freelist == NULL &&
1627 bytes_to_copy > MBIGCLBYTES &&
1628 jumbocl) {
1629 num_needed =
1630 bytes_to_copy / M16KCLBYTES;
1631
1632 if ((bytes_to_copy -
1633 (num_needed * M16KCLBYTES))
1634 >= MINCLSIZE)
1635 num_needed++;
1636
1637 freelist =
1638 m_getpackets_internal(
1639 (unsigned int *)&num_needed,
1640 hdrs_needed, M_WAIT, 0,
1641 M16KCLBYTES);
1642 /*
1643 * Fall back to 4K cluster size
1644 * if allocation failed
1645 */
1646 }
1647
1648 if (freelist == NULL &&
1649 bytes_to_copy > MCLBYTES) {
1650 num_needed =
1651 bytes_to_copy / MBIGCLBYTES;
1652
1653 if ((bytes_to_copy -
1654 (num_needed * MBIGCLBYTES)) >=
1655 MINCLSIZE)
1656 num_needed++;
1657
1658 freelist =
1659 m_getpackets_internal(
1660 (unsigned int *)&num_needed,
1661 hdrs_needed, M_WAIT, 0,
1662 MBIGCLBYTES);
1663 /*
1664 * Fall back to cluster size
1665 * if allocation failed
1666 */
1667 }
1668
1669 if (freelist == NULL &&
1670 bytes_to_copy > MINCLSIZE) {
1671 num_needed =
1672 bytes_to_copy / MCLBYTES;
1673
1674 if ((bytes_to_copy -
1675 (num_needed * MCLBYTES)) >=
1676 MINCLSIZE)
1677 num_needed++;
1678
1679 freelist =
1680 m_getpackets_internal(
1681 (unsigned int *)&num_needed,
1682 hdrs_needed, M_WAIT, 0,
1683 MCLBYTES);
1684 /*
1685 * Fall back to a single mbuf
1686 * if allocation failed
1687 */
1688 }
1689
1690 if (freelist == NULL) {
1691 if (top == 0)
1692 MGETHDR(freelist,
1693 M_WAIT, MT_DATA);
1694 else
1695 MGET(freelist,
1696 M_WAIT, MT_DATA);
1697
1698 if (freelist == NULL) {
1699 error = ENOBUFS;
1700 socket_lock(so, 0);
1701 goto release;
1702 }
1703 /*
1704 * For datagram protocols,
1705 * leave room for protocol
1706 * headers in first mbuf.
1707 */
1708 if (atomic && top == 0 &&
1709 bytes_to_copy < MHLEN) {
1710 MH_ALIGN(freelist,
1711 bytes_to_copy);
1712 }
1713 }
1714 m = freelist;
1715 freelist = m->m_next;
1716 m->m_next = NULL;
1717
1718 if ((m->m_flags & M_EXT))
1719 mlen = m->m_ext.ext_size;
1720 else if ((m->m_flags & M_PKTHDR))
1721 mlen =
1722 MHLEN - m_leadingspace(m);
1723 else
1724 mlen = MLEN;
1725 len = imin(mlen, bytes_to_copy);
1726
1727 chainlength += len;
1728
1729 space -= len;
1730
1731 error = uiomove(mtod(m, caddr_t),
1732 len, uio);
1733
1734 resid = uio_resid(uio);
1735
1736 m->m_len = len;
1737 *mp = m;
1738 top->m_pkthdr.len += len;
1739 if (error)
1740 break;
1741 mp = &m->m_next;
1742 if (resid <= 0) {
1743 if (flags & MSG_EOR)
1744 top->m_flags |= M_EOR;
1745 break;
1746 }
1747 bytes_to_copy = min(resid, space);
1748
1749 } while (space > 0 &&
1750 (chainlength < sosendmaxchain || atomic ||
1751 resid < MINCLSIZE));
1752
1753 socket_lock(so, 0);
1754
1755 if (error)
1756 goto release;
1757 }
1758
1759 if (flags & (MSG_HOLD|MSG_SEND)) {
1760 /* Enqueue for later, go away if HOLD */
1761 register struct mbuf *mb1;
1762 if (so->so_temp && (flags & MSG_FLUSH)) {
1763 m_freem(so->so_temp);
1764 so->so_temp = NULL;
1765 }
1766 if (so->so_temp)
1767 so->so_tail->m_next = top;
1768 else
1769 so->so_temp = top;
1770 mb1 = top;
1771 while (mb1->m_next)
1772 mb1 = mb1->m_next;
1773 so->so_tail = mb1;
1774 if (flags & MSG_HOLD) {
1775 top = NULL;
1776 goto release;
1777 }
1778 top = so->so_temp;
1779 }
1780 if (dontroute)
1781 so->so_options |= SO_DONTROUTE;
1782
1783 /* Compute flags here, for pru_send and NKEs */
1784 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1785 /*
1786 * If the user set MSG_EOF, the protocol
1787 * understands this flag, and there is nothing left to
1788 * send, then use PRU_SEND_EOF instead of PRU_SEND.
1789 */
1790 ((flags & MSG_EOF) &&
1791 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1792 (resid <= 0)) ?
1793 PRUS_EOF :
1794 /* If there is more to send set PRUS_MORETOCOME */
1795 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1796
1797 /*
1798 * Socket filter processing
1799 */
1800 error = sflt_data_out(so, addr, &top, &control,
1801 (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0);
1802 if (error) {
1803 if (error == EJUSTRETURN) {
1804 error = 0;
1805 clen = 0;
1806 control = 0;
1807 top = 0;
1808 }
1809
1810 goto release;
1811 }
1812 /*
1813 * End Socket filter processing
1814 */
1815
1816 error = (*so->so_proto->pr_usrreqs->pru_send)
1817 (so, sendflags, top, addr, control, p);
1818 #ifdef __APPLE__
1819 if (flags & MSG_SEND)
1820 so->so_temp = NULL;
1821 #endif
1822 if (dontroute)
1823 so->so_options &= ~SO_DONTROUTE;
1824
1825 clen = 0;
1826 control = 0;
1827 top = 0;
1828 mp = &top;
1829 if (error)
1830 goto release;
1831 } while (resid && space > 0);
1832 } while (resid);
1833
1834 release:
1835 if (sblocked)
1836 sbunlock(&so->so_snd, 0); /* will unlock socket */
1837 else
1838 socket_unlock(so, 1);
1839 out:
1840 if (top)
1841 m_freem(top);
1842 if (control)
1843 m_freem(control);
1844 if (freelist)
1845 m_freem_list(freelist);
1846
1847 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
1848 space, error);
1849
1850 return (error);
1851 }
1852
1853 /*
1854 * Implement receive operations on a socket.
1855 * We depend on the way that records are added to the sockbuf
1856 * by sbappend*. In particular, each record (mbufs linked through m_next)
1857 * must begin with an address if the protocol so specifies,
1858 * followed by an optional mbuf or mbufs containing ancillary data,
1859 * and then zero or more mbufs of data.
1860 * In order to avoid blocking network interrupts for the entire time here,
1861 * we splx() while doing the actual copy to user space.
1862 * Although the sockbuf is locked, new data may still be appended,
1863 * and thus we must maintain consistency of the sockbuf during that time.
1864 *
1865 * The caller may receive the data as a single mbuf chain by supplying
1866 * an mbuf **mp0 for use in returning the chain. The uio is then used
1867 * only for the count in uio_resid.
1868 *
1869 * Returns: 0 Success
1870 * ENOBUFS
1871 * ENOTCONN
1872 * EWOULDBLOCK
1873 * uiomove:EFAULT
1874 * sblock:EWOULDBLOCK
1875 * sblock:EINTR
1876 * sbwait:EBADF
1877 * sbwait:EINTR
1878 * sodelayed_copy:EFAULT
1879 * <pru_rcvoob>:EINVAL[TCP]
1880 * <pru_rcvoob>:EWOULDBLOCK[TCP]
1881 * <pru_rcvoob>:???
1882 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1883 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1884 * <pr_domain->dom_externalize>:???
1885 *
1886 * Notes: Additional return values from calls through <pru_rcvoob> and
1887 * <pr_domain->dom_externalize> depend on protocols other than
1888 * TCP or AF_UNIX, which are documented above.
1889 */
1890 int
1891 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1892 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1893 {
1894 register struct mbuf *m, **mp, *ml = NULL;
1895 register int flags, len, error, offset;
1896 struct protosw *pr = so->so_proto;
1897 struct mbuf *nextrecord;
1898 int moff, type = 0;
1899 int orig_resid = uio_resid(uio);
1900 struct mbuf *free_list;
1901 int delayed_copy_len;
1902 int can_delay;
1903 int need_event;
1904 struct proc *p = current_proc();
1905
1906 // LP64todo - fix this!
1907 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1908 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1909
1910 socket_lock(so, 1);
1911 so_update_last_owner_locked(so, p);
1912
1913 #ifdef MORE_LOCKING_DEBUG
1914 if (so->so_usecount == 1)
1915 panic("soreceive: so=%x no other reference on socket\n", so);
1916 #endif
1917 mp = mp0;
1918 if (psa)
1919 *psa = 0;
1920 if (controlp)
1921 *controlp = 0;
1922 if (flagsp)
1923 flags = *flagsp &~ MSG_EOR;
1924 else
1925 flags = 0;
1926
1927 /*
1928 * If a recv attempt is made on a previously-accepted socket
1929 * that has been marked as inactive (disconnected), reject
1930 * the request.
1931 */
1932 if (so->so_flags & SOF_DEFUNCT) {
1933 struct sockbuf *sb = &so->so_rcv;
1934
1935 error = ENOTCONN;
1936 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1937 proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error));
1938 /*
1939 * This socket should have been disconnected and flushed
1940 * prior to being returned from sodefunct(); there should
1941 * be no data on its receive list, so panic otherwise.
1942 */
1943 if (so->so_state & SS_DEFUNCT)
1944 sb_empty_assert(sb, __func__);
1945 socket_unlock(so, 1);
1946 return (error);
1947 }
1948
1949 /*
1950 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1951 * regardless of the flags argument. Here is the case where
1952 * out-of-band data is not inline.
1953 */
1954 if ((flags & MSG_OOB) ||
1955 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1956 (so->so_options & SO_OOBINLINE) == 0 &&
1957 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1958 m = m_get(M_WAIT, MT_DATA);
1959 if (m == NULL) {
1960 socket_unlock(so, 1);
1961 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1962 ENOBUFS, 0, 0, 0, 0);
1963 return (ENOBUFS);
1964 }
1965 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1966 if (error)
1967 goto bad;
1968 socket_unlock(so, 0);
1969 do {
1970 error = uiomove(mtod(m, caddr_t),
1971 imin(uio_resid(uio), m->m_len), uio);
1972 m = m_free(m);
1973 } while (uio_resid(uio) && error == 0 && m);
1974 socket_lock(so, 0);
1975 bad:
1976 if (m)
1977 m_freem(m);
1978 #ifdef __APPLE__
1979 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1980 if (error == EWOULDBLOCK || error == EINVAL) {
1981 /*
1982 * Let's try to get normal data:
1983 * EWOULDBLOCK: out-of-band data not
1984 * received yet. EINVAL: out-of-band data
1985 * already read.
1986 */
1987 error = 0;
1988 goto nooob;
1989 } else if (error == 0 && flagsp) {
1990 *flagsp |= MSG_OOB;
1991 }
1992 }
1993 socket_unlock(so, 1);
1994 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
1995 0, 0, 0, 0);
1996 #endif
1997 return (error);
1998 }
1999 nooob:
2000 if (mp)
2001 *mp = (struct mbuf *)0;
2002 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2003 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2004
2005
2006 free_list = (struct mbuf *)0;
2007 delayed_copy_len = 0;
2008 restart:
2009 #ifdef MORE_LOCKING_DEBUG
2010 if (so->so_usecount <= 1)
2011 printf("soreceive: sblock so=%p ref=%d on socket\n",
2012 so, so->so_usecount);
2013 #endif
2014 /*
2015 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2016 * and if so just return to the caller. This could happen when
2017 * soreceive() is called by a socket upcall function during the
2018 * time the socket is freed. The socket buffer would have been
2019 * locked across the upcall; therefore we cannot put this thread
2020 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2021 * we may livelock), because the lock on the socket buffer will
2022 * only be released when the upcall routine returns to its caller.
2023 * Because the socket has been officially closed, there can be
2024 * no further read on it.
2025 */
2026 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2027 (SS_NOFDREF | SS_CANTRCVMORE)) {
2028 socket_unlock(so, 1);
2029 return (0);
2030 }
2031
2032 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2033 if (error) {
2034 socket_unlock(so, 1);
2035 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2036 0, 0, 0, 0);
2037 return (error);
2038 }
2039
2040 m = so->so_rcv.sb_mb;
2041 /*
2042 * If we have less data than requested, block awaiting more
2043 * (subject to any timeout) if:
2044 * 1. MSG_DONTWAIT is not set, and either
2045 * 2. the current count is less than the low water mark, or
2046 * 3. MSG_WAITALL is set, and it is possible to do the entire
2047 * receive operation at once if we block (resid <= hiwat).
2048 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2049 * we have to do the receive in sections, and thus risk returning
2050 * a short count if a timeout or signal occurs after we start.
2051 */
2052 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2053 so->so_rcv.sb_cc < uio_resid(uio)) &&
2054 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2055 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2056 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2057 /*
2058 * Panic if we notice inconsistencies in the socket's
2059 * receive list; both sb_mb and sb_cc should correctly
2060 * reflect the contents of the list, otherwise we may
2061 * end up with false positives during select() or poll()
2062 * which could put the application in a bad state.
2063 */
2064 SB_MB_CHECK(&so->so_rcv);
2065
2066 if (so->so_error) {
2067 if (m)
2068 goto dontblock;
2069 error = so->so_error;
2070 if ((flags & MSG_PEEK) == 0)
2071 so->so_error = 0;
2072 goto release;
2073 }
2074 if (so->so_state & SS_CANTRCVMORE) {
2075 if (m)
2076 goto dontblock;
2077 else
2078 goto release;
2079 }
2080 for (; m; m = m->m_next)
2081 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2082 m = so->so_rcv.sb_mb;
2083 goto dontblock;
2084 }
2085 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2086 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2087 error = ENOTCONN;
2088 goto release;
2089 }
2090 if (uio_resid(uio) == 0)
2091 goto release;
2092 if ((so->so_state & SS_NBIO) ||
2093 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2094 error = EWOULDBLOCK;
2095 goto release;
2096 }
2097 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2098 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2099 sbunlock(&so->so_rcv, 1);
2100 #if EVEN_MORE_LOCKING_DEBUG
2101 if (socket_debug)
2102 printf("Waiting for socket data\n");
2103 #endif
2104
2105 error = sbwait(&so->so_rcv);
2106 #if EVEN_MORE_LOCKING_DEBUG
2107 if (socket_debug)
2108 printf("SORECEIVE - sbwait returned %d\n", error);
2109 #endif
2110 if (so->so_usecount < 1)
2111 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2112 "socket\n", so, so->so_usecount);
2113 if (error) {
2114 socket_unlock(so, 1);
2115 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2116 0, 0, 0, 0);
2117 return (error);
2118 }
2119 goto restart;
2120 }
2121 dontblock:
2122 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2123 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2124 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2125 nextrecord = m->m_nextpkt;
2126 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2127 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2128 #if CONFIG_MACF_SOCKET_SUBSET
2129 /*
2130 * Call the MAC framework for policy checking if we're in
2131 * the user process context and the socket isn't connected.
2132 */
2133 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2134 struct mbuf *m0 = m;
2135 /*
2136 * Dequeue this record (temporarily) from the receive
2137 * list since we're about to drop the socket's lock
2138 * where a new record may arrive and be appended to
2139 * the list. Upon MAC policy failure, the record
2140 * will be freed. Otherwise, we'll add it back to
2141 * the head of the list. We cannot rely on SB_LOCK
2142 * because the append operation uses the socket's lock.
2143 */
2144 do {
2145 m->m_nextpkt = NULL;
2146 sbfree(&so->so_rcv, m);
2147 m = m->m_next;
2148 } while (m != NULL);
2149 m = m0;
2150 so->so_rcv.sb_mb = nextrecord;
2151 SB_EMPTY_FIXUP(&so->so_rcv);
2152 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2153 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2154 socket_unlock(so, 0);
2155 if (mac_socket_check_received(proc_ucred(p), so,
2156 mtod(m, struct sockaddr *)) != 0) {
2157 /*
2158 * MAC policy failure; free this record and
2159 * process the next record (or block until
2160 * one is available). We have adjusted sb_cc
2161 * and sb_mbcnt above so there is no need to
2162 * call sbfree() again.
2163 */
2164 do {
2165 m = m_free(m);
2166 } while (m != NULL);
2167 /*
2168 * Clear SB_LOCK but don't unlock the socket.
2169 * Process the next record or wait for one.
2170 */
2171 socket_lock(so, 0);
2172 sbunlock(&so->so_rcv, 1);
2173 goto restart;
2174 }
2175 socket_lock(so, 0);
2176 /*
2177 * If the socket has been defunct'd, drop it.
2178 */
2179 if (so->so_flags & SOF_DEFUNCT) {
2180 m_freem(m);
2181 error = ENOTCONN;
2182 goto release;
2183 }
2184 /*
2185 * Re-adjust the socket receive list and re-enqueue
2186 * the record in front of any packets which may have
2187 * been appended while we dropped the lock.
2188 */
2189 for (m = m0; m->m_next != NULL; m = m->m_next)
2190 sballoc(&so->so_rcv, m);
2191 sballoc(&so->so_rcv, m);
2192 if (so->so_rcv.sb_mb == NULL) {
2193 so->so_rcv.sb_lastrecord = m0;
2194 so->so_rcv.sb_mbtail = m;
2195 }
2196 m = m0;
2197 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2198 so->so_rcv.sb_mb = m;
2199 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2200 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2201 }
2202 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2203 orig_resid = 0;
2204 if (psa) {
2205 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2206 mp0 == 0);
2207 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2208 error = EWOULDBLOCK;
2209 goto release;
2210 }
2211 }
2212 if (flags & MSG_PEEK) {
2213 m = m->m_next;
2214 } else {
2215 sbfree(&so->so_rcv, m);
2216 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2217 panic("soreceive: about to create invalid "
2218 "socketbuf");
2219 MFREE(m, so->so_rcv.sb_mb);
2220 m = so->so_rcv.sb_mb;
2221 if (m != NULL) {
2222 m->m_nextpkt = nextrecord;
2223 } else {
2224 so->so_rcv.sb_mb = nextrecord;
2225 SB_EMPTY_FIXUP(&so->so_rcv);
2226 }
2227 }
2228 }
2229
2230 /*
2231 * Process one or more MT_CONTROL mbufs present before any data mbufs
2232 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2233 * just copy the data; if !MSG_PEEK, we call into the protocol to
2234 * perform externalization.
2235 */
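/*
 * User-space sketch of the non-MSG_PEEK case for AF_UNIX descriptor
 * passing (the SCM_RIGHTS externalization performed below); `s` is a
 * hypothetical AF_UNIX socket and the iovec is omitted for brevity:
 *
 *	char cbuf[CMSG_SPACE(sizeof (int))];
 *	struct msghdr msg;
 *	int fd;
 *
 *	bzero(&msg, sizeof (msg));
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0) {
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *		if (cmsg != NULL && cmsg->cmsg_type == SCM_RIGHTS)
 *			fd = *(int *)CMSG_DATA(cmsg);
 *	}
 */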
2236 if (m != NULL && m->m_type == MT_CONTROL) {
2237 struct mbuf *cm = NULL, *cmn;
2238 struct mbuf **cme = &cm;
2239 struct sockbuf *sb_rcv = &so->so_rcv;
2240 struct mbuf **msgpcm = NULL;
2241
2242 /*
2243 * Externalizing the control messages would require us to
2244 * drop the socket's lock below. Once we re-acquire the
2245 * lock, the mbuf chain might change. In order to preserve
2246 * consistency, we unlink all control messages from the
2247 * first mbuf chain in one shot and link them separately
2248 * onto a different chain.
2249 */
2250 do {
2251 if (flags & MSG_PEEK) {
2252 if (controlp != NULL) {
2253 if (*controlp == NULL) {
2254 msgpcm = controlp;
2255 }
2256 *controlp = m_copy(m, 0, m->m_len);
2257
2258 /* If we failed to allocate an mbuf,
2259 * release any previously allocated
2260 * mbufs for control data. Return
2261 * an error. Keep the mbufs in the
2262 * socket as this is using
2263 * MSG_PEEK flag.
2264 */
2265 if (*controlp == NULL) {
2266 m_freem(*msgpcm);
2267 error = ENOBUFS;
2268 goto release;
2269 }
2270 controlp = &(*controlp)->m_next;
2271 }
2272 m = m->m_next;
2273 } else {
2274 m->m_nextpkt = NULL;
2275 sbfree(sb_rcv, m);
2276 sb_rcv->sb_mb = m->m_next;
2277 m->m_next = NULL;
2278 *cme = m;
2279 cme = &(*cme)->m_next;
2280 m = sb_rcv->sb_mb;
2281 }
2282 } while (m != NULL && m->m_type == MT_CONTROL);
2283
2284 if (!(flags & MSG_PEEK)) {
2285 if (sb_rcv->sb_mb != NULL) {
2286 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2287 } else {
2288 sb_rcv->sb_mb = nextrecord;
2289 SB_EMPTY_FIXUP(sb_rcv);
2290 }
2291 if (nextrecord == NULL)
2292 sb_rcv->sb_lastrecord = m;
2293 }
2294
2295 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2296 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2297
2298 while (cm != NULL) {
2299 int cmsg_type;
2300
2301 cmn = cm->m_next;
2302 cm->m_next = NULL;
2303 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2304
2305 /*
2306 * Call the protocol to externalize SCM_RIGHTS message
2307 * and return the modified message to the caller upon
2308 * success. Otherwise, all other control messages are
2309 * returned unmodified to the caller. Note that we
2310 * only get into this loop if MSG_PEEK is not set.
2311 */
2312 if (pr->pr_domain->dom_externalize != NULL &&
2313 cmsg_type == SCM_RIGHTS) {
2314 /*
2315 * Release socket lock: see 3903171. This
2316 * would also allow more records to be appended
2317 * to the socket buffer. We still have SB_LOCK
2318 * set on it, so we can be sure that the head
2319 * of the mbuf chain won't change.
2320 */
2321 socket_unlock(so, 0);
2322 error = (*pr->pr_domain->dom_externalize)(cm);
2323 socket_lock(so, 0);
2324 } else {
2325 error = 0;
2326 }
2327
2328 if (controlp != NULL && error == 0) {
2329 *controlp = cm;
2330 controlp = &(*controlp)->m_next;
2331 orig_resid = 0;
2332 } else {
2333 (void) m_free(cm);
2334 }
2335 cm = cmn;
2336 }
2337 /*
2338 * Update the value of nextrecord in case we received new
2339 * records when the socket was unlocked above for
2340 * externalizing SCM_RIGHTS.
2341 */
2342 if (m != NULL)
2343 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2344 else
2345 nextrecord = sb_rcv->sb_mb;
2346 orig_resid = 0;
2347 }
2348
2349 if (m != NULL) {
2350 if (!(flags & MSG_PEEK)) {
2351 /*
2352 * We get here because m points to an mbuf following
2353 * any MT_SONAME or MT_CONTROL mbufs which have been
2354 * processed above. In any case, m should be pointing
2355 * to the head of the mbuf chain, and the nextrecord
2356 * should be either NULL or equal to m->m_nextpkt.
2357 * See comments above about SB_LOCK.
2358 */
2359 if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2360 panic("soreceive: post-control !sync so=%p "
2361 "m=%p nextrecord=%p\n", so, m, nextrecord);
2362
2363 if (nextrecord == NULL)
2364 so->so_rcv.sb_lastrecord = m;
2365 }
2366 type = m->m_type;
2367 if (type == MT_OOBDATA)
2368 flags |= MSG_OOB;
2369 } else {
2370 if (!(flags & MSG_PEEK)) {
2371 SB_EMPTY_FIXUP(&so->so_rcv);
2372 }
2373 }
2374 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2375 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2376
2377 moff = 0;
2378 offset = 0;
2379
2380 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2381 can_delay = 1;
2382 else
2383 can_delay = 0;
2384
2385 need_event = 0;
2386
2387 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2388 if (m->m_type == MT_OOBDATA) {
2389 if (type != MT_OOBDATA)
2390 break;
2391 } else if (type == MT_OOBDATA) {
2392 break;
2393 }
2394 /*
2395 * Make sure to always set the MSG_OOB flag when getting
2396 * out-of-band data inline.
2397 */
2398 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2399 (so->so_options & SO_OOBINLINE) != 0 &&
2400 (so->so_state & SS_RCVATMARK) != 0) {
2401 flags |= MSG_OOB;
2402 }
2403 so->so_state &= ~SS_RCVATMARK;
2404 len = uio_resid(uio) - delayed_copy_len;
2405 if (so->so_oobmark && len > so->so_oobmark - offset)
2406 len = so->so_oobmark - offset;
2407 if (len > m->m_len - moff)
2408 len = m->m_len - moff;
2409 /*
2410 * If mp is set, just pass back the mbufs.
2411 * Otherwise copy them out via the uio, then free.
2412 * Sockbuf must be consistent here (sb_mb points to the current
2413 * mbuf, which in turn points to the next record) when we drop priority;
2414 * we must note any additions to the sockbuf when we
2415 * block interrupts again.
2416 */
2417 if (mp == 0) {
2418 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2419 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2420 if (can_delay && len == m->m_len) {
2421 /*
2422 * Only delay the copy if we're consuming the
2423 * mbuf and we're NOT in MSG_PEEK mode
2424 * and we have enough data to make it worthwhile
2425 * to drop and retake the lock... can_delay
2426 * reflects the state of the two latter
2427 * constraints; moff should always be zero
2428 * in these cases.
2429 */
2430 delayed_copy_len += len;
2431 } else {
2432 if (delayed_copy_len) {
2433 error = sodelayed_copy(so, uio,
2434 &free_list, &delayed_copy_len);
2435
2436 if (error) {
2437 goto release;
2438 }
2439 /*
2440 * We can only get here if MSG_PEEK is not
2441 * set; therefore, m should point at the
2442 * head of the rcv queue. If it doesn't,
2443 * something drastically changed while we
2444 * were out from behind the lock in
2445 * sodelayed_copy, perhaps a RST on the
2446 * stream. In any event, the stream has
2447 * been interrupted; it's probably best
2448 * just to return whatever data we've
2449 * moved and let the caller sort it
2450 * out...
2451 */
2452 if (m != so->so_rcv.sb_mb) {
2453 break;
2454 }
2455 }
2456 socket_unlock(so, 0);
2457 error = uiomove(mtod(m, caddr_t) + moff,
2458 (int)len, uio);
2459 socket_lock(so, 0);
2460
2461 if (error)
2462 goto release;
2463 }
2464 } else {
2465 uio_setresid(uio, (uio_resid(uio) - len));
2466 }
2467 if (len == m->m_len - moff) {
2468 if (m->m_flags & M_EOR)
2469 flags |= MSG_EOR;
2470 if (flags & MSG_PEEK) {
2471 m = m->m_next;
2472 moff = 0;
2473 } else {
2474 nextrecord = m->m_nextpkt;
2475 sbfree(&so->so_rcv, m);
2476 m->m_nextpkt = NULL;
2477
2478 if (mp) {
2479 *mp = m;
2480 mp = &m->m_next;
2481 so->so_rcv.sb_mb = m = m->m_next;
2482 *mp = (struct mbuf *)0;
2483 } else {
2484 if (free_list == NULL)
2485 free_list = m;
2486 else
2487 ml->m_next = m;
2488 ml = m;
2489 so->so_rcv.sb_mb = m = m->m_next;
2490 ml->m_next = 0;
2491 }
2492 if (m != NULL) {
2493 m->m_nextpkt = nextrecord;
2494 if (nextrecord == NULL)
2495 so->so_rcv.sb_lastrecord = m;
2496 } else {
2497 so->so_rcv.sb_mb = nextrecord;
2498 SB_EMPTY_FIXUP(&so->so_rcv);
2499 }
2500 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2501 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2502 }
2503 } else {
2504 if (flags & MSG_PEEK) {
2505 moff += len;
2506 } else {
2507 if (mp != NULL) {
2508 int copy_flag;
2509
2510 if (flags & MSG_DONTWAIT)
2511 copy_flag = M_DONTWAIT;
2512 else
2513 copy_flag = M_WAIT;
2514 *mp = m_copym(m, 0, len, copy_flag);
2515 if (*mp == NULL) {
2516 /*
2517 * Failed to allocate an mbuf.
2518 * Adjust uio_resid back, it was
2519 * adjusted down by len bytes which
2520 * we didn't copy over
2521 */
2522 uio_setresid(uio, (uio_resid(uio) + len));
2523 break;
2524 }
2525 }
2526 m->m_data += len;
2527 m->m_len -= len;
2528 so->so_rcv.sb_cc -= len;
2529 }
2530 }
2531 if (so->so_oobmark) {
2532 if ((flags & MSG_PEEK) == 0) {
2533 so->so_oobmark -= len;
2534 if (so->so_oobmark == 0) {
2535 so->so_state |= SS_RCVATMARK;
2536 /*
2537 * delay posting the actual event until
2538 * after any delayed copy processing
2539 * has finished
2540 */
2541 need_event = 1;
2542 break;
2543 }
2544 } else {
2545 offset += len;
2546 if (offset == so->so_oobmark)
2547 break;
2548 }
2549 }
2550 if (flags & MSG_EOR)
2551 break;
2552 /*
2553 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2554 * (for a non-atomic socket), we must not quit until
2555 * "uio->uio_resid == 0" or an error termination.
2556 * If a signal/timeout occurs, return with a short
2557 * count but without error. Keep sockbuf locked
2558 * against other readers.
2559 */
2560 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2561 (uio_resid(uio) - delayed_copy_len) > 0 &&
2562 !sosendallatonce(so) && !nextrecord) {
2563 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2564 goto release;
2565
2566 /*
2567 * Depending on the protocol (e.g. TCP), the following
2568 * might cause the socket lock to be dropped and later
2569 * be reacquired, and more data could have arrived and
2570 * have been appended to the receive socket buffer by
2571 * the time it returns. Therefore, we sleep in
2572 * sbwait() below if and only if the socket buffer is
2573 * empty, in order to avoid a false sleep.
2574 */
2575 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2576 (((struct inpcb *)so->so_pcb)->inp_state !=
2577 INPCB_STATE_DEAD))
2578 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2579
2580 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2581 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2582
2583 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2584 error = 0;
2585 goto release;
2586 }
2587 /*
2588 * We have to wait until after we get back from the sbwait
2589 * to do the copy, because we will drop the lock if we
2590 * have enough data that has been delayed. By dropping
2591 * the lock we open up a window allowing the netisr
2592 * thread to process the incoming packets and to change
2593 * the state of this socket. We're issuing the sbwait
2594 * because the socket is empty and we're expecting the
2595 * netisr thread to wake us up when more packets arrive;
2596 * if we allow that processing to happen and then sbwait,
2597 * we could stall forever with packets sitting in the
2598 * socket if no further packets arrive from the remote
2599 * side.
2600 *
2601 * We want to copy before we've collected all the data
2602 * to satisfy this request, to allow the copy to overlap
2603 * the incoming packet processing on an MP system.
2604 */
2605 if (delayed_copy_len > sorecvmincopy &&
2606 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2607 error = sodelayed_copy(so, uio,
2608 &free_list, &delayed_copy_len);
2609
2610 if (error)
2611 goto release;
2612 }
2613 m = so->so_rcv.sb_mb;
2614 if (m) {
2615 nextrecord = m->m_nextpkt;
2616 }
2617 SB_MB_CHECK(&so->so_rcv);
2618 }
2619 }
2620 #ifdef MORE_LOCKING_DEBUG
2621 if (so->so_usecount <= 1)
2622 panic("soreceive: after big while so=%p ref=%d on socket\n",
2623 so, so->so_usecount);
2624 #endif
2625
2626 if (m && pr->pr_flags & PR_ATOMIC) {
2627 #ifdef __APPLE__
2628 if (so->so_options & SO_DONTTRUNC) {
2629 flags |= MSG_RCVMORE;
2630 } else {
2631 #endif
2632 flags |= MSG_TRUNC;
2633 if ((flags & MSG_PEEK) == 0)
2634 (void) sbdroprecord(&so->so_rcv);
2635 #ifdef __APPLE__
2636 }
2637 #endif
2638 }
2639
2640 /*
2641 * pru_rcvd below (for TCP) may cause more data to be received
2642 * if the socket lock is dropped prior to sending the ACK; some
2643 * legacy OpenTransport applications don't handle this well
2644 * (if they receive less data than requested while MSG_HAVEMORE
2645 * is set), and so we set the flag now based on what we know
2646 * prior to calling pru_rcvd.
2647 */
2648 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2649 flags |= MSG_HAVEMORE;
2650
2651 if ((flags & MSG_PEEK) == 0) {
2652 if (m == 0) {
2653 so->so_rcv.sb_mb = nextrecord;
2654 /*
2655 * First part is an inline SB_EMPTY_FIXUP(). Second
2656 * part makes sure sb_lastrecord is up-to-date if
2657 * there is still data in the socket buffer.
2658 */
2659 if (so->so_rcv.sb_mb == NULL) {
2660 so->so_rcv.sb_mbtail = NULL;
2661 so->so_rcv.sb_lastrecord = NULL;
2662 } else if (nextrecord->m_nextpkt == NULL) {
2663 so->so_rcv.sb_lastrecord = nextrecord;
2664 }
2665 SB_MB_CHECK(&so->so_rcv);
2666 }
2667 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2668 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2669 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2670 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2671 }
2672 #ifdef __APPLE__
2673 if (delayed_copy_len) {
2674 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2675
2676 if (error)
2677 goto release;
2678 }
2679 if (free_list) {
2680 m_freem_list((struct mbuf *)free_list);
2681 free_list = (struct mbuf *)0;
2682 }
2683 if (need_event)
2684 postevent(so, 0, EV_OOB);
2685 #endif
2686 if (orig_resid == uio_resid(uio) && orig_resid &&
2687 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2688 sbunlock(&so->so_rcv, 1);
2689 goto restart;
2690 }
2691
2692 if (flagsp)
2693 *flagsp |= flags;
2694 release:
2695 #ifdef MORE_LOCKING_DEBUG
2696 if (so->so_usecount <= 1)
2697 panic("soreceive: release so=%p ref=%d on socket\n",
2698 so, so->so_usecount);
2699 #endif
2700 if (delayed_copy_len) {
2701 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2702 }
2703 if (free_list) {
2704 m_freem_list((struct mbuf *)free_list);
2705 }
2706 sbunlock(&so->so_rcv, 0); /* will unlock socket */
2707
2708 // LP64todo - fix this!
2709 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2710 so->so_rcv.sb_cc, 0, error);
2711
2712 return (error);
2713 }
2714
2715 /*
2716 * Returns: 0 Success
2717 * uiomove:EFAULT
2718 */
2719 static int
2720 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2721 int *resid)
2722 {
2723 int error = 0;
2724 struct mbuf *m;
2725
2726 m = *free_list;
2727
2728 socket_unlock(so, 0);
2729
2730 while (m && error == 0) {
2731
2732 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2733
2734 m = m->m_next;
2735 }
2736 m_freem_list(*free_list);
2737
2738 *free_list = (struct mbuf *)NULL;
2739 *resid = 0;
2740
2741 socket_lock(so, 0);
2742
2743 return (error);
2744 }
2745
2746
2747 /*
2748 * Returns: 0 Success
2749 * EINVAL
2750 * ENOTCONN
2751 * <pru_shutdown>:EINVAL
2752 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2753 * <pru_shutdown>:ENOBUFS[TCP]
2754 * <pru_shutdown>:EMSGSIZE[TCP]
2755 * <pru_shutdown>:EHOSTUNREACH[TCP]
2756 * <pru_shutdown>:ENETUNREACH[TCP]
2757 * <pru_shutdown>:ENETDOWN[TCP]
2758 * <pru_shutdown>:ENOMEM[TCP]
2759 * <pru_shutdown>:EACCES[TCP]
2760 * <pru_shutdown>:EMSGSIZE[TCP]
2761 * <pru_shutdown>:ENOBUFS[TCP]
2762 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2763 * <pru_shutdown>:??? [other protocol families]
2764 */
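/*
 * Minimal user-space sketch (`s` is a hypothetical connected socket):
 *
 *	if (shutdown(s, SHUT_WR) == -1 && errno == ENOTCONN)
 *		perror("shutdown");
 *
 * SHUT_RD flushes pending receive data via sorflush(), SHUT_WR invokes
 * the protocol's write-side close through <pru_shutdown> (e.g. a TCP
 * FIN), SHUT_RDWR does both, and any other value yields EINVAL.
 */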
2765 int
2766 soshutdown(struct socket *so, int how)
2767 {
2768 int error;
2769
2770 switch (how) {
2771 case SHUT_RD:
2772 case SHUT_WR:
2773 case SHUT_RDWR:
2774 socket_lock(so, 1);
2775 if ((so->so_state &
2776 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2777 error = ENOTCONN;
2778 } else {
2779 error = soshutdownlock(so, how);
2780 }
2781 socket_unlock(so, 1);
2782 break;
2783 default:
2784 error = EINVAL;
2785 break;
2786 }
2787
2788 return (error);
2789 }
2790
2791 int
2792 soshutdownlock(struct socket *so, int how)
2793 {
2794 struct protosw *pr = so->so_proto;
2795 int error = 0;
2796
2797 sflt_notify(so, sock_evt_shutdown, &how);
2798
2799 if (how != SHUT_WR) {
2800 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2801 /* read already shut down */
2802 error = ENOTCONN;
2803 goto done;
2804 }
2805 sorflush(so);
2806 postevent(so, 0, EV_RCLOSED);
2807 }
2808 if (how != SHUT_RD) {
2809 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2810 /* write already shut down */
2811 error = ENOTCONN;
2812 goto done;
2813 }
2814 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2815 postevent(so, 0, EV_WCLOSED);
2816 }
2817 done:
2818 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2819 return (error);
2820 }
2821
2822 void
2823 sorflush(struct socket *so)
2824 {
2825 register struct sockbuf *sb = &so->so_rcv;
2826 register struct protosw *pr = so->so_proto;
2827 struct sockbuf asb;
2828
2829 #ifdef MORE_LOCKING_DEBUG
2830 lck_mtx_t *mutex_held;
2831
2832 if (so->so_proto->pr_getlock != NULL)
2833 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2834 else
2835 mutex_held = so->so_proto->pr_domain->dom_mtx;
2836 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2837 #endif
2838
2839 sflt_notify(so, sock_evt_flush_read, NULL);
2840
2841 sb->sb_flags |= SB_NOINTR;
2842 (void) sblock(sb, M_WAIT);
2843 socantrcvmore(so);
2844 sbunlock(sb, 1);
2845 #ifdef __APPLE__
2846 selthreadclear(&sb->sb_sel);
2847 #endif
2848 asb = *sb;
2849 bzero((caddr_t)sb, sizeof (*sb));
2850 sb->sb_so = so; /* reestablish link to socket */
2851 if (asb.sb_flags & SB_KNOTE) {
2852 sb->sb_sel.si_note = asb.sb_sel.si_note;
2853 sb->sb_flags = SB_KNOTE;
2854 }
2855 if (asb.sb_flags & SB_DROP)
2856 sb->sb_flags |= SB_DROP;
2857 if (asb.sb_flags & SB_UNIX)
2858 sb->sb_flags |= SB_UNIX;
2859 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2860 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2861 }
2862 sbrelease(&asb);
2863 }
2864
2865 /*
2866 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2867 * an additional variant to handle the case where the option value needs
2868 * to be some kind of integer, but not a specific size.
2869 * In addition to their use here, these functions are also called by the
2870 * protocol-level pr_ctloutput() routines.
2871 *
2872 * Returns: 0 Success
2873 * EINVAL
2874 * copyin:EFAULT
2875 */
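/*
 * Usage sketch, mirroring the integer-valued SOL_SOCKET cases in
 * sosetopt() below; a protocol-level pr_ctloutput() handler for a
 * hypothetical int-sized option would typically do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval),
 *	    sizeof (optval));
 *	if (error)
 *		return (error);
 *	... act on optval ...
 */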
2876 int
2877 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2878 {
2879 size_t valsize;
2880
2881 /*
2882 * If the user gives us more than we wanted, we ignore it,
2883 * but if we don't get the minimum length the caller
2884 * wants, we return EINVAL. On success, sopt->sopt_valsize
2885 * is set to however much we actually retrieved.
2886 */
2887 if ((valsize = sopt->sopt_valsize) < minlen)
2888 return (EINVAL);
2889 if (valsize > len)
2890 sopt->sopt_valsize = valsize = len;
2891
2892 if (sopt->sopt_p != kernproc)
2893 return (copyin(sopt->sopt_val, buf, valsize));
2894
2895 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2896 return (0);
2897 }
2898
2899 /*
2900 * sooptcopyin_timeval
2901 * Copy in a timeval value into tv_p, taking into account whether the
2902 * calling process is 64-bit or 32-bit. Moved the sanity checking
2903 * code here so that we can verify the 64-bit tv_sec value before we lose
2904 * the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
2905 */
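/*
 * From user space this path is reached via setsockopt(2) with a
 * struct timeval, e.g. for the SO_RCVTIMEO case in sosetopt() below
 * (`s` is a hypothetical socket):
 *
 *	struct timeval tv = { 5, 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 *
 * A negative tv_sec or tv_usec, or a tv_usec of 1000000 or more, is
 * rejected with EDOM; a buffer shorter than the caller's native
 * timeval yields EINVAL.
 */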
2906 static int
2907 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2908 {
2909 int error;
2910
2911 if (proc_is64bit(sopt->sopt_p)) {
2912 struct user64_timeval tv64;
2913
2914 if (sopt->sopt_valsize < sizeof(tv64)) {
2915 return (EINVAL);
2916 }
2917 sopt->sopt_valsize = sizeof(tv64);
2918 if (sopt->sopt_p != kernproc) {
2919 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2920 if (error != 0)
2921 return (error);
2922 } else {
2923 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
2924 sizeof(tv64));
2925 }
2926 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2927 || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2928 return (EDOM);
2929 }
2930 tv_p->tv_sec = tv64.tv_sec;
2931 tv_p->tv_usec = tv64.tv_usec;
2932 } else {
2933 struct user32_timeval tv32;
2934
2935 if (sopt->sopt_valsize < sizeof(tv32)) {
2936 return (EINVAL);
2937 }
2938 sopt->sopt_valsize = sizeof(tv32);
2939 if (sopt->sopt_p != kernproc) {
2940 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2941 if (error != 0) {
2942 return (error);
2943 }
2944 } else {
2945 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
2946 sizeof(tv32));
2947 }
2948 #ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
2949 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
2950 || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
2951 return (EDOM);
2952 }
2953 #endif
2954 tv_p->tv_sec = tv32.tv_sec;
2955 tv_p->tv_usec = tv32.tv_usec;
2956 }
2957 return (0);
2958 }
2959
2960 /*
2961 * Returns: 0 Success
2962 * EINVAL
2963 * ENOPROTOOPT
2964 * ENOBUFS
2965 * EDOM
2966 * sooptcopyin:EINVAL
2967 * sooptcopyin:EFAULT
2968 * sooptcopyin_timeval:EINVAL
2969 * sooptcopyin_timeval:EFAULT
2970 * sooptcopyin_timeval:EDOM
2971 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2972 * <pr_ctloutput>:???
2973 * sflt_attach_private:??? [whatever a filter author chooses]
2974 * <sf_setoption>:??? [whatever a filter author chooses]
2975 *
2976 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
2977 * <sf_setoption> returns depend on what the filter author causes
2978 * their filter to return.
2979 */
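/*
 * Minimal user-space sketch of the path into this routine (`s` is a
 * hypothetical socket; the size is illustrative only):
 *
 *	int sz = 128 * 1024;
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof (sz)) == -1)
 *		perror("setsockopt");
 *
 * For SO_RCVBUF/SO_SNDBUF the error may be EINVAL (value < 1) or
 * ENOBUFS (sbreserve() failure); options at levels other than
 * SOL_SOCKET are handed to the protocol's pr_ctloutput() unchanged.
 */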
2980 int
2981 sosetopt(struct socket *so, struct sockopt *sopt)
2982 {
2983 int error, optval;
2984 struct linger l;
2985 struct timeval tv;
2986 #if CONFIG_MACF_SOCKET
2987 struct mac extmac;
2988 #endif /* MAC_SOCKET */
2989
2990 socket_lock(so, 1);
2991
2992 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2993 == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
2994 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2995 /* the socket has been shutdown, no more sockopt's */
2996 error = EINVAL;
2997 goto bad;
2998 }
2999
3000 if (sopt->sopt_dir != SOPT_SET) {
3001 sopt->sopt_dir = SOPT_SET;
3002 }
3003
3004 error = sflt_setsockopt(so, sopt);
3005 if (error) {
3006 if (error == EJUSTRETURN)
3007 error = 0;
3008 goto bad;
3009 }
3010
3011 error = 0;
3012 if (sopt->sopt_level != SOL_SOCKET) {
3013 if (so->so_proto && so->so_proto->pr_ctloutput) {
3014 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3015 socket_unlock(so, 1);
3016 return (error);
3017 }
3018 error = ENOPROTOOPT;
3019 } else {
3020 switch (sopt->sopt_name) {
3021 case SO_LINGER:
3022 case SO_LINGER_SEC:
3023 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3024 if (error)
3025 goto bad;
3026
3027 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3028 l.l_linger : l.l_linger * hz;
3029 if (l.l_onoff)
3030 so->so_options |= SO_LINGER;
3031 else
3032 so->so_options &= ~SO_LINGER;
3033 break;
3034
3035 case SO_DEBUG:
3036 case SO_KEEPALIVE:
3037 case SO_DONTROUTE:
3038 case SO_USELOOPBACK:
3039 case SO_BROADCAST:
3040 case SO_REUSEADDR:
3041 case SO_REUSEPORT:
3042 case SO_OOBINLINE:
3043 case SO_TIMESTAMP:
3044 case SO_TIMESTAMP_MONOTONIC:
3045 #ifdef __APPLE__
3046 case SO_DONTTRUNC:
3047 case SO_WANTMORE:
3048 case SO_WANTOOBFLAG:
3049 #endif
3050 error = sooptcopyin(sopt, &optval, sizeof (optval),
3051 sizeof (optval));
3052 if (error)
3053 goto bad;
3054 if (optval)
3055 so->so_options |= sopt->sopt_name;
3056 else
3057 so->so_options &= ~sopt->sopt_name;
3058 break;
3059
3060 case SO_SNDBUF:
3061 case SO_RCVBUF:
3062 case SO_SNDLOWAT:
3063 case SO_RCVLOWAT:
3064 error = sooptcopyin(sopt, &optval, sizeof (optval),
3065 sizeof (optval));
3066 if (error)
3067 goto bad;
3068
3069 /*
3070 * Values < 1 make no sense for any of these
3071 * options, so disallow them.
3072 */
3073 if (optval < 1) {
3074 error = EINVAL;
3075 goto bad;
3076 }
3077
3078 switch (sopt->sopt_name) {
3079 case SO_SNDBUF:
3080 case SO_RCVBUF:
3081 {
3082 struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ?
3083 &so->so_snd : &so->so_rcv;
3084 if (sbreserve(sb, (u_int32_t) optval) == 0) {
3085 error = ENOBUFS;
3086 goto bad;
3087 }
3088 sb->sb_flags |= SB_USRSIZE;
3089 sb->sb_flags &= ~SB_AUTOSIZE;
3090 sb->sb_idealsize = (u_int32_t)optval;
3091 break;
3092 }
3093
3094 /*
3095 * Make sure the low-water is never greater than
3096 * the high-water.
3097 */
3098 case SO_SNDLOWAT:
3099 so->so_snd.sb_lowat =
3100 (optval > so->so_snd.sb_hiwat) ?
3101 so->so_snd.sb_hiwat : optval;
3102 break;
3103 case SO_RCVLOWAT:
3104 so->so_rcv.sb_lowat =
3105 (optval > so->so_rcv.sb_hiwat) ?
3106 so->so_rcv.sb_hiwat : optval;
3107 break;
3108 }
3109 break;
3110
3111 case SO_SNDTIMEO:
3112 case SO_RCVTIMEO:
3113 error = sooptcopyin_timeval(sopt, &tv);
3114 if (error)
3115 goto bad;
3116
3117 switch (sopt->sopt_name) {
3118 case SO_SNDTIMEO:
3119 so->so_snd.sb_timeo = tv;
3120 break;
3121 case SO_RCVTIMEO:
3122 so->so_rcv.sb_timeo = tv;
3123 break;
3124 }
3125 break;
3126
3127 case SO_NKE:
3128 {
3129 struct so_nke nke;
3130
3131 error = sooptcopyin(sopt, &nke, sizeof (nke),
3132 sizeof (nke));
3133 if (error)
3134 goto bad;
3135
3136 error = sflt_attach_internal(so, nke.nke_handle);
3137 break;
3138 }
3139
3140 case SO_NOSIGPIPE:
3141 error = sooptcopyin(sopt, &optval, sizeof (optval),
3142 sizeof (optval));
3143 if (error)
3144 goto bad;
3145 if (optval)
3146 so->so_flags |= SOF_NOSIGPIPE;
3147 else
3148 so->so_flags &= ~SOF_NOSIGPIPE;
3149
3150 break;
3151
3152 case SO_NOADDRERR:
3153 error = sooptcopyin(sopt, &optval, sizeof (optval),
3154 sizeof (optval));
3155 if (error)
3156 goto bad;
3157 if (optval)
3158 so->so_flags |= SOF_NOADDRAVAIL;
3159 else
3160 so->so_flags &= ~SOF_NOADDRAVAIL;
3161
3162 break;
3163
3164 case SO_REUSESHAREUID:
3165 error = sooptcopyin(sopt, &optval, sizeof (optval),
3166 sizeof (optval));
3167 if (error)
3168 goto bad;
3169 if (optval)
3170 so->so_flags |= SOF_REUSESHAREUID;
3171 else
3172 so->so_flags &= ~SOF_REUSESHAREUID;
3173 break;
3174 #ifdef __APPLE_API_PRIVATE
3175 case SO_NOTIFYCONFLICT:
3176 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3177 error = EPERM;
3178 goto bad;
3179 }
3180 error = sooptcopyin(sopt, &optval, sizeof (optval),
3181 sizeof (optval));
3182 if (error)
3183 goto bad;
3184 if (optval)
3185 so->so_flags |= SOF_NOTIFYCONFLICT;
3186 else
3187 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3188 break;
3189 #endif
3190 case SO_RESTRICTIONS:
3191 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3192 error = EPERM;
3193 goto bad;
3194 }
3195 error = sooptcopyin(sopt, &optval, sizeof (optval),
3196 sizeof (optval));
3197 if (error)
3198 goto bad;
3199 so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3200 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3201 break;
3202
3203 case SO_LABEL:
3204 #if CONFIG_MACF_SOCKET
3205 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3206 sizeof (extmac))) != 0)
3207 goto bad;
3208
3209 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3210 so, &extmac);
3211 #else
3212 error = EOPNOTSUPP;
3213 #endif /* MAC_SOCKET */
3214 break;
3215
3216 #ifdef __APPLE_API_PRIVATE
3217 case SO_UPCALLCLOSEWAIT:
3218 error = sooptcopyin(sopt, &optval, sizeof (optval),
3219 sizeof (optval));
3220 if (error)
3221 goto bad;
3222 if (optval)
3223 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3224 else
3225 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3226 break;
3227 #endif
3228
3229 case SO_RANDOMPORT:
3230 error = sooptcopyin(sopt, &optval, sizeof (optval),
3231 sizeof (optval));
3232 if (error)
3233 goto bad;
3234 if (optval)
3235 so->so_flags |= SOF_BINDRANDOMPORT;
3236 else
3237 so->so_flags &= ~SOF_BINDRANDOMPORT;
3238 break;
3239
3240 case SO_NP_EXTENSIONS: {
3241 struct so_np_extensions sonpx;
3242
3243 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
3244 if (error)
3245 goto bad;
3246 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3247 error = EINVAL;
3248 goto bad;
3249 }
3250 /*
3251 * Only one bit defined for now
3252 */
3253 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3254 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3255 so->so_flags |= SOF_NPX_SETOPTSHUT;
3256 else
3257 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3258 }
3259 break;
3260 }
3261
3262 case SO_TRAFFIC_CLASS: {
3263 error = sooptcopyin(sopt, &optval, sizeof (optval),
3264 sizeof (optval));
3265 if (error)
3266 goto bad;
3267 error = so_set_traffic_class(so, optval);
3268 if (error)
3269 goto bad;
3270 break;
3271 }
3272
3273 case SO_RECV_TRAFFIC_CLASS: {
3274 error = sooptcopyin(sopt, &optval, sizeof (optval),
3275 sizeof (optval));
3276 if (error)
3277 goto bad;
3278 if (optval == 0)
3279 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3280 else
3281 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3282 break;
3283 }
3284
3285 case SO_TRAFFIC_CLASS_DBG: {
3286 struct so_tcdbg so_tcdbg;
3287
3288 error = sooptcopyin(sopt, &so_tcdbg,
3289 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
3290 if (error)
3291 goto bad;
3292 error = so_set_tcdbg(so, &so_tcdbg);
3293 if (error)
3294 goto bad;
3295 break;
3296 }
3297
3298 case SO_PRIVILEGED_TRAFFIC_CLASS:
3299 error = priv_check_cred(kauth_cred_get(),
3300 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
3301 if (error)
3302 goto bad;
3303 error = sooptcopyin(sopt, &optval, sizeof (optval),
3304 sizeof (optval));
3305 if (error)
3306 goto bad;
3307 if (optval == 0)
3308 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3309 else
3310 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3311 break;
3312
3313 case SO_DEFUNCTOK:
3314 error = sooptcopyin(sopt, &optval, sizeof (optval),
3315 sizeof (optval));
3316 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3317 if (error == 0)
3318 error = EBADF;
3319 goto bad;
3320 }
3321 /*
3322 * Any process can set SO_DEFUNCTOK (clear
3323 * SOF_NODEFUNCT), but only root can clear
3324 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3325 */
3326 if (optval == 0 &&
3327 kauth_cred_issuser(kauth_cred_get()) == 0) {
3328 error = EPERM;
3329 goto bad;
3330 }
3331 if (optval)
3332 so->so_flags &= ~SOF_NODEFUNCT;
3333 else
3334 so->so_flags |= SOF_NODEFUNCT;
3335
3336 SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as "
3337 "%seligible for defunct\n", __func__,
3338 proc_selfpid(), so, INP_SOCKAF(so),
3339 INP_SOCKTYPE(so),
3340 (so->so_flags & SOF_NODEFUNCT) ? "not " : ""));
3341 break;
3342
3343 case SO_ISDEFUNCT:
3344 /* This option is not settable */
3345 error = EINVAL;
3346 break;
3347
3348 case SO_OPPORTUNISTIC:
3349 error = sooptcopyin(sopt, &optval, sizeof (optval),
3350 sizeof (optval));
3351 if (error == 0)
3352 error = so_set_opportunistic(so, optval);
3353 break;
3354
3355 case SO_FLUSH:
3356 /* This option is handled by lower layer(s) */
3357 error = 0;
3358 break;
3359
3360 case SO_RECV_ANYIF:
3361 error = sooptcopyin(sopt, &optval, sizeof (optval),
3362 sizeof (optval));
3363 if (error == 0)
3364 error = so_set_recv_anyif(so, optval);
3365 break;
3366
3367 default:
3368 error = ENOPROTOOPT;
3369 break;
3370 }
3371 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3372 (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3373 }
3374 }
3375 bad:
3376 socket_unlock(so, 1);
3377 return (error);
3378 }
3379
3380 /* Helper routines for getsockopt */
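/*
 * Usage sketch: a get-side handler copies a local value out with
 * sooptcopyout(), which truncates to the user's buffer; the integer
 * cases in sogetopt() below reduce to the equivalent of:
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 */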
3381 int
3382 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3383 {
3384 int error;
3385 size_t valsize;
3386
3387 error = 0;
3388
3389 /*
3390 * Documented get behavior is that we always return a value,
3391 * possibly truncated to fit in the user's buffer.
3392 * Traditional behavior is that we always tell the user
3393 * precisely how much we copied, rather than something useful
3394 * like the total amount we had available for her.
3395 * Note that this interface is not idempotent; the entire answer must be
3396 * generated ahead of time.
3397 */
3398 valsize = min(len, sopt->sopt_valsize);
3399 sopt->sopt_valsize = valsize;
3400 if (sopt->sopt_val != USER_ADDR_NULL) {
3401 if (sopt->sopt_p != kernproc)
3402 error = copyout(buf, sopt->sopt_val, valsize);
3403 else
3404 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3405 }
3406 return (error);
3407 }
3408
3409 static int
3410 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3411 {
3412 int error;
3413 size_t len;
3414 struct user64_timeval tv64;
3415 struct user32_timeval tv32;
3416 const void * val;
3417 size_t valsize;
3418
3419 error = 0;
3420 if (proc_is64bit(sopt->sopt_p)) {
3421 len = sizeof(tv64);
3422 tv64.tv_sec = tv_p->tv_sec;
3423 tv64.tv_usec = tv_p->tv_usec;
3424 val = &tv64;
3425 } else {
3426 len = sizeof(tv32);
3427 tv32.tv_sec = tv_p->tv_sec;
3428 tv32.tv_usec = tv_p->tv_usec;
3429 val = &tv32;
3430 }
3431 valsize = min(len, sopt->sopt_valsize);
3432 sopt->sopt_valsize = valsize;
3433 if (sopt->sopt_val != USER_ADDR_NULL) {
3434 if (sopt->sopt_p != kernproc)
3435 error = copyout(val, sopt->sopt_val, valsize);
3436 else
3437 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3438 }
3439 return (error);
3440 }
3441
3442 /*
3443 * Returns: 0 Success
3444 * ENOPROTOOPT
3445 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3446 * <pr_ctloutput>:???
3447 * <sf_getoption>:???
3448 */
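/*
 * Minimal user-space sketch (`s` is a hypothetical socket): SO_NREAD
 * below reports the bytes currently available to read, and SO_ERROR
 * returns and clears any pending error.
 *
 *	int nread = 0;
 *	socklen_t len = sizeof (nread);
 *	getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len);
 */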
3449 int
3450 sogetopt(struct socket *so, struct sockopt *sopt)
3451 {
3452 int error, optval;
3453 struct linger l;
3454 struct timeval tv;
3455 #if CONFIG_MACF_SOCKET
3456 struct mac extmac;
3457 #endif /* MAC_SOCKET */
3458
3459 if (sopt->sopt_dir != SOPT_GET) {
3460 sopt->sopt_dir = SOPT_GET;
3461 }
3462
3463 socket_lock(so, 1);
3464
3465 error = sflt_getsockopt(so, sopt);
3466 if (error) {
3467 if (error == EJUSTRETURN)
3468 error = 0;
3469 socket_unlock(so, 1);
3470 return (error);
3471 }
3472
3473 error = 0;
3474 if (sopt->sopt_level != SOL_SOCKET) {
3475 if (so->so_proto && so->so_proto->pr_ctloutput) {
3476 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3477 socket_unlock(so, 1);
3478 return (error);
3479 } else {
3480 socket_unlock(so, 1);
3481 return (ENOPROTOOPT);
3482 }
3483 } else {
3484 switch (sopt->sopt_name) {
3485 case SO_LINGER:
3486 case SO_LINGER_SEC:
3487 l.l_onoff = so->so_options & SO_LINGER;
3488 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3489 so->so_linger : so->so_linger / hz;
3490 error = sooptcopyout(sopt, &l, sizeof (l));
3491 break;
3492
3493 case SO_USELOOPBACK:
3494 case SO_DONTROUTE:
3495 case SO_DEBUG:
3496 case SO_KEEPALIVE:
3497 case SO_REUSEADDR:
3498 case SO_REUSEPORT:
3499 case SO_BROADCAST:
3500 case SO_OOBINLINE:
3501 case SO_TIMESTAMP:
3502 case SO_TIMESTAMP_MONOTONIC:
3503 #ifdef __APPLE__
3504 case SO_DONTTRUNC:
3505 case SO_WANTMORE:
3506 case SO_WANTOOBFLAG:
3507 #endif
3508 optval = so->so_options & sopt->sopt_name;
3509 integer:
3510 error = sooptcopyout(sopt, &optval, sizeof (optval));
3511 break;
3512
3513 case SO_TYPE:
3514 optval = so->so_type;
3515 goto integer;
3516
3517 #ifdef __APPLE__
3518 case SO_NREAD:
3519 if (so->so_proto->pr_flags & PR_ATOMIC) {
3520 int pkt_total;
3521 struct mbuf *m1;
3522
3523 pkt_total = 0;
3524 m1 = so->so_rcv.sb_mb;
3525 while (m1) {
3526 if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3527 m1->m_type == MT_OOBDATA)
3528 pkt_total += m1->m_len;
3529 m1 = m1->m_next;
3530 }
3531 optval = pkt_total;
3532 } else {
3533 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3534 }
3535 goto integer;
3536
3537 case SO_NWRITE:
3538 optval = so->so_snd.sb_cc;
3539 goto integer;
3540 #endif
3541 case SO_ERROR:
3542 optval = so->so_error;
3543 so->so_error = 0;
3544 goto integer;
3545
3546 case SO_SNDBUF:
3547 optval = so->so_snd.sb_hiwat;
3548 goto integer;
3549
3550 case SO_RCVBUF:
3551 optval = so->so_rcv.sb_hiwat;
3552 goto integer;
3553
3554 case SO_SNDLOWAT:
3555 optval = so->so_snd.sb_lowat;
3556 goto integer;
3557
3558 case SO_RCVLOWAT:
3559 optval = so->so_rcv.sb_lowat;
3560 goto integer;
3561
3562 case SO_SNDTIMEO:
3563 case SO_RCVTIMEO:
3564 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3565 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3566
3567 error = sooptcopyout_timeval(sopt, &tv);
3568 break;
3569
3570 case SO_NOSIGPIPE:
3571 optval = (so->so_flags & SOF_NOSIGPIPE);
3572 goto integer;
3573
3574 case SO_NOADDRERR:
3575 optval = (so->so_flags & SOF_NOADDRAVAIL);
3576 goto integer;
3577
3578 case SO_REUSESHAREUID:
3579 optval = (so->so_flags & SOF_REUSESHAREUID);
3580 goto integer;
3581
3582 #ifdef __APPLE_API_PRIVATE
3583 case SO_NOTIFYCONFLICT:
3584 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3585 goto integer;
3586 #endif
3587 case SO_RESTRICTIONS:
3588 optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3589 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3590 goto integer;
3591
3592 case SO_LABEL:
3593 #if CONFIG_MACF_SOCKET
3594 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3595 sizeof (extmac))) != 0 ||
3596 (error = mac_socket_label_get(proc_ucred(
3597 sopt->sopt_p), so, &extmac)) != 0)
3598 break;
3599
3600 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3601 #else
3602 error = EOPNOTSUPP;
3603 #endif /* MAC_SOCKET */
3604 break;
3605
3606 case SO_PEERLABEL:
3607 #if CONFIG_MACF_SOCKET
3608 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3609 sizeof (extmac))) != 0 ||
3610 (error = mac_socketpeer_label_get(proc_ucred(
3611 sopt->sopt_p), so, &extmac)) != 0)
3612 break;
3613
3614 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3615 #else
3616 error = EOPNOTSUPP;
3617 #endif /* MAC_SOCKET */
3618 break;
3619
3620 #ifdef __APPLE_API_PRIVATE
3621 case SO_UPCALLCLOSEWAIT:
3622 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
3623 goto integer;
3624 #endif
3625 case SO_RANDOMPORT:
3626 optval = (so->so_flags & SOF_BINDRANDOMPORT);
3627 goto integer;
3628
3629 case SO_NP_EXTENSIONS: {
3630 struct so_np_extensions sonpx;
3631
3632 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0;
3633 sonpx.npx_mask = SONPX_MASK_VALID;
3634
3635 error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
3636 break;
3637 }
3638
3639 case SO_TRAFFIC_CLASS:
3640 optval = so->so_traffic_class;
3641 goto integer;
3642
3643 case SO_RECV_TRAFFIC_CLASS:
3644 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
3645 goto integer;
3646
3647 case SO_TRAFFIC_CLASS_STATS:
3648 error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats));
3649 break;
3650
3651 case SO_TRAFFIC_CLASS_DBG:
3652 error = sogetopt_tcdbg(so, sopt);
3653 break;
3654
3655 case SO_PRIVILEGED_TRAFFIC_CLASS:
3656 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
3657 goto integer;
3658
3659 case SO_DEFUNCTOK:
3660 optval = !(so->so_flags & SOF_NODEFUNCT);
3661 goto integer;
3662
3663 case SO_ISDEFUNCT:
3664 optval = (so->so_flags & SOF_DEFUNCT);
3665 goto integer;
3666
3667 case SO_OPPORTUNISTIC:
3668 optval = so_get_opportunistic(so);
3669 goto integer;
3670
3671 case SO_FLUSH:
3672 /* This option is not gettable */
3673 error = EINVAL;
3674 break;
3675
3676 case SO_RECV_ANYIF:
3677 optval = so_get_recv_anyif(so);
3678 goto integer;
3679
3680 default:
3681 error = ENOPROTOOPT;
3682 break;
3683 }
3684 socket_unlock(so, 1);
3685 return (error);
3686 }
3687 }
3688 /* The size limits on our soopt_getm are different from those on FreeBSD.
3689 * We limit the size of options to MCLBYTES. This will have to change
3690 * if we need to define options that need more space than MCLBYTES.
3691 */
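/*
 * Typical flow for callers that want the option value as an mbuf chain
 * (e.g. the IPv6 option code referenced in soopt_mcopyin() below),
 * sketched with most error handling omitted:
 *
 *	struct mbuf *m = NULL;
 *
 *	error = soopt_getm(sopt, &m);		size and allocate the chain
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	copy sopt_val into the chain
 */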
3692 int
3693 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3694 {
3695 struct mbuf *m, *m_prev;
3696 int sopt_size = sopt->sopt_valsize;
3697 int how;
3698
3699 if (sopt_size <= 0 || sopt_size > MCLBYTES)
3700 return (EMSGSIZE);
3701
3702 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
3703 MGET(m, how, MT_DATA);
3704 if (m == 0)
3705 return (ENOBUFS);
3706 if (sopt_size > MLEN) {
3707 MCLGET(m, how);
3708 if ((m->m_flags & M_EXT) == 0) {
3709 m_free(m);
3710 return (ENOBUFS);
3711 }
3712 m->m_len = min(MCLBYTES, sopt_size);
3713 } else {
3714 m->m_len = min(MLEN, sopt_size);
3715 }
3716 sopt_size -= m->m_len;
3717 *mp = m;
3718 m_prev = m;
3719
3720 while (sopt_size > 0) {
3721 MGET(m, how, MT_DATA);
3722 if (m == 0) {
3723 m_freem(*mp);
3724 return (ENOBUFS);
3725 }
3726 if (sopt_size > MLEN) {
3727 MCLGET(m, how);
3728 if ((m->m_flags & M_EXT) == 0) {
3729 m_freem(*mp);
3730 m_freem(m);
3731 return (ENOBUFS);
3732 }
3733 m->m_len = min(MCLBYTES, sopt_size);
3734 } else {
3735 m->m_len = min(MLEN, sopt_size);
3736 }
3737 sopt_size -= m->m_len;
3738 m_prev->m_next = m;
3739 m_prev = m;
3740 }
3741 return (0);
3742 }
3743
3744 /* copyin sopt data into mbuf chain */
3745 int
3746 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3747 {
3748 struct mbuf *m0 = m;
3749
3750 if (sopt->sopt_val == USER_ADDR_NULL)
3751 return (0);
3752 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3753 if (sopt->sopt_p != kernproc) {
3754 int error;
3755
3756 error = copyin(sopt->sopt_val, mtod(m, char *),
3757 m->m_len);
3758 if (error != 0) {
3759 m_freem(m0);
3760 return (error);
3761 }
3762 } else {
3763 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3764 mtod(m, char *), m->m_len);
3765 }
3766 sopt->sopt_valsize -= m->m_len;
3767 sopt->sopt_val += m->m_len;
3768 m = m->m_next;
3769 }
3770 if (m != NULL) /* enough should have been allocated at ip6_sooptmcopyin() */
3771 panic("soopt_mcopyin");
3772 return (0);
3773 }
3774
3775 /* copyout mbuf chain data into soopt */
3776 int
3777 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3778 {
3779 struct mbuf *m0 = m;
3780 size_t valsize = 0;
3781
3782 if (sopt->sopt_val == USER_ADDR_NULL)
3783 return (0);
3784 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3785 if (sopt->sopt_p != kernproc) {
3786 int error;
3787
3788 error = copyout(mtod(m, char *), sopt->sopt_val,
3789 m->m_len);
3790 if (error != 0) {
3791 m_freem(m0);
3792 return (error);
3793 }
3794 } else {
3795 bcopy(mtod(m, char *),
3796 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3797 }
3798 sopt->sopt_valsize -= m->m_len;
3799 sopt->sopt_val += m->m_len;
3800 valsize += m->m_len;
3801 m = m->m_next;
3802 }
3803 if (m != NULL) {
3804 /* a large enough soopt buffer should have been given from user-land */
3805 m_freem(m0);
3806 return (EINVAL);
3807 }
3808 sopt->sopt_valsize = valsize;
3809 return (0);
3810 }
3811
3812 void
3813 sohasoutofband(struct socket *so)
3814 {
3815
3816 if (so->so_pgid < 0)
3817 gsignal(-so->so_pgid, SIGURG);
3818 else if (so->so_pgid > 0)
3819 proc_signal(so->so_pgid, SIGURG);
3820 selwakeup(&so->so_rcv.sb_sel);
3821 }
3822
3823 int
3824 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3825 {
3826 struct proc *p = current_proc();
3827 int revents = 0;
3828
3829 socket_lock(so, 1);
3830
3831 if (events & (POLLIN | POLLRDNORM))
3832 if (soreadable(so))
3833 revents |= events & (POLLIN | POLLRDNORM);
3834
3835 if (events & (POLLOUT | POLLWRNORM))
3836 if (sowriteable(so))
3837 revents |= events & (POLLOUT | POLLWRNORM);
3838
3839 if (events & (POLLPRI | POLLRDBAND))
3840 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3841 revents |= events & (POLLPRI | POLLRDBAND);
3842
3843 if (revents == 0) {
3844 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3845 /*
3846 * Darwin sets the flag first,
3847 * BSD calls selrecord first
3848 */
3849 so->so_rcv.sb_flags |= SB_SEL;
3850 selrecord(p, &so->so_rcv.sb_sel, wql);
3851 }
3852
3853 if (events & (POLLOUT | POLLWRNORM)) {
3854 /*
3855 * Darwin sets the flag first,
3856 * BSD calls selrecord first
3857 */
3858 so->so_snd.sb_flags |= SB_SEL;
3859 selrecord(p, &so->so_snd.sb_sel, wql);
3860 }
3861 }
3862
3863 socket_unlock(so, 1);
3864 return (revents);
3865 }
3866
3867 int
3868 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3869 __unused struct proc *p)
3870 {
3871 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3872 struct klist *skl;
3873
3874 socket_lock(so, 1);
3875
3876 #if CONFIG_MACF_SOCKET
3877 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3878 socket_unlock(so, 1);
3879 return (1);
3880 }
3881 #endif /* MAC_SOCKET */
3882
3883 switch (kn->kn_filter) {
3884 case EVFILT_READ:
3885 kn->kn_fop = &soread_filtops;
3886 skl = &so->so_rcv.sb_sel.si_note;
3887 break;
3888 case EVFILT_WRITE:
3889 kn->kn_fop = &sowrite_filtops;
3890 skl = &so->so_snd.sb_sel.si_note;
3891 break;
3892 case EVFILT_SOCK:
3893 kn->kn_fop = &sock_filtops;
3894 skl = &so->so_klist;
3895 break;
3896 default:
3897 socket_unlock(so, 1);
3898 return (1);
3899 }
3900
3901 if (KNOTE_ATTACH(skl, kn)) {
3902 switch(kn->kn_filter) {
3903 case EVFILT_READ:
3904 so->so_rcv.sb_flags |= SB_KNOTE;
3905 break;
3906 case EVFILT_WRITE:
3907 so->so_snd.sb_flags |= SB_KNOTE;
3908 break;
3909 case EVFILT_SOCK:
3910 so->so_flags |= SOF_KNOTE;
3911 break;
3912 default:
3913 socket_unlock(so, 1);
3914 return (1);
3915 }
3916 }
3917 socket_unlock(so, 1);
3918 return (0);
3919 }
3920
3921 static void
3922 filt_sordetach(struct knote *kn)
3923 {
3924 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3925
3926 socket_lock(so, 1);
3927 if (so->so_rcv.sb_flags & SB_KNOTE)
3928 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3929 so->so_rcv.sb_flags &= ~SB_KNOTE;
3930 socket_unlock(so, 1);
3931 }
3932
3933 /*ARGSUSED*/
3934 static int
3935 filt_soread(struct knote *kn, long hint)
3936 {
3937 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3938
3939 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3940 socket_lock(so, 1);
3941
3942 if (so->so_options & SO_ACCEPTCONN) {
3943 int not_empty;
3944
3945 /* Radar 6615193: handle the listen case dynamically
3946 * for the kqueue read filter. This allows listen() to be called after
3947 * registering the kqueue EVFILT_READ filter.
3948 */
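/*
 * User-space sketch (`s` and `kq` are hypothetical descriptors):
 * registering EVFILT_READ before listen() works because this case
 * is re-evaluated on every filter call, and kevent() reports the
 * accept backlog length in the returned data field.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	listen(s, 16);
 */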
3949
3950 kn->kn_data = so->so_qlen;
3951 not_empty = !TAILQ_EMPTY(&so->so_comp);
3952
3953 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3954 socket_unlock(so, 1);
3955
3956 return (not_empty);
3957 }
3958
3959 /* socket isn't a listener */
3960
3961 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3962
3963 if (so->so_oobmark) {
3964 if (kn->kn_flags & EV_OOBAND) {
3965 kn->kn_data -= so->so_oobmark;
3966 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3967 socket_unlock(so, 1);
3968 return (1);
3969 }
3970 kn->kn_data = so->so_oobmark;
3971 kn->kn_flags |= EV_OOBAND;
3972 } else {
3973 if (so->so_state & SS_CANTRCVMORE) {
3974 kn->kn_flags |= EV_EOF;
3975 kn->kn_fflags = so->so_error;
3976 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3977 socket_unlock(so, 1);
3978 return (1);
3979 }
3980 }
3981
3982 if (so->so_state & SS_RCVATMARK) {
3983 if (kn->kn_flags & EV_OOBAND) {
3984 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3985 socket_unlock(so, 1);
3986 return (1);
3987 }
3988 kn->kn_flags |= EV_OOBAND;
3989 } else if (kn->kn_flags & EV_OOBAND) {
3990 kn->kn_data = 0;
3991 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3992 socket_unlock(so, 1);
3993 return (0);
3994 }
3995
3996 if (so->so_error) { /* temporary udp error */
3997 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3998 socket_unlock(so, 1);
3999 return (1);
4000 }
4001
4002 int64_t lowwat = so->so_rcv.sb_lowat;
4003 if (kn->kn_sfflags & NOTE_LOWAT)
4004 {
4005 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4006 lowwat = so->so_rcv.sb_hiwat;
4007 else if (kn->kn_sdata > lowwat)
4008 lowwat = kn->kn_sdata;
4009 }
4010
4011 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4012 socket_unlock(so, 1);
4013
4014 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
4015 }
4016
4017 static void
4018 filt_sowdetach(struct knote *kn)
4019 {
4020 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4021 socket_lock(so, 1);
4022
4023 if (so->so_snd.sb_flags & SB_KNOTE)
4024 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4025 so->so_snd.sb_flags &= ~SB_KNOTE;
4026 socket_unlock(so, 1);
4027 }
4028
4029 int
4030 so_wait_for_if_feedback(struct socket *so)
4031 {
4032 if ((so->so_proto->pr_domain->dom_family == AF_INET ||
4033 so->so_proto->pr_domain->dom_family == AF_INET6) &&
4034 (so->so_state & SS_ISCONNECTED)) {
4035 struct inpcb *inp = sotoinpcb(so);
4036 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4037 return (1);
4038 }
4039 return (0);
4040 }
4041
4042 /*ARGSUSED*/
4043 static int
4044 filt_sowrite(struct knote *kn, long hint)
4045 {
4046 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4047 int ret = 0;
4048
4049 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4050 socket_lock(so, 1);
4051
4052 kn->kn_data = sbspace(&so->so_snd);
4053 if (so->so_state & SS_CANTSENDMORE) {
4054 kn->kn_flags |= EV_EOF;
4055 kn->kn_fflags = so->so_error;
4056 ret = 1;
4057 goto out;
4058 }
4059 if (so->so_error) { /* temporary udp error */
4060 ret = 1;
4061 goto out;
4062 }
4063 if (((so->so_state & SS_ISCONNECTED) == 0) &&
4064 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4065 ret = 0;
4066 goto out;
4067 }
4068 int64_t lowwat = so->so_snd.sb_lowat;
4069 if (kn->kn_sfflags & NOTE_LOWAT)
4070 {
4071 if (kn->kn_sdata > so->so_snd.sb_hiwat)
4072 lowwat = so->so_snd.sb_hiwat;
4073 else if (kn->kn_sdata > lowwat)
4074 lowwat = kn->kn_sdata;
4075 }
4076 if (kn->kn_data >= lowwat) {
4077 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4078 ret = tcp_notsent_lowat_check(so);
4079 } else {
4080 ret = 1;
4081 }
4082 }
4083 if (so_wait_for_if_feedback(so))
4084 ret = 0;
4085 out:
4086 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4087 socket_unlock(so, 1);
4088 return (ret);
4089 }
4090
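/*
 * EVFILT_SOCK detach: remove the knote from the socket's klist and clear
 * SOF_KNOTE when no knotes remain.
 */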
4091 static void
4092 filt_sockdetach(struct knote *kn)
4093 {
4094 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4095 socket_lock(so, 1);
4096
4097 if ((so->so_flags & SOF_KNOTE) != 0)
4098 if (KNOTE_DETACH(&so->so_klist, kn))
4099 so->so_flags &= ~SOF_KNOTE;
4100 socket_unlock(so, 1);
4101 }
4102
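/*
 * EVFILT_SOCK event predicate: translate socket-layer hints (connection
 * reset, timeout, lost source address, interface denied, keepalive) and
 * current socket state (read/write shutdown, suspend/resume, pending
 * error) into the fflags the caller subscribed to; fires once any
 * requested event is pending.
 */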
4103 static int
4104 filt_sockev(struct knote *kn, long hint)
4105 {
4106 int ret = 0, locked = 0;
4107 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4108
4109 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4110 socket_lock(so, 1);
4111 locked = 1;
4112 }
4113
4114 switch (hint & SO_FILT_HINT_EV) {
4115 case SO_FILT_HINT_CONNRESET:
4116 if (kn->kn_sfflags & NOTE_CONNRESET)
4117 kn->kn_fflags |= NOTE_CONNRESET;
4118 break;
4119 case SO_FILT_HINT_TIMEOUT:
4120 if (kn->kn_sfflags & NOTE_TIMEOUT)
4121 kn->kn_fflags |= NOTE_TIMEOUT;
4122 break;
4123 case SO_FILT_HINT_NOSRCADDR:
4124 if (kn->kn_sfflags & NOTE_NOSRCADDR)
4125 kn->kn_fflags |= NOTE_NOSRCADDR;
4126 break;
4127 case SO_FILT_HINT_IFDENIED:
4128 if ((kn->kn_sfflags & NOTE_IFDENIED))
4129 kn->kn_fflags |= NOTE_IFDENIED;
4130 break;
4131 case SO_FILT_HINT_KEEPALIVE:
4132 if (kn->kn_sfflags & NOTE_KEEPALIVE)
4133 kn->kn_fflags |= NOTE_KEEPALIVE;
4134 }
4135
4136 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
4137 (so->so_state & SS_CANTRCVMORE))
4138 kn->kn_fflags |= NOTE_READCLOSED;
4139
4140 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
4141 (so->so_state & SS_CANTSENDMORE))
4142 kn->kn_fflags |= NOTE_WRITECLOSED;
4143
4144 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
4145 ((hint & SO_FILT_HINT_SUSPEND) ||
4146 (so->so_flags & SOF_SUSPENDED))) {
4147 kn->kn_fflags &=
4148 ~(NOTE_SUSPEND | NOTE_RESUME);
4149 kn->kn_fflags |= NOTE_SUSPEND;
4150 }
4151
4152 if ((kn->kn_sfflags & NOTE_RESUME) &&
4153 ((hint & SO_FILT_HINT_RESUME) ||
4154 (so->so_flags & SOF_SUSPENDED) == 0)) {
4155 kn->kn_fflags &=
4156 ~(NOTE_SUSPEND | NOTE_RESUME);
4157 kn->kn_fflags |= NOTE_RESUME;
4158 }
4159
4160 if (so->so_error != 0) {
4161 ret = 1;
4162 kn->kn_data = so->so_error;
4163 kn->kn_flags |= EV_EOF;
4164 } else {
4165 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4166 }
4167
4168 if (kn->kn_fflags != 0)
4169 ret = 1;
4170
4171 if (locked)
4172 socket_unlock(so, 1);
4173
4174 return (ret);
4175 }
4176
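/*
 * Fold the socket's connection state into the caller-supplied state word
 * (SOCKEV_CONNECTED / SOCKEV_DISCONNECTED).
 */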
4177 void
4178 get_sockev_state(struct socket *so, u_int32_t *statep) {
4179 u_int32_t state = *(statep);
4180
4181 if (so->so_state & SS_ISCONNECTED)
4182 state |= SOCKEV_CONNECTED;
4183 else
4184 state &= ~(SOCKEV_CONNECTED);
4185 state |= ((so->so_state & SS_ISDISCONNECTED) ?
4186 SOCKEV_DISCONNECTED : 0);
4187 *(statep) = state;
4188 return;
4189 }
4190
4191 #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1)
4192
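/*
 * Format the lock/unlock return addresses recorded for this socket into a
 * string for panic messages.  The result lives in a static buffer and is
 * only valid until the next call.
 */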
4193 __private_extern__ const char * solockhistory_nr(struct socket *so)
4194 {
4195 size_t n = 0;
4196 int i;
4197 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4198
4199 bzero(lock_history_str, sizeof(lock_history_str));
4200 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4201 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
4202 (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4203 (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4204 }
4205 return lock_history_str;
4206 }
4207
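/*
 * socket_lock: acquire the lock protecting this socket, using the
 * protocol's pr_lock when provided and the owning domain's mutex
 * otherwise.  The caller's return address is recorded for
 * solockhistory_nr(), and a use-count reference is taken when refcount
 * is nonzero.  Typical pairing (illustrative; see also soreference()
 * and sodereference() below):
 *
 *	socket_lock(so, 1);	lock and hold a reference
 *	...
 *	socket_unlock(so, 1);	drop the reference and unlock
 */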
4208 int
4209 socket_lock(struct socket *so, int refcount)
4210 {
4211 int error = 0;
4212 void *lr_saved;
4213
4214 lr_saved = __builtin_return_address(0);
4215
4216 if (so->so_proto->pr_lock) {
4217 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4218 } else {
4219 #ifdef MORE_LOCKING_DEBUG
4220 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4221 LCK_MTX_ASSERT_NOTOWNED);
4222 #endif
4223 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4224 if (refcount)
4225 so->so_usecount++;
4226 so->lock_lr[so->next_lock_lr] = lr_saved;
4227 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4228 }
4229
4230 return (error);
4231 }
4232
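/*
 * Release the lock taken by socket_lock().  With a nonzero refcount
 * argument the socket's use count is dropped as well; releasing the last
 * reference frees the socket via sofreelastref().
 */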
4233 int
4234 socket_unlock(struct socket *so, int refcount)
4235 {
4236 int error = 0;
4237 void *lr_saved;
4238 lck_mtx_t *mutex_held;
4239
4240 lr_saved = __builtin_return_address(0);
4241
4242 if (so == NULL || so->so_proto == NULL)
4243 panic("socket_unlock: null so_proto so=%p\n", so);
4244
4245 if (so->so_proto->pr_unlock) {
4246 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4247 } else {
4248 mutex_held = so->so_proto->pr_domain->dom_mtx;
4249 #ifdef MORE_LOCKING_DEBUG
4250 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4251 #endif
4252 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4253 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4254
4255 if (refcount) {
4256 if (so->so_usecount <= 0)
4257 panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
4258 so->so_usecount, so, so->so_proto->pr_domain->dom_family,
4259 so->so_type, so->so_proto->pr_protocol,
4260 solockhistory_nr(so));
4261
4262 so->so_usecount--;
4263 if (so->so_usecount == 0) {
4264 sofreelastref(so, 1);
4265 }
4266 }
4267 lck_mtx_unlock(mutex_held);
4268 }
4269
4270 return (error);
4271 }
4272
4273 /* Called with socket locked, will unlock socket */
4274 void
4275 sofree(struct socket *so)
4276 {
4277
4278 lck_mtx_t *mutex_held;
4279 if (so->so_proto->pr_getlock != NULL)
4280 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4281 else
4282 mutex_held = so->so_proto->pr_domain->dom_mtx;
4283 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4284
4285 sofreelastref(so, 0);
4286 }
4287
4288 void
4289 soreference(struct socket *so)
4290 {
4291 socket_lock(so, 1); /* lock and take one reference on the socket */
4292 socket_unlock(so, 0); /* unlock only */
4293 }
4294
4295 void
4296 sodereference(struct socket *so)
4297 {
4298 socket_lock(so, 0);
4299 socket_unlock(so, 1);
4300 }
4301
4302 /*
4303 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4304 * possibility of using jumbo clusters. Caller must ensure to hold
4305 * the socket lock.
4306 */
4307 void
4308 somultipages(struct socket *so, boolean_t set)
4309 {
4310 if (set)
4311 so->so_flags |= SOF_MULTIPAGES;
4312 else
4313 so->so_flags &= ~SOF_MULTIPAGES;
4314 }
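/*
 * Illustrative use only (hypothetical caller, not part of this file):
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		allow jumbo (16KB) mbuf clusters
 *	socket_unlock(so, 1);
 */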
4315
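/*
 * Return nonzero when the connected peer (foreign) address of an IPv4 or
 * IPv6 socket is a local address.
 */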
4316 int
4317 so_isdstlocal(struct socket *so) {
4318
4319 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4320
4321 if (so->so_proto->pr_domain->dom_family == AF_INET) {
4322 return inaddr_local(inp->inp_faddr);
4323 } else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
4324 return in6addr_local(&inp->in6p_faddr);
4325 }
4326 return 0;
4327 }
4328
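/*
 * First phase of defuncting: mark the socket SOF_DEFUNCT and set SB_DROP
 * on both socket buffers so no further data is queued.  A socket flagged
 * SOF_NODEFUNCT is left alone and EOPNOTSUPP is returned, unless noforce
 * is FALSE, in which case the opt-out flag is cleared and the socket is
 * defuncted anyway.
 */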
4329 int
4330 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4331 {
4332 int err = 0, defunct;
4333
4334 defunct = (so->so_flags & SOF_DEFUNCT);
4335 if (defunct) {
4336 if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP))
4337 panic("%s: SB_DROP not set", __func__);
4338 goto done;
4339 }
4340
4341 if (so->so_flags & SOF_NODEFUNCT) {
4342 if (noforce) {
4343 err = EOPNOTSUPP;
4344 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p "
4345 "[%d,%d] is not eligible for defunct (%d)\n",
4346 __func__, proc_selfpid(), proc_pid(p), level, so,
4347 INP_SOCKAF(so), INP_SOCKTYPE(so), err));
4348 return (err);
4349 }
4350 so->so_flags &= ~SOF_NODEFUNCT;
4351 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] "
4352 "defunct by force\n", __func__, proc_selfpid(), proc_pid(p),
4353 level, so, INP_SOCKAF(so), INP_SOCKTYPE(so)));
4354 }
4355
4356 so->so_flags |= SOF_DEFUNCT;
4357 /* Prevent further data from being appended to the socket buffers */
4358 so->so_snd.sb_flags |= SB_DROP;
4359 so->so_rcv.sb_flags |= SB_DROP;
4360
4361 done:
4362 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
4363 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
4364 INP_SOCKAF(so), INP_SOCKTYPE(so),
4365 defunct ? "is already" : "marked as"));
4366
4367 return (err);
4368 }
4369
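/*
 * Second phase of defuncting (requires a prior sosetdefunct()): wake any
 * threads blocked on the socket buffers, shut down both directions,
 * disconnect, set a pending error (EBADF) if none exists, release any
 * buffered data, and mark the socket SS_DEFUNCT.
 */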
4370 int
4371 sodefunct(struct proc *p, struct socket *so, int level)
4372 {
4373 struct sockbuf *rcv, *snd;
4374
4375 if (!(so->so_flags & SOF_DEFUNCT))
4376 panic("%s improperly called", __func__);
4377
4378 if (so->so_state & SS_DEFUNCT)
4379 goto done;
4380
4381 rcv = &so->so_rcv;
4382 snd = &so->so_snd;
4383
4384 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
4385 "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
4386 __func__, proc_selfpid(), proc_pid(p), level, so,
4387 INP_SOCKAF(so), INP_SOCKTYPE(so),
4388 (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
4389 (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
4390
4391 /*
4392 * Unwedge threads blocked on sbwait() and sb_lock().
4393 */
4394 sbwakeup(rcv);
4395 sbwakeup(snd);
4396
4397 if (rcv->sb_flags & SB_LOCK)
4398 sbunlock(rcv, 1);
4399 if (snd->sb_flags & SB_LOCK)
4400 sbunlock(snd, 1);
4401
4402 /*
4403 * Flush the buffers and disconnect. We explicitly call shutdown
4404 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4405 * states are set for the socket. This would also flush out data
4406 * hanging off the receive list of this socket.
4407 */
4408 (void) soshutdownlock(so, SHUT_RD);
4409 (void) soshutdownlock(so, SHUT_WR);
4410 (void) sodisconnectlocked(so);
4411
4412 /*
4413 * Explicitly handle connectionless-protocol disconnection
4414 * and release any remaining data in the socket buffers.
4415 */
4416 if (!(so->so_state & SS_ISDISCONNECTED))
4417 (void) soisdisconnected(so);
4418
4419 if (so->so_error == 0)
4420 so->so_error = EBADF;
4421
4422 if (rcv->sb_cc != 0)
4423 sbrelease(rcv);
4424 if (snd->sb_cc != 0)
4425 sbrelease(snd);
4426
4427 so->so_state |= SS_DEFUNCT;
4428
4429 done:
4430 return (0);
4431 }
4432
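/*
 * Set or clear INP_RECV_ANYIF on the socket's PCB (IPv4/IPv6 only);
 * other address families return EPROTONOSUPPORT.
 */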
4433 __private_extern__ int
4434 so_set_recv_anyif(struct socket *so, int optval)
4435 {
4436 int ret = 0;
4437
4438 #if INET6
4439 if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
4440 #else
4441 if (INP_SOCKAF(so) == AF_INET) {
4442 #endif /* !INET6 */
4443 if (optval)
4444 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4445 else
4446 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
4447 } else {
4448 ret = EPROTONOSUPPORT;
4449 }
4450
4451 return (ret);
4452 }
4453
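/*
 * Report whether INP_RECV_ANYIF is currently set on the socket's PCB.
 */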
4454 __private_extern__ int
4455 so_get_recv_anyif(struct socket *so)
4456 {
4457 int ret = 0;
4458
4459 #if INET6
4460 if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
4461 #else
4462 if (INP_SOCKAF(so) == AF_INET) {
4463 #endif /* !INET6 */
4464 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4465 }
4466
4467 return (ret);
4468 }