1 /*
2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/ip6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet/flow_divert.h>
108 #include <kern/zalloc.h>
109 #include <kern/locks.h>
110 #include <machine/limits.h>
111 #include <libkern/OSAtomic.h>
112 #include <pexpert/pexpert.h>
113 #include <kern/assert.h>
114 #include <kern/task.h>
115 #include <sys/kpi_mbuf.h>
116 #include <sys/mcache.h>
117 #include <sys/unpcb.h>
118
119 #if CONFIG_MACF
120 #include <security/mac.h>
121 #include <security/mac_framework.h>
122 #endif /* MAC */
123
124 #if MULTIPATH
125 #include <netinet/mp_pcb.h>
126 #include <netinet/mptcp_var.h>
127 #endif /* MULTIPATH */
128
129 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
130
131 #if DEBUG || DEVELOPMENT
132 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
133 #else
134 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
135 #endif
136
137 /* TODO: this should be in a header file somewhere */
138 extern char *proc_name_address(void *p);
139
140 static u_int32_t so_cache_hw; /* High water mark for socache */
141 static u_int32_t so_cache_timeouts; /* number of timeouts */
142 static u_int32_t so_cache_max_freed; /* max freed per timeout */
143 static u_int32_t cached_sock_count = 0;
144 STAILQ_HEAD(, socket) so_cache_head;
145 int max_cached_sock_count = MAX_CACHED_SOCKETS;
146 static u_int32_t so_cache_time;
147 static int socketinit_done;
148 static struct zone *so_cache_zone;
149
150 static lck_grp_t *so_cache_mtx_grp;
151 static lck_attr_t *so_cache_mtx_attr;
152 static lck_grp_attr_t *so_cache_mtx_grp_attr;
153 static lck_mtx_t *so_cache_mtx;
154
155 #include <machine/limits.h>
156
157 static void filt_sordetach(struct knote *kn);
158 static int filt_soread(struct knote *kn, long hint);
159 static void filt_sowdetach(struct knote *kn);
160 static int filt_sowrite(struct knote *kn, long hint);
161 static void filt_sockdetach(struct knote *kn);
162 static int filt_sockev(struct knote *kn, long hint);
163 static void filt_socktouch(struct knote *kn, struct kevent_internal_s *kev,
164 long type);
165
166 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
167 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
168
169 static struct filterops soread_filtops = {
170 .f_isfd = 1,
171 .f_detach = filt_sordetach,
172 .f_event = filt_soread,
173 };
174
175 static struct filterops sowrite_filtops = {
176 .f_isfd = 1,
177 .f_detach = filt_sowdetach,
178 .f_event = filt_sowrite,
179 };
180
181 static struct filterops sock_filtops = {
182 .f_isfd = 1,
183 .f_detach = filt_sockdetach,
184 .f_event = filt_sockev,
185 .f_touch = filt_socktouch,
186 };
187
188 SYSCTL_DECL(_kern_ipc);
189
190 #define EVEN_MORE_LOCKING_DEBUG 0
191
192 int socket_debug = 0;
193 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
194 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
195
196 static int socket_zone = M_SOCKET;
197 so_gen_t so_gencnt; /* generation count for sockets */
198
199 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
200 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
201
202 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
203 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
204 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
205 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
206 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
207 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
208 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
209 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
210 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
211
212 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
213
214 int somaxconn = SOMAXCONN;
215 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
216 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
217
218 /* Should we get a maximum also ??? */
219 static int sosendmaxchain = 65536;
220 static int sosendminchain = 16384;
221 static int sorecvmincopy = 16384;
222 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
223 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
224 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
226
227 /*
228 * Set to enable jumbo clusters (if available) for large writes when
229 * the socket is marked with SOF_MULTIPAGES; see below.
230 */
231 int sosendjcl = 1;
232 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
233 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
234
235 /*
236 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
237 * writes on the socket for all protocols on any network interfaces,
238 * depending upon sosendjcl above. Be extra careful when setting this
 239  * to 1, because sending packets that cross physical pages down to
240 * broken drivers (those that falsely assume that the physical pages
241 * are contiguous) might lead to system panics or silent data corruption.
242 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
243 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
244 * capable. Set this to 1 only for testing/debugging purposes.
245 */
246 int sosendjcl_ignore_capab = 0;
247 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
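/*
 * Illustrative usage, for testing/debugging only (assumes the usual
 * kern.ipc sysctl path for the OID declared above):
 *
 *	sysctl -w kern.ipc.sosendjcl_ignore_capab=1
 */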
249
250 /*
251 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
252 * writes on the socket for all protocols on any network interfaces.
253 * Be extra careful when setting this to 1, because sending down packets with
 254  * clusters larger than 2 KB might lead to system panics or data corruption.
 255  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 256  * on the outgoing interface.
257 * Set this to 1 for testing/debugging purposes only.
258 */
259 int sosendbigcl_ignore_capab = 0;
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
262
263 int sodefunctlog = 0;
264 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
265 &sodefunctlog, 0, "");
266
267 int sothrottlelog = 0;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
269 &sothrottlelog, 0, "");
270
271 int sorestrictrecv = 1;
272 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
273 &sorestrictrecv, 0, "Enable inbound interface restrictions");
274
275 int sorestrictsend = 1;
276 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
277 &sorestrictsend, 0, "Enable outbound interface restrictions");
278
279 int soreserveheadroom = 1;
280 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
281 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
282
283 extern struct inpcbinfo tcbinfo;
284
 285 /* TODO: these should be in a header file */
286 extern int get_inpcb_str_size(void);
287 extern int get_tcp_str_size(void);
288
289 static unsigned int sl_zone_size; /* size of sockaddr_list */
290 static struct zone *sl_zone; /* zone for sockaddr_list */
291
292 static unsigned int se_zone_size; /* size of sockaddr_entry */
293 static struct zone *se_zone; /* zone for sockaddr_entry */
294
295 vm_size_t so_cache_zone_element_size;
296
297 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
298 user_ssize_t *);
299 static void cached_sock_alloc(struct socket **, int);
300 static void cached_sock_free(struct socket *);
301
302 /*
 303  * Maximum number of extended background idle sockets per process.
 304  * Set to zero to disable further setting of the option.
305 */
306
307 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
308 #define SO_IDLE_BK_IDLE_TIME 600
309 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
310
311 struct soextbkidlestat soextbkidlestat;
312
313 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
314 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
315 "Maximum of extended background idle sockets per process");
316
317 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
318 &soextbkidlestat.so_xbkidle_time, 0,
319 "Time in seconds to keep extended background idle sockets");
320
321 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
323 "High water mark for extended background idle sockets");
324
325 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
326 &soextbkidlestat, soextbkidlestat, "");
327
328 int so_set_extended_bk_idle(struct socket *, int);
329
330 /*
331 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
332 * setting the DSCP code on the packet based on the service class; see
333 * <rdar://problem/11277343> for details.
334 */
335 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
336 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
337 &sotcdb, 0, "");
338
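/*
 * socketinit: one-time initialization of the socket layer.  Sets up the
 * cached-socket zone and its mutex, the sockaddr_list/sockaddr_entry
 * zones, the extended background idle defaults, and the PCB, socket
 * filter and traffic class subsystems.
 */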
339 void
340 socketinit(void)
341 {
342 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
343 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
344
345 #ifdef __LP64__
346 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
347 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
348 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
349 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
350 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
351 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
352 #else
353 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
354 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
355 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
356 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
357 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
358 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
359 #endif
360
361 if (socketinit_done) {
362 printf("socketinit: already called...\n");
363 return;
364 }
365 socketinit_done = 1;
366
367 PE_parse_boot_argn("socket_debug", &socket_debug,
368 sizeof (socket_debug));
369
370 /*
371 * allocate lock group attribute and group for socket cache mutex
372 */
373 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
374 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
375 so_cache_mtx_grp_attr);
376
377 /*
378 * allocate the lock attribute for socket cache mutex
379 */
380 so_cache_mtx_attr = lck_attr_alloc_init();
381
382 /* cached sockets mutex */
383 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
384 if (so_cache_mtx == NULL) {
385 panic("%s: unable to allocate so_cache_mtx\n", __func__);
386 /* NOTREACHED */
387 }
388 STAILQ_INIT(&so_cache_head);
389
390 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
391 + get_inpcb_str_size() + 4 + get_tcp_str_size());
392
393 so_cache_zone = zinit(so_cache_zone_element_size,
394 (120000 * so_cache_zone_element_size), 8192, "socache zone");
395 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
396 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
397
398 sl_zone_size = sizeof (struct sockaddr_list);
399 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
400 "sockaddr_list")) == NULL) {
401 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
402 /* NOTREACHED */
403 }
404 zone_change(sl_zone, Z_CALLERACCT, FALSE);
405 zone_change(sl_zone, Z_EXPAND, TRUE);
406
407 se_zone_size = sizeof (struct sockaddr_entry);
408 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
409 "sockaddr_entry")) == NULL) {
410 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
411 /* NOTREACHED */
412 }
413 zone_change(se_zone, Z_CALLERACCT, FALSE);
414 zone_change(se_zone, Z_EXPAND, TRUE);
415
416 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
417 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
418 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
419 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
420
421 in_pcbinit();
422 sflt_init();
423 socket_tclass_init();
424 #if MULTIPATH
425 mp_pcbinit();
426 #endif /* MULTIPATH */
427 }
428
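/*
 * cached_sock_alloc: hand out a socket from the cache of previously
 * freed PF_INET/SOCK_STREAM sockets if one is available; otherwise
 * carve a new socket out of so_cache_zone, reserving trailing space
 * for the saved inpcb and its per-protocol control block.
 */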
429 static void
430 cached_sock_alloc(struct socket **so, int waitok)
431 {
432 caddr_t temp;
433 uintptr_t offset;
434
435 lck_mtx_lock(so_cache_mtx);
436
437 if (!STAILQ_EMPTY(&so_cache_head)) {
438 VERIFY(cached_sock_count > 0);
439
440 *so = STAILQ_FIRST(&so_cache_head);
441 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
442 STAILQ_NEXT((*so), so_cache_ent) = NULL;
443
444 cached_sock_count--;
445 lck_mtx_unlock(so_cache_mtx);
446
447 temp = (*so)->so_saved_pcb;
448 bzero((caddr_t)*so, sizeof (struct socket));
449
450 (*so)->so_saved_pcb = temp;
451 } else {
452
453 lck_mtx_unlock(so_cache_mtx);
454
455 if (waitok)
456 *so = (struct socket *)zalloc(so_cache_zone);
457 else
458 *so = (struct socket *)zalloc_noblock(so_cache_zone);
459
460 if (*so == NULL)
461 return;
462
463 bzero((caddr_t)*so, sizeof (struct socket));
464
465 /*
466 * Define offsets for extra structures into our
467 * single block of memory. Align extra structures
468 * on longword boundaries.
469 */
470
471 offset = (uintptr_t)*so;
472 offset += sizeof (struct socket);
473
474 offset = ALIGN(offset);
475
476 (*so)->so_saved_pcb = (caddr_t)offset;
477 offset += get_inpcb_str_size();
478
479 offset = ALIGN(offset);
480
481 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
482 (caddr_t)offset;
483 }
484
485 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
486 }
487
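/*
 * cached_sock_free: return a socket to the cache for reuse, or free it
 * back to so_cache_zone when the cache already holds
 * max_cached_sock_count entries.
 */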
488 static void
489 cached_sock_free(struct socket *so)
490 {
491
492 lck_mtx_lock(so_cache_mtx);
493
494 so_cache_time = net_uptime();
495 if (++cached_sock_count > max_cached_sock_count) {
496 --cached_sock_count;
497 lck_mtx_unlock(so_cache_mtx);
498 zfree(so_cache_zone, so);
499 } else {
500 if (so_cache_hw < cached_sock_count)
501 so_cache_hw = cached_sock_count;
502
503 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
504
505 so->cache_timestamp = so_cache_time;
506 lck_mtx_unlock(so_cache_mtx);
507 }
508 }
509
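/*
 * so_update_last_owner_locked: record the pid, unique pid, executable
 * UUID and originator UUID of the most recent process to use this
 * socket; skipped for sockets created via sock_socket (last_pid == 0).
 */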
510 void
511 so_update_last_owner_locked(struct socket *so, proc_t self)
512 {
513 if (so->last_pid != 0) {
514 /*
515 * last_pid and last_upid should remain zero for sockets
516 * created using sock_socket. The check above achieves that
517 */
518 if (self == PROC_NULL)
519 self = current_proc();
520
521 if (so->last_upid != proc_uniqueid(self) ||
522 so->last_pid != proc_pid(self)) {
523 so->last_upid = proc_uniqueid(self);
524 so->last_pid = proc_pid(self);
525 proc_getexecutableuuid(self, so->last_uuid,
526 sizeof (so->last_uuid));
527 }
528 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
529 }
530 }
531
532 void
533 so_update_policy(struct socket *so)
534 {
535 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
536 (void) inp_update_policy(sotoinpcb(so));
537 }
538
539 #if NECP
540 static void
541 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
542 struct sockaddr *override_remote_addr)
543 {
544 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
545 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
546 override_remote_addr, 0);
547 }
548 #endif /* NECP */
549
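/*
 * so_cache_timer: reap cached sockets that have been idle longer than
 * SO_CACHE_TIME_LIMIT, freeing at most SO_CACHE_MAX_FREE_BATCH per run.
 * Returns TRUE when entries remain so the caller can reschedule.
 */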
550 boolean_t
551 so_cache_timer(void)
552 {
553 struct socket *p;
554 int n_freed = 0;
555 boolean_t rc = FALSE;
556
557 lck_mtx_lock(so_cache_mtx);
558 so_cache_timeouts++;
559 so_cache_time = net_uptime();
560
561 while (!STAILQ_EMPTY(&so_cache_head)) {
562 VERIFY(cached_sock_count > 0);
563 p = STAILQ_FIRST(&so_cache_head);
564 if ((so_cache_time - p->cache_timestamp) <
565 SO_CACHE_TIME_LIMIT)
566 break;
567
568 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
569 --cached_sock_count;
570
571 zfree(so_cache_zone, p);
572
573 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
574 so_cache_max_freed++;
575 break;
576 }
577 }
578
579 /* Schedule again if there is more to cleanup */
580 if (!STAILQ_EMPTY(&so_cache_head))
581 rc = TRUE;
582
583 lck_mtx_unlock(so_cache_mtx);
584 return (rc);
585 }
586
587 /*
588 * Get a socket structure from our zone, and initialize it.
589 * We don't implement `waitok' yet (see comments in uipc_domain.c).
590 * Note that it would probably be better to allocate socket
591 * and PCB at the same time, but I'm not convinced that all
592 * the protocols can be easily modified to do this.
593 */
594 struct socket *
595 soalloc(int waitok, int dom, int type)
596 {
597 struct socket *so;
598
599 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
600 cached_sock_alloc(&so, waitok);
601 } else {
602 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
603 M_WAITOK);
604 if (so != NULL)
605 bzero(so, sizeof (*so));
606 }
607 if (so != NULL) {
608 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
609 so->so_zone = socket_zone;
610 #if CONFIG_MACF_SOCKET
611 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
612 if (mac_socket_label_init(so, !waitok) != 0) {
613 sodealloc(so);
614 return (NULL);
615 }
616 #endif /* MAC_SOCKET */
617 }
618
619 return (so);
620 }
621
622 int
623 socreate_internal(int dom, struct socket **aso, int type, int proto,
624 struct proc *p, uint32_t flags, struct proc *ep)
625 {
626 struct protosw *prp;
627 struct socket *so;
628 int error = 0;
629
630 #if TCPDEBUG
631 extern int tcpconsdebug;
632 #endif
633
634 VERIFY(aso != NULL);
635 *aso = NULL;
636
637 if (proto != 0)
638 prp = pffindproto(dom, proto, type);
639 else
640 prp = pffindtype(dom, type);
641
642 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
643 if (pffinddomain(dom) == NULL)
644 return (EAFNOSUPPORT);
645 if (proto != 0) {
646 if (pffindprotonotype(dom, proto) != NULL)
647 return (EPROTOTYPE);
648 }
649 return (EPROTONOSUPPORT);
650 }
651 if (prp->pr_type != type)
652 return (EPROTOTYPE);
653 so = soalloc(1, dom, type);
654 if (so == NULL)
655 return (ENOBUFS);
656
657 if (flags & SOCF_ASYNC)
658 so->so_state |= SS_NBIO;
659 #if MULTIPATH
660 if (flags & SOCF_MP_SUBFLOW) {
661 /*
662 * A multipath subflow socket is used internally in the kernel,
 663 		 * therefore it does not have a file descriptor associated by
664 * default.
665 */
666 so->so_state |= SS_NOFDREF;
667 so->so_flags |= SOF_MP_SUBFLOW;
668 }
669 #endif /* MULTIPATH */
670
671 TAILQ_INIT(&so->so_incomp);
672 TAILQ_INIT(&so->so_comp);
673 so->so_type = type;
674 so->last_upid = proc_uniqueid(p);
675 so->last_pid = proc_pid(p);
676 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
677 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
678
679 if (ep != PROC_NULL && ep != p) {
680 so->e_upid = proc_uniqueid(ep);
681 so->e_pid = proc_pid(ep);
682 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
683 so->so_flags |= SOF_DELEGATED;
684 }
685
686 so->so_cred = kauth_cred_proc_ref(p);
687 if (!suser(kauth_cred_get(), NULL))
688 so->so_state |= SS_PRIV;
689
690 so->so_proto = prp;
691 so->so_rcv.sb_flags |= SB_RECV;
692 so->so_rcv.sb_so = so->so_snd.sb_so = so;
693 so->next_lock_lr = 0;
694 so->next_unlock_lr = 0;
695
696 #if CONFIG_MACF_SOCKET
697 mac_socket_label_associate(kauth_cred_get(), so);
698 #endif /* MAC_SOCKET */
699
700 /*
 701 	 * Attachment will create the per-pcb lock if necessary and
 702 	 * increase the refcount for creation; make sure this is done before
 703 	 * the socket is inserted in any lists.
704 */
705 so->so_usecount++;
706
707 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
708 if (error != 0) {
709 /*
710 * Warning:
711 * If so_pcb is not zero, the socket will be leaked,
 712 		 * so the protocol attachment handler must be coded carefully.
713 */
714 so->so_state |= SS_NOFDREF;
715 so->so_usecount--;
716 sofreelastref(so, 1); /* will deallocate the socket */
717 return (error);
718 }
719
720 atomic_add_32(&prp->pr_domain->dom_refs, 1);
721 TAILQ_INIT(&so->so_evlist);
722
723 /* Attach socket filters for this protocol */
724 sflt_initsock(so);
725 #if TCPDEBUG
726 if (tcpconsdebug == 2)
727 so->so_options |= SO_DEBUG;
728 #endif
729 so_set_default_traffic_class(so);
730
731 /*
732 * If this thread or task is marked to create backgrounded sockets,
733 * mark the socket as background.
734 */
735 if (proc_get_effective_thread_policy(current_thread(),
736 TASK_POLICY_NEW_SOCKETS_BG)) {
737 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
738 so->so_background_thread = current_thread();
739 }
740
741 switch (dom) {
742 /*
743 * Don't mark Unix domain, system or multipath sockets as
744 * eligible for defunct by default.
745 */
746 case PF_LOCAL:
747 case PF_SYSTEM:
748 case PF_MULTIPATH:
749 so->so_flags |= SOF_NODEFUNCT;
750 break;
751 default:
752 break;
753 }
754
755 /*
 756 	 * Entitlements can't be checked at socket creation time except when the
 757 	 * application requested a feature guarded by a privilege (cf. socket
758 * delegation).
759 * The priv(9) and the Sandboxing APIs are designed with the idea that
760 * a privilege check should only be triggered by a userland request.
761 * A privilege check at socket creation time is time consuming and
762 * could trigger many authorisation error messages from the security
763 * APIs.
764 */
765
766 *aso = so;
767
768 return (0);
769 }
770
771 /*
772 * Returns: 0 Success
773 * EAFNOSUPPORT
774 * EPROTOTYPE
775 * EPROTONOSUPPORT
776 * ENOBUFS
777 * <pru_attach>:ENOBUFS[AF_UNIX]
778 * <pru_attach>:ENOBUFS[TCP]
779 * <pru_attach>:ENOMEM[TCP]
780 * <pru_attach>:??? [other protocol families, IPSEC]
781 */
782 int
783 socreate(int dom, struct socket **aso, int type, int proto)
784 {
785 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
786 PROC_NULL));
787 }
788
789 int
790 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
791 {
792 int error = 0;
793 struct proc *ep = PROC_NULL;
794
795 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
796 error = ESRCH;
797 goto done;
798 }
799
800 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
801
802 /*
803 * It might not be wise to hold the proc reference when calling
804 * socreate_internal since it calls soalloc with M_WAITOK
805 */
806 done:
807 if (ep != PROC_NULL)
808 proc_rele(ep);
809
810 return (error);
811 }
812
813 /*
814 * Returns: 0 Success
815 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
816 * <pru_bind>:EAFNOSUPPORT Address family not supported
817 * <pru_bind>:EADDRNOTAVAIL Address not available.
818 * <pru_bind>:EINVAL Invalid argument
819 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
820 * <pru_bind>:EACCES Permission denied
821 * <pru_bind>:EADDRINUSE Address in use
822 * <pru_bind>:EAGAIN Resource unavailable, try again
823 * <pru_bind>:EPERM Operation not permitted
824 * <pru_bind>:???
825 * <sf_bind>:???
826 *
827 * Notes: It's not possible to fully enumerate the return codes above,
828 * since socket filter authors and protocol family authors may
829 * not choose to limit their error returns to those listed, even
830 * though this may result in some software operating incorrectly.
831 *
832 * The error codes which are enumerated above are those known to
833 * be returned by the tcp_usr_bind function supplied.
834 */
835 int
836 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
837 {
838 struct proc *p = current_proc();
839 int error = 0;
840
841 if (dolock)
842 socket_lock(so, 1);
843 VERIFY(so->so_usecount > 1);
844
845 so_update_last_owner_locked(so, p);
846 so_update_policy(so);
847
848 #if NECP
849 so_update_necp_policy(so, nam, NULL);
850 #endif /* NECP */
851
852 /*
853 * If this is a bind request on a socket that has been marked
854 * as inactive, reject it now before we go any further.
855 */
856 if (so->so_flags & SOF_DEFUNCT) {
857 error = EINVAL;
858 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
859 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
860 SOCK_DOM(so), SOCK_TYPE(so), error));
861 goto out;
862 }
863
864 /* Socket filter */
865 error = sflt_bind(so, nam);
866
867 if (error == 0)
868 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
869 out:
870 if (dolock)
871 socket_unlock(so, 1);
872
873 if (error == EJUSTRETURN)
874 error = 0;
875
876 return (error);
877 }
878
879 void
880 sodealloc(struct socket *so)
881 {
882 kauth_cred_unref(&so->so_cred);
883
884 /* Remove any filters */
885 sflt_termsock(so);
886
887 #if CONTENT_FILTER
888 cfil_sock_detach(so);
889 #endif /* CONTENT_FILTER */
890
891 /* Delete the state allocated for msg queues on a socket */
892 if (so->so_flags & SOF_ENABLE_MSGS) {
893 FREE(so->so_msg_state, M_TEMP);
894 so->so_msg_state = NULL;
895 }
896 VERIFY(so->so_msg_state == NULL);
897
898 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
899
900 #if CONFIG_MACF_SOCKET
901 mac_socket_label_destroy(so);
902 #endif /* MAC_SOCKET */
903
904 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
905 cached_sock_free(so);
906 } else {
907 FREE_ZONE(so, sizeof (*so), so->so_zone);
908 }
909 }
910
911 /*
912 * Returns: 0 Success
913 * EINVAL
914 * EOPNOTSUPP
915 * <pru_listen>:EINVAL[AF_UNIX]
916 * <pru_listen>:EINVAL[TCP]
917 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
918 * <pru_listen>:EINVAL[TCP] Invalid argument
919 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
920 * <pru_listen>:EACCES[TCP] Permission denied
921 * <pru_listen>:EADDRINUSE[TCP] Address in use
922 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
923 * <pru_listen>:EPERM[TCP] Operation not permitted
924 * <sf_listen>:???
925 *
926 * Notes: Other <pru_listen> returns depend on the protocol family; all
927 * <sf_listen> returns depend on what the filter author causes
928 * their filter to return.
929 */
930 int
931 solisten(struct socket *so, int backlog)
932 {
933 struct proc *p = current_proc();
934 int error = 0;
935
936 socket_lock(so, 1);
937
938 so_update_last_owner_locked(so, p);
939 so_update_policy(so);
940
941 #if NECP
942 so_update_necp_policy(so, NULL, NULL);
943 #endif /* NECP */
944
945 if (so->so_proto == NULL) {
946 error = EINVAL;
947 goto out;
948 }
949 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
950 error = EOPNOTSUPP;
951 goto out;
952 }
953
954 /*
955 * If the listen request is made on a socket that is not fully
956 * disconnected, or on a socket that has been marked as inactive,
957 * reject the request now.
958 */
959 if ((so->so_state &
960 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
961 (so->so_flags & SOF_DEFUNCT)) {
962 error = EINVAL;
963 if (so->so_flags & SOF_DEFUNCT) {
964 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
965 "(%d)\n", __func__, proc_pid(p),
966 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
967 SOCK_DOM(so), SOCK_TYPE(so), error));
968 }
969 goto out;
970 }
971
972 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
973 error = EPERM;
974 goto out;
975 }
976
977 error = sflt_listen(so);
978 if (error == 0)
979 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
980
981 if (error) {
982 if (error == EJUSTRETURN)
983 error = 0;
984 goto out;
985 }
986
987 if (TAILQ_EMPTY(&so->so_comp))
988 so->so_options |= SO_ACCEPTCONN;
989 /*
990 * POSIX: The implementation may have an upper limit on the length of
 991 	 * the listen queue, either global or per accepting socket. If backlog
992 * exceeds this limit, the length of the listen queue is set to the
993 * limit.
994 *
995 * If listen() is called with a backlog argument value that is less
996 * than 0, the function behaves as if it had been called with a backlog
997 * argument value of 0.
998 *
999 * A backlog argument of 0 may allow the socket to accept connections,
1000 * in which case the length of the listen queue may be set to an
1001 * implementation-defined minimum value.
1002 */
1003 if (backlog <= 0 || backlog > somaxconn)
1004 backlog = somaxconn;
1005
1006 so->so_qlimit = backlog;
1007 out:
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
1011
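/*
 * sofreelastref: tear down a socket whose last reference is being
 * dropped.  If the socket is not yet ready to be freed, only its
 * select/upcall state is cleared; otherwise it is removed from its
 * listening head's queues, its buffers are flushed, and the memory is
 * released when `dealloc' is set.
 */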
1012 void
1013 sofreelastref(struct socket *so, int dealloc)
1014 {
1015 struct socket *head = so->so_head;
1016
1017 /* Assume socket is locked */
1018
1019 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1020 selthreadclear(&so->so_snd.sb_sel);
1021 selthreadclear(&so->so_rcv.sb_sel);
1022 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1023 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1024 so->so_event = sonullevent;
1025 return;
1026 }
1027 if (head != NULL) {
1028 socket_lock(head, 1);
1029 if (so->so_state & SS_INCOMP) {
1030 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1031 head->so_incqlen--;
1032 } else if (so->so_state & SS_COMP) {
1033 /*
1034 * We must not decommission a socket that's
1035 * on the accept(2) queue. If we do, then
1036 * accept(2) may hang after select(2) indicated
1037 * that the listening socket was ready.
1038 */
1039 selthreadclear(&so->so_snd.sb_sel);
1040 selthreadclear(&so->so_rcv.sb_sel);
1041 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1042 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1043 so->so_event = sonullevent;
1044 socket_unlock(head, 1);
1045 return;
1046 } else {
1047 panic("sofree: not queued");
1048 }
1049 head->so_qlen--;
1050 so->so_state &= ~SS_INCOMP;
1051 so->so_head = NULL;
1052 socket_unlock(head, 1);
1053 }
1054 sowflush(so);
1055 sorflush(so);
1056
1057 #if FLOW_DIVERT
1058 if (so->so_flags & SOF_FLOW_DIVERT) {
1059 flow_divert_detach(so);
1060 }
1061 #endif /* FLOW_DIVERT */
1062
1063 /* 3932268: disable upcall */
1064 so->so_rcv.sb_flags &= ~SB_UPCALL;
1065 so->so_snd.sb_flags &= ~SB_UPCALL;
1066 so->so_event = sonullevent;
1067
1068 if (dealloc)
1069 sodealloc(so);
1070 }
1071
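/*
 * soclose_wait_locked: when SOF_UPCALLCLOSEWAIT is set, block the
 * closing thread until all outstanding socket upcalls have drained.
 */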
1072 void
1073 soclose_wait_locked(struct socket *so)
1074 {
1075 lck_mtx_t *mutex_held;
1076
1077 if (so->so_proto->pr_getlock != NULL)
1078 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1079 else
1080 mutex_held = so->so_proto->pr_domain->dom_mtx;
1081 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1082
1083 /*
1084 * Double check here and return if there's no outstanding upcall;
1085 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1086 */
1087 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1088 return;
1089 so->so_rcv.sb_flags &= ~SB_UPCALL;
1090 so->so_snd.sb_flags &= ~SB_UPCALL;
1091 so->so_flags |= SOF_CLOSEWAIT;
1092 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1093 "soclose_wait_locked", NULL);
1094 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1095 so->so_flags &= ~SOF_CLOSEWAIT;
1096 }
1097
1098 /*
1099 * Close a socket on last file table reference removal.
1100 * Initiate disconnect if connected.
1101 * Free socket when disconnect complete.
1102 */
1103 int
1104 soclose_locked(struct socket *so)
1105 {
1106 int error = 0;
1107 lck_mtx_t *mutex_held;
1108 struct timespec ts;
1109
1110 if (so->so_usecount == 0) {
1111 panic("soclose: so=%p refcount=0\n", so);
1112 /* NOTREACHED */
1113 }
1114
1115 sflt_notify(so, sock_evt_closing, NULL);
1116
1117 if (so->so_upcallusecount)
1118 soclose_wait_locked(so);
1119
1120 #if CONTENT_FILTER
1121 /*
1122 * We have to wait until the content filters are done
1123 */
1124 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1125 cfil_sock_close_wait(so);
1126 cfil_sock_is_closed(so);
1127 cfil_sock_detach(so);
1128 }
1129 #endif /* CONTENT_FILTER */
1130
1131 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1132 soresume(current_proc(), so, 1);
1133 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1134 }
1135
1136 if ((so->so_options & SO_ACCEPTCONN)) {
1137 struct socket *sp, *sonext;
1138 int socklock = 0;
1139
1140 /*
 1141 		 * We do not want new connections to be added
 1142 		 * to the connection queues.
1143 */
1144 so->so_options &= ~SO_ACCEPTCONN;
1145
1146 for (sp = TAILQ_FIRST(&so->so_incomp);
1147 sp != NULL; sp = sonext) {
1148 sonext = TAILQ_NEXT(sp, so_list);
1149
1150 /*
1151 * Radar 5350314
 1152 			 * Skip sockets thrown away by tcpdropdropblreq;
 1153 			 * they will get cleaned up by the garbage collection.
 1154 			 * Otherwise, remove the incomp socket from the queue
 1155 			 * and let soabort trigger the appropriate cleanup.
1156 */
1157 if (sp->so_flags & SOF_OVERFLOW)
1158 continue;
1159
1160 if (so->so_proto->pr_getlock != NULL) {
1161 /*
1162 * Lock ordering for consistency with the
1163 * rest of the stack, we lock the socket
1164 * first and then grabb the head.
1165 */
1166 socket_unlock(so, 0);
1167 socket_lock(sp, 1);
1168 socket_lock(so, 0);
1169 socklock = 1;
1170 }
1171
1172 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1173 so->so_incqlen--;
1174
1175 if (sp->so_state & SS_INCOMP) {
1176 sp->so_state &= ~SS_INCOMP;
1177 sp->so_head = NULL;
1178
1179 (void) soabort(sp);
1180 }
1181
1182 if (socklock)
1183 socket_unlock(sp, 1);
1184 }
1185
1186 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1187 /* Dequeue from so_comp since sofree() won't do it */
1188 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1189 so->so_qlen--;
1190
1191 if (so->so_proto->pr_getlock != NULL) {
1192 socket_unlock(so, 0);
1193 socket_lock(sp, 1);
1194 }
1195
1196 if (sp->so_state & SS_COMP) {
1197 sp->so_state &= ~SS_COMP;
1198 sp->so_head = NULL;
1199
1200 (void) soabort(sp);
1201 }
1202
1203 if (so->so_proto->pr_getlock != NULL) {
1204 socket_unlock(sp, 1);
1205 socket_lock(so, 0);
1206 }
1207 }
1208 }
1209 if (so->so_pcb == NULL) {
1210 /* 3915887: mark the socket as ready for dealloc */
1211 so->so_flags |= SOF_PCBCLEARING;
1212 goto discard;
1213 }
1214 if (so->so_state & SS_ISCONNECTED) {
1215 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1216 error = sodisconnectlocked(so);
1217 if (error)
1218 goto drop;
1219 }
1220 if (so->so_options & SO_LINGER) {
1221 if ((so->so_state & SS_ISDISCONNECTING) &&
1222 (so->so_state & SS_NBIO))
1223 goto drop;
1224 if (so->so_proto->pr_getlock != NULL)
1225 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1226 else
1227 mutex_held = so->so_proto->pr_domain->dom_mtx;
1228 while (so->so_state & SS_ISCONNECTED) {
1229 ts.tv_sec = (so->so_linger/100);
1230 ts.tv_nsec = (so->so_linger % 100) *
1231 NSEC_PER_USEC * 1000 * 10;
1232 error = msleep((caddr_t)&so->so_timeo,
1233 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1234 if (error) {
1235 /*
 1236 					 * It's OK when the timer fires;
 1237 					 * don't report an error.
1238 */
1239 if (error == EWOULDBLOCK)
1240 error = 0;
1241 break;
1242 }
1243 }
1244 }
1245 }
1246 drop:
1247 if (so->so_usecount == 0) {
1248 panic("soclose: usecount is zero so=%p\n", so);
1249 /* NOTREACHED */
1250 }
1251 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1252 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1253 if (error == 0)
1254 error = error2;
1255 }
1256 if (so->so_usecount <= 0) {
1257 panic("soclose: usecount is zero so=%p\n", so);
1258 /* NOTREACHED */
1259 }
1260 discard:
1261 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1262 (so->so_state & SS_NOFDREF)) {
1263 panic("soclose: NOFDREF");
1264 /* NOTREACHED */
1265 }
1266 so->so_state |= SS_NOFDREF;
1267
1268 if (so->so_flags & SOF_MP_SUBFLOW)
1269 so->so_flags &= ~SOF_MP_SUBFLOW;
1270
1271 if ((so->so_flags & SOF_KNOTE) != 0)
1272 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1273
1274 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1275 evsofree(so);
1276
1277 so->so_usecount--;
1278 sofree(so);
1279 return (error);
1280 }
1281
1282 int
1283 soclose(struct socket *so)
1284 {
1285 int error = 0;
1286 socket_lock(so, 1);
1287
1288 if (so->so_retaincnt == 0) {
1289 error = soclose_locked(so);
1290 } else {
1291 /*
 1292 		 * If the FD is going away but the socket is
 1293 		 * retained in the kernel, remove its reference.
1294 */
1295 so->so_usecount--;
1296 if (so->so_usecount < 2)
1297 panic("soclose: retaincnt non null and so=%p "
1298 "usecount=%d\n", so, so->so_usecount);
1299 }
1300 socket_unlock(so, 1);
1301 return (error);
1302 }
1303
1304 /*
1305 * Must be called at splnet...
1306 */
1307 /* Should already be locked */
1308 int
1309 soabort(struct socket *so)
1310 {
1311 int error;
1312
1313 #ifdef MORE_LOCKING_DEBUG
1314 lck_mtx_t *mutex_held;
1315
1316 if (so->so_proto->pr_getlock != NULL)
1317 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1318 else
1319 mutex_held = so->so_proto->pr_domain->dom_mtx;
1320 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1321 #endif
1322
1323 if ((so->so_flags & SOF_ABORTED) == 0) {
1324 so->so_flags |= SOF_ABORTED;
1325 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1326 if (error) {
1327 sofree(so);
1328 return (error);
1329 }
1330 }
1331 return (0);
1332 }
1333
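/*
 * soacceptlock: complete acceptance of a connection that has already
 * been dequeued from the listening socket; clears SS_NOFDREF and asks
 * the protocol for the peer's address.
 */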
1334 int
1335 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1336 {
1337 int error;
1338
1339 if (dolock)
1340 socket_lock(so, 1);
1341
1342 so_update_last_owner_locked(so, PROC_NULL);
1343 so_update_policy(so);
1344 #if NECP
1345 so_update_necp_policy(so, NULL, NULL);
1346 #endif /* NECP */
1347
1348 if ((so->so_state & SS_NOFDREF) == 0)
1349 panic("soaccept: !NOFDREF");
1350 so->so_state &= ~SS_NOFDREF;
1351 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1352
1353 if (dolock)
1354 socket_unlock(so, 1);
1355 return (error);
1356 }
1357
1358 int
1359 soaccept(struct socket *so, struct sockaddr **nam)
1360 {
1361 return (soacceptlock(so, nam, 1));
1362 }
1363
1364 int
1365 soacceptfilter(struct socket *so)
1366 {
1367 struct sockaddr *local = NULL, *remote = NULL;
1368 int error = 0;
1369 struct socket *head = so->so_head;
1370
1371 /*
1372 * Hold the lock even if this socket has not been made visible
1373 * to the filter(s). For sockets with global locks, this protects
1374 * against the head or peer going away
1375 */
1376 socket_lock(so, 1);
1377 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1378 sogetaddr_locked(so, &local, 0) != 0) {
1379 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1380 so->so_head = NULL;
1381 socket_unlock(so, 1);
1382 soclose(so);
1383 /* Out of resources; try it again next time */
1384 error = ECONNABORTED;
1385 goto done;
1386 }
1387
1388 error = sflt_accept(head, so, local, remote);
1389
1390 /*
1391 * If we get EJUSTRETURN from one of the filters, mark this socket
1392 * as inactive and return it anyway. This newly accepted socket
1393 * will be disconnected later before we hand it off to the caller.
1394 */
1395 if (error == EJUSTRETURN) {
1396 error = 0;
1397 (void) sosetdefunct(current_proc(), so,
1398 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1399 }
1400
1401 if (error != 0) {
1402 /*
 1403 		 * This may seem like a duplication of the above error
1404 * handling part when we return ECONNABORTED, except
1405 * the following is done while holding the lock since
1406 * the socket has been exposed to the filter(s) earlier.
1407 */
1408 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1409 so->so_head = NULL;
1410 socket_unlock(so, 1);
1411 soclose(so);
1412 /* Propagate socket filter's error code to the caller */
1413 } else {
1414 socket_unlock(so, 1);
1415 }
1416 done:
1417 /* Callee checks for NULL pointer */
1418 sock_freeaddr(remote);
1419 sock_freeaddr(local);
1420 return (error);
1421 }
1422
1423 /*
1424 * Returns: 0 Success
1425 * EOPNOTSUPP Operation not supported on socket
1426 * EISCONN Socket is connected
1427 * <pru_connect>:EADDRNOTAVAIL Address not available.
1428 * <pru_connect>:EINVAL Invalid argument
1429 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1430 * <pru_connect>:EACCES Permission denied
1431 * <pru_connect>:EADDRINUSE Address in use
1432 * <pru_connect>:EAGAIN Resource unavailable, try again
1433 * <pru_connect>:EPERM Operation not permitted
1434 * <sf_connect_out>:??? [anything a filter writer might set]
1435 */
1436 int
1437 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1438 {
1439 int error;
1440 struct proc *p = current_proc();
1441
1442 if (dolock)
1443 socket_lock(so, 1);
1444
1445 so_update_last_owner_locked(so, p);
1446 so_update_policy(so);
1447
1448 #if NECP
1449 so_update_necp_policy(so, NULL, nam);
1450 #endif /* NECP */
1451
1452 /*
1453 * If this is a listening socket or if this is a previously-accepted
1454 * socket that has been marked as inactive, reject the connect request.
1455 */
1456 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1457 error = EOPNOTSUPP;
1458 if (so->so_flags & SOF_DEFUNCT) {
1459 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1460 "(%d)\n", __func__, proc_pid(p),
1461 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1462 SOCK_DOM(so), SOCK_TYPE(so), error));
1463 }
1464 if (dolock)
1465 socket_unlock(so, 1);
1466 return (error);
1467 }
1468
1469 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1470 if (dolock)
1471 socket_unlock(so, 1);
1472 return (EPERM);
1473 }
1474
1475 /*
1476 * If protocol is connection-based, can only connect once.
1477 * Otherwise, if connected, try to disconnect first.
1478 * This allows user to disconnect by connecting to, e.g.,
1479 * a null address.
1480 */
1481 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1482 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1483 (error = sodisconnectlocked(so)))) {
1484 error = EISCONN;
1485 } else {
1486 /*
1487 * Run connect filter before calling protocol:
1488 * - non-blocking connect returns before completion;
1489 */
1490 error = sflt_connectout(so, nam);
1491 if (error != 0) {
1492 if (error == EJUSTRETURN)
1493 error = 0;
1494 } else {
1495 error = (*so->so_proto->pr_usrreqs->pru_connect)
1496 (so, nam, p);
1497 }
1498 }
1499 if (dolock)
1500 socket_unlock(so, 1);
1501 return (error);
1502 }
1503
1504 int
1505 soconnect(struct socket *so, struct sockaddr *nam)
1506 {
1507 return (soconnectlock(so, nam, 1));
1508 }
1509
1510 /*
1511 * Returns: 0 Success
1512 * <pru_connect2>:EINVAL[AF_UNIX]
1513 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1514 * <pru_connect2>:??? [other protocol families]
1515 *
1516 * Notes: <pru_connect2> is not supported by [TCP].
1517 */
1518 int
1519 soconnect2(struct socket *so1, struct socket *so2)
1520 {
1521 int error;
1522
1523 socket_lock(so1, 1);
1524 if (so2->so_proto->pr_lock)
1525 socket_lock(so2, 1);
1526
1527 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1528
1529 socket_unlock(so1, 1);
1530 if (so2->so_proto->pr_lock)
1531 socket_unlock(so2, 1);
1532 return (error);
1533 }
1534
1535 int
1536 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1537 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1538 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1539 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1540 {
1541 int error;
1542
1543 so_update_last_owner_locked(so, p);
1544 so_update_policy(so);
1545
1546 /*
1547 * If this is a listening socket or if this is a previously-accepted
1548 * socket that has been marked as inactive, reject the connect request.
1549 */
1550 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1551 error = EOPNOTSUPP;
1552 if (so->so_flags & SOF_DEFUNCT) {
1553 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1554 "(%d)\n", __func__, proc_pid(p),
1555 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1556 SOCK_DOM(so), SOCK_TYPE(so), error));
1557 }
1558 return (error);
1559 }
1560
1561 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1562 return (EPERM);
1563
1564 /*
1565 * If protocol is connection-based, can only connect once
1566 * unless PR_MULTICONN is set. Otherwise, if connected,
1567 * try to disconnect first. This allows user to disconnect
1568 * by connecting to, e.g., a null address.
1569 */
1570 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1571 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1572 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1573 (error = sodisconnectlocked(so)) != 0)) {
1574 error = EISCONN;
1575 } else {
1576 /*
1577 * Run connect filter before calling protocol:
1578 * - non-blocking connect returns before completion;
1579 */
1580 error = sflt_connectxout(so, dst_sl);
1581 if (error != 0) {
1582 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1583 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1584 if (error == EJUSTRETURN)
1585 error = 0;
1586 } else {
1587 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1588 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1589 flags, arg, arglen, auio, bytes_written);
1590 }
1591 }
1592
1593 return (error);
1594 }
1595
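/*
 * sodisconnectlocked: initiate a disconnect on a connected socket;
 * the caller must already hold the socket lock.
 */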
1596 int
1597 sodisconnectlocked(struct socket *so)
1598 {
1599 int error;
1600
1601 if ((so->so_state & SS_ISCONNECTED) == 0) {
1602 error = ENOTCONN;
1603 goto bad;
1604 }
1605 if (so->so_state & SS_ISDISCONNECTING) {
1606 error = EALREADY;
1607 goto bad;
1608 }
1609
1610 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1611 if (error == 0)
1612 sflt_notify(so, sock_evt_disconnected, NULL);
1613
1614 bad:
1615 return (error);
1616 }
1617
1618 /* Locking version */
1619 int
1620 sodisconnect(struct socket *so)
1621 {
1622 int error;
1623
1624 socket_lock(so, 1);
1625 error = sodisconnectlocked(so);
1626 socket_unlock(so, 1);
1627 return (error);
1628 }
1629
1630 int
1631 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1632 {
1633 int error;
1634
1635 /*
1636 * Call the protocol disconnectx handler; let it handle all
1637 * matters related to the connection state of this session.
1638 */
1639 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1640 if (error == 0) {
1641 /*
1642 * The event applies only for the session, not for
1643 * the disconnection of individual subflows.
1644 */
1645 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1646 sflt_notify(so, sock_evt_disconnected, NULL);
1647 }
1648 return (error);
1649 }
1650
1651 int
1652 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1653 {
1654 int error;
1655
1656 socket_lock(so, 1);
1657 error = sodisconnectxlocked(so, aid, cid);
1658 socket_unlock(so, 1);
1659 return (error);
1660 }
1661
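/*
 * sopeelofflocked: hand the peel-off request to the protocol, which
 * detaches the association identified by `aid' into its own socket
 * (used by the multipath stack).
 */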
1662 int
1663 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1664 {
1665 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1666 }
1667
1668 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1669
1670 /*
1671 * sosendcheck will lock the socket buffer if it isn't locked and
1672 * verify that there is space for the data being inserted.
1673 *
1674 * Returns: 0 Success
1675 * EPIPE
1676 * sblock:EWOULDBLOCK
1677 * sblock:EINTR
1678 * sbwait:EBADF
1679 * sbwait:EINTR
1680 * [so_error]:???
1681 */
1682 int
1683 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1684 int32_t clen, int32_t atomic, int flags, int *sblocked,
1685 struct mbuf *control)
1686 {
1687 int error = 0;
1688 int32_t space;
1689 int assumelock = 0;
1690
1691 restart:
1692 if (*sblocked == 0) {
1693 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1694 so->so_send_filt_thread != 0 &&
1695 so->so_send_filt_thread == current_thread()) {
1696 /*
1697 * We're being called recursively from a filter,
1698 * allow this to continue. Radar 4150520.
1699 * Don't set sblocked because we don't want
1700 * to perform an unlock later.
1701 */
1702 assumelock = 1;
1703 } else {
1704 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1705 if (error) {
1706 if (so->so_flags & SOF_DEFUNCT)
1707 goto defunct;
1708 return (error);
1709 }
1710 *sblocked = 1;
1711 }
1712 }
1713
1714 /*
1715 * If a send attempt is made on a socket that has been marked
1716 * as inactive (disconnected), reject the request.
1717 */
1718 if (so->so_flags & SOF_DEFUNCT) {
1719 defunct:
1720 error = EPIPE;
1721 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1722 __func__, proc_selfpid(),
1723 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1724 SOCK_DOM(so), SOCK_TYPE(so), error));
1725 return (error);
1726 }
1727
1728 if (so->so_state & SS_CANTSENDMORE) {
1729 #if CONTENT_FILTER
1730 /*
 1731 		 * Can re-inject data of half-closed connections
1732 */
1733 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1734 so->so_snd.sb_cfil_thread == current_thread() &&
1735 cfil_sock_data_pending(&so->so_snd) != 0)
1736 CFIL_LOG(LOG_INFO,
1737 "so %llx ignore SS_CANTSENDMORE",
1738 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1739 else
1740 #endif /* CONTENT_FILTER */
1741 return (EPIPE);
1742 }
1743 if (so->so_error) {
1744 error = so->so_error;
1745 so->so_error = 0;
1746 return (error);
1747 }
1748
1749 if ((so->so_state & SS_ISCONNECTED) == 0) {
1750 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1751 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1752 (resid != 0 || clen == 0) &&
1753 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1754 #if MPTCP
1755 /*
1756 * MPTCP Fast Join sends data before the
1757 * socket is truly connected.
1758 */
1759 if ((so->so_flags & (SOF_MP_SUBFLOW |
1760 SOF_MPTCP_FASTJOIN)) !=
1761 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1762 #endif /* MPTCP */
1763 return (ENOTCONN);
1764 }
1765 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1766 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1767 ENOTCONN : EDESTADDRREQ);
1768 }
1769 }
1770
1771 if (so->so_flags & SOF_ENABLE_MSGS)
1772 space = msgq_sbspace(so, control);
1773 else
1774 space = sbspace(&so->so_snd);
1775
1776 if (flags & MSG_OOB)
1777 space += 1024;
1778 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1779 clen > so->so_snd.sb_hiwat)
1780 return (EMSGSIZE);
1781
1782 if ((space < resid + clen &&
1783 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1784 space < clen)) ||
1785 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1786 /*
1787 * don't block the connectx call when there's more data
1788 * than can be copied.
1789 */
1790 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1791 if (space == 0) {
1792 return (EWOULDBLOCK);
1793 }
1794 if (space < (int32_t)so->so_snd.sb_lowat) {
1795 return (0);
1796 }
1797 }
1798 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1799 assumelock) {
1800 return (EWOULDBLOCK);
1801 }
1802 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1803 *sblocked = 0;
1804 error = sbwait(&so->so_snd);
1805 if (error) {
1806 if (so->so_flags & SOF_DEFUNCT)
1807 goto defunct;
1808 return (error);
1809 }
1810 goto restart;
1811 }
1812 return (0);
1813 }
1814
1815 /*
1816 * Send on a socket.
1817 * If send must go all at once and message is larger than
1818 * send buffering, then hard error.
1819 * Lock against other senders.
1820 * If must go all at once and not enough room now, then
1821 * inform user that this would block and do nothing.
1822 * Otherwise, if nonblocking, send as much as possible.
1823 * The data to be sent is described by "uio" if nonzero,
1824 * otherwise by the mbuf chain "top" (which must be null
1825 * if uio is not). Data provided in mbuf chain must be small
1826 * enough to send all at once.
1827 *
1828 * Returns nonzero on error, timeout or signal; callers
1829 * must check for short counts if EINTR/ERESTART are returned.
1830 * Data and control buffers are freed on return.
1831 * Experiment:
1832 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1833 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1834 * point at the mbuf chain being constructed and go from there.
1835 *
1836 * Returns: 0 Success
1837 * EOPNOTSUPP
1838 * EINVAL
1839 * ENOBUFS
1840 * uiomove:EFAULT
1841 * sosendcheck:EPIPE
1842 * sosendcheck:EWOULDBLOCK
1843 * sosendcheck:EINTR
1844 * sosendcheck:EBADF
1845 * sosendcheck:EINTR
1846 * sosendcheck:??? [value from so_error]
1847 * <pru_send>:ECONNRESET[TCP]
1848 * <pru_send>:EINVAL[TCP]
1849 * <pru_send>:ENOBUFS[TCP]
1850 * <pru_send>:EADDRINUSE[TCP]
1851 * <pru_send>:EADDRNOTAVAIL[TCP]
1852 * <pru_send>:EAFNOSUPPORT[TCP]
1853 * <pru_send>:EACCES[TCP]
1854 * <pru_send>:EAGAIN[TCP]
1855 * <pru_send>:EPERM[TCP]
1856 * <pru_send>:EMSGSIZE[TCP]
1857 * <pru_send>:EHOSTUNREACH[TCP]
1858 * <pru_send>:ENETUNREACH[TCP]
1859 * <pru_send>:ENETDOWN[TCP]
1860 * <pru_send>:ENOMEM[TCP]
1861 * <pru_send>:ENOBUFS[TCP]
1862 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1863 * <pru_send>:EINVAL[AF_UNIX]
1864 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1865 * <pru_send>:EPIPE[AF_UNIX]
1866 * <pru_send>:ENOTCONN[AF_UNIX]
1867 * <pru_send>:EISCONN[AF_UNIX]
1868 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1869 * <sf_data_out>:??? [whatever a filter author chooses]
1870 *
1871 * Notes: Other <pru_send> returns depend on the protocol family; all
1872 * <sf_data_out> returns depend on what the filter author causes
1873 * their filter to return.
1874 */
1875 int
1876 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1877 struct mbuf *top, struct mbuf *control, int flags)
1878 {
1879 struct mbuf **mp;
1880 struct mbuf *m, *freelist = NULL;
1881 user_ssize_t space, len, resid, orig_resid;
1882 int clen = 0, error, dontroute, mlen, sendflags;
1883 int atomic = sosendallatonce(so) || top;
1884 int sblocked = 0;
1885 struct proc *p = current_proc();
1886 struct mbuf *control_copy = NULL;
1887 uint16_t headroom = 0;
1888 boolean_t en_tracing = FALSE;
1889
1890 if (uio != NULL)
1891 resid = uio_resid(uio);
1892 else
1893 resid = top->m_pkthdr.len;
1894
1895 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1896 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1897
1898 socket_lock(so, 1);
1899
1900 /*
1901 * Trace if tracing is enabled, and only for network (vs. unix)
1902 * sockets on non-loopback interfaces.
1903 */
1904 if (ENTR_SHOULDTRACE &&
1905 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1906 struct inpcb *inp = sotoinpcb(so);
1907 if (inp->inp_last_outifp != NULL &&
1908 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1909 en_tracing = TRUE;
1910 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1911 VM_KERNEL_ADDRPERM(so),
1912 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1913 (int64_t)resid);
1914 orig_resid = resid;
1915 }
1916 }
1917
1918 /*
1919 * Re-injection should not affect process accounting
1920 */
1921 if ((flags & MSG_SKIPCFIL) == 0) {
1922 so_update_last_owner_locked(so, p);
1923 so_update_policy(so);
1924
1925 #if NECP
1926 so_update_necp_policy(so, NULL, addr);
1927 #endif /* NECP */
1928 }
1929
1930 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1931 error = EOPNOTSUPP;
1932 socket_unlock(so, 1);
1933 goto out;
1934 }
1935
1936 /*
1937 * In theory resid should be unsigned.
1938 * However, space must be signed, as it might be less than 0
1939 * if we over-committed, and we must use a signed comparison
1940 * of space and resid. On the other hand, a negative resid
1941 * causes us to loop sending 0-length segments to the protocol.
1942 *
1943 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1944 * But it will be used by sockets doing message delivery.
1945 *
1946 * Note: We limit resid to be a positive int value as we use
1947 * imin() to set bytes_to_copy -- radr://14558484
1948 */
1949 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1950 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1951 error = EINVAL;
1952 socket_unlock(so, 1);
1953 goto out;
1954 }
1955
1956 dontroute = (flags & MSG_DONTROUTE) &&
1957 (so->so_options & SO_DONTROUTE) == 0 &&
1958 (so->so_proto->pr_flags & PR_ATOMIC);
1959 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1960
1961 if (control != NULL)
1962 clen = control->m_len;
1963
1964 if (soreserveheadroom != 0)
1965 headroom = so->so_pktheadroom;
1966
1967 do {
1968 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1969 &sblocked, control);
1970 if (error)
1971 goto release;
1972
1973 mp = &top;
1974 if (so->so_flags & SOF_ENABLE_MSGS)
1975 space = msgq_sbspace(so, control);
1976 else
1977 space = sbspace(&so->so_snd) - clen;
1978 space += ((flags & MSG_OOB) ? 1024 : 0);
1979
1980 do {
1981 if (uio == NULL) {
1982 /*
1983 * Data is prepackaged in "top".
1984 */
1985 resid = 0;
1986 if (flags & MSG_EOR)
1987 top->m_flags |= M_EOR;
1988 } else {
1989 int chainlength;
1990 int bytes_to_copy;
1991 boolean_t jumbocl;
1992 boolean_t bigcl;
1993 int bytes_to_alloc;
1994
1995 bytes_to_copy = imin(resid, space);
1996
1997 bytes_to_alloc = bytes_to_copy;
1998 if (top == NULL)
1999 bytes_to_alloc += headroom;
2000
2001 if (sosendminchain > 0)
2002 chainlength = 0;
2003 else
2004 chainlength = sosendmaxchain;
2005
2006 /*
2007 * Use big 4 KB clusters when the outgoing interface
2008 * does not prefer 2 KB clusters.
2009 */
2010 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2011 sosendbigcl_ignore_capab;
2012
2013 /*
2014 * Attempt to use larger than system page-size
2015 * clusters for large writes only if there is
2016 * a jumbo cluster pool and if the socket is
2017 * marked accordingly.
2018 */
2019 jumbocl = sosendjcl && njcl > 0 &&
2020 ((so->so_flags & SOF_MULTIPAGES) ||
2021 sosendjcl_ignore_capab) &&
2022 bigcl;
2023
2024 socket_unlock(so, 0);
2025
2026 do {
2027 int num_needed;
2028 int hdrs_needed = (top == NULL) ? 1 : 0;
2029
2030 /*
2031 * Try to maintain a local cache of the
2032 * mbuf clusters needed to complete this
2033 * write; the list is further limited to
2034 * the number currently needed to fill
2035 * the socket. This mechanism allows a
2036 * large number of mbufs/clusters to be
2037 * grabbed under a single mbuf lock.
2038 * If we can't get any clusters, then
2039 * fall back to trying for mbufs. If we
2040 * fail early (or miscalculate the
2041 * number needed), make sure to release
2042 * any clusters we haven't yet
2043 * consumed.
2044 */
2045 if (freelist == NULL &&
2046 bytes_to_alloc > MBIGCLBYTES &&
2047 jumbocl) {
2048 num_needed =
2049 bytes_to_alloc / M16KCLBYTES;
2050
2051 if ((bytes_to_alloc -
2052 (num_needed * M16KCLBYTES))
2053 >= MINCLSIZE)
2054 num_needed++;
2055
2056 freelist =
2057 m_getpackets_internal(
2058 (unsigned int *)&num_needed,
2059 hdrs_needed, M_WAIT, 0,
2060 M16KCLBYTES);
2061 /*
2062 * Fall back to 4K cluster size
2063 * if allocation failed
2064 */
2065 }
2066
2067 if (freelist == NULL &&
2068 bytes_to_alloc > MCLBYTES &&
2069 bigcl) {
2070 num_needed =
2071 bytes_to_alloc / MBIGCLBYTES;
2072
2073 if ((bytes_to_alloc -
2074 (num_needed * MBIGCLBYTES)) >=
2075 MINCLSIZE)
2076 num_needed++;
2077
2078 freelist =
2079 m_getpackets_internal(
2080 (unsigned int *)&num_needed,
2081 hdrs_needed, M_WAIT, 0,
2082 MBIGCLBYTES);
2083 /*
2084 * Fall back to cluster size
2085 * if allocation failed
2086 */
2087 }
2088
2089 /*
2090 * Allocate a cluster as we want to
2091 * avoid splitting the data into more
2092 * than one segment; using MINCLSIZE
2093 * would lead us to allocate two mbufs.
2094 */
2095 if (soreserveheadroom != 0 &&
2096 freelist == NULL &&
2097 ((top == NULL &&
2098 bytes_to_alloc > _MHLEN) ||
2099 bytes_to_alloc > _MLEN)) {
2100 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2101 MCLBYTES;
2102 freelist =
2103 m_getpackets_internal(
2104 (unsigned int *)&num_needed,
2105 hdrs_needed, M_WAIT, 0,
2106 MCLBYTES);
2107 /*
2108 * Fall back to a single mbuf
2109 * if allocation failed
2110 */
2111 } else if (freelist == NULL &&
2112 bytes_to_alloc > MINCLSIZE) {
2113 num_needed =
2114 bytes_to_alloc / MCLBYTES;
2115
2116 if ((bytes_to_alloc -
2117 (num_needed * MCLBYTES)) >=
2118 MINCLSIZE)
2119 num_needed++;
2120
2121 freelist =
2122 m_getpackets_internal(
2123 (unsigned int *)&num_needed,
2124 hdrs_needed, M_WAIT, 0,
2125 MCLBYTES);
2126 /*
2127 * Fall back to a single mbuf
2128 * if allocation failed
2129 */
2130 }
2131 /*
2132 * For datagram protocols, leave
2133 * headroom for protocol headers
2134 * in the first cluster of the chain
2135 */
2136 if (freelist != NULL && atomic &&
2137 top == NULL && headroom > 0) {
2138 freelist->m_data += headroom;
2139 }
2140
2141 /*
2142 * Fall back to regular mbufs without
2143 * reserving the socket headroom
2144 */
2145 if (freelist == NULL) {
2146 if (top == NULL)
2147 MGETHDR(freelist,
2148 M_WAIT, MT_DATA);
2149 else
2150 MGET(freelist,
2151 M_WAIT, MT_DATA);
2152
2153 if (freelist == NULL) {
2154 error = ENOBUFS;
2155 socket_lock(so, 0);
2156 goto release;
2157 }
2158 /*
2159 * For datagram protocols,
2160 * leave room for protocol
2161 * headers in first mbuf.
2162 */
2163 if (atomic && top == NULL &&
2164 bytes_to_copy < MHLEN) {
2165 MH_ALIGN(freelist,
2166 bytes_to_copy);
2167 }
2168 }
2169 m = freelist;
2170 freelist = m->m_next;
2171 m->m_next = NULL;
2172
2173 if ((m->m_flags & M_EXT))
2174 mlen = m->m_ext.ext_size -
2175 m_leadingspace(m);
2176 else if ((m->m_flags & M_PKTHDR))
2177 mlen =
2178 MHLEN - m_leadingspace(m);
2179 else
2180 mlen = MLEN - m_leadingspace(m);
2181 len = imin(mlen, bytes_to_copy);
2182
2183 chainlength += len;
2184
2185 space -= len;
2186
2187 error = uiomove(mtod(m, caddr_t),
2188 len, uio);
2189
2190 resid = uio_resid(uio);
2191
2192 m->m_len = len;
2193 *mp = m;
2194 top->m_pkthdr.len += len;
2195 if (error)
2196 break;
2197 mp = &m->m_next;
2198 if (resid <= 0) {
2199 if (flags & MSG_EOR)
2200 top->m_flags |= M_EOR;
2201 break;
2202 }
2203 bytes_to_copy = min(resid, space);
2204
2205 } while (space > 0 &&
2206 (chainlength < sosendmaxchain || atomic ||
2207 resid < MINCLSIZE));
2208
2209 socket_lock(so, 0);
2210
2211 if (error)
2212 goto release;
2213 }
2214
2215 if (flags & (MSG_HOLD|MSG_SEND)) {
2216 /* Enqueue for later, go away if HOLD */
2217 struct mbuf *mb1;
2218 if (so->so_temp && (flags & MSG_FLUSH)) {
2219 m_freem(so->so_temp);
2220 so->so_temp = NULL;
2221 }
2222 if (so->so_temp)
2223 so->so_tail->m_next = top;
2224 else
2225 so->so_temp = top;
2226 mb1 = top;
2227 while (mb1->m_next)
2228 mb1 = mb1->m_next;
2229 so->so_tail = mb1;
2230 if (flags & MSG_HOLD) {
2231 top = NULL;
2232 goto release;
2233 }
2234 top = so->so_temp;
2235 }
2236 if (dontroute)
2237 so->so_options |= SO_DONTROUTE;
2238
2239 /*
2240 * Compute flags here, for pru_send and NKEs.
2241 *
2242 * If the user set MSG_EOF, the protocol
2243 * understands this flag, and there is nothing left
2244 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2245 */
2246 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2247 ((flags & MSG_EOF) &&
2248 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2249 (resid <= 0)) ? PRUS_EOF :
2250 /* If there is more to send set PRUS_MORETOCOME */
2251 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2252
2253 if ((flags & MSG_SKIPCFIL) == 0) {
2254 /*
2255 * Socket filter processing
2256 */
2257 error = sflt_data_out(so, addr, &top,
2258 &control, (sendflags & MSG_OOB) ?
2259 sock_data_filt_flag_oob : 0);
2260 if (error) {
2261 if (error == EJUSTRETURN) {
2262 error = 0;
2263 clen = 0;
2264 control = NULL;
2265 top = NULL;
2266 }
2267 goto release;
2268 }
2269 #if CONTENT_FILTER
2270 /*
2271 * Content filter processing
2272 */
2273 error = cfil_sock_data_out(so, addr, top,
2274 control, (sendflags & MSG_OOB) ?
2275 sock_data_filt_flag_oob : 0);
2276 if (error) {
2277 if (error == EJUSTRETURN) {
2278 error = 0;
2279 clen = 0;
2280 control = NULL;
2281 top = NULL;
2282 }
2283 goto release;
2284 }
2285 #endif /* CONTENT_FILTER */
2286 }
2287 if (so->so_flags & SOF_ENABLE_MSGS) {
2288 /*
2289 * Make a copy of control mbuf,
2290 * so that msg priority can be
2291 * passed to subsequent mbufs.
2292 */
2293 control_copy = m_dup(control, M_NOWAIT);
2294 }
2295 error = (*so->so_proto->pr_usrreqs->pru_send)
2296 (so, sendflags, top, addr, control, p);
2297
2298 if (flags & MSG_SEND)
2299 so->so_temp = NULL;
2300
2301 if (dontroute)
2302 so->so_options &= ~SO_DONTROUTE;
2303
2304 clen = 0;
2305 control = control_copy;
2306 control_copy = NULL;
2307 top = NULL;
2308 mp = &top;
2309 if (error)
2310 goto release;
2311 } while (resid && space > 0);
2312 } while (resid);
2313
2314 release:
2315 if (sblocked)
2316 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2317 else
2318 socket_unlock(so, 1);
2319 out:
2320 if (top != NULL)
2321 m_freem(top);
2322 if (control != NULL)
2323 m_freem(control);
2324 if (freelist != NULL)
2325 m_freem_list(freelist);
2326 if (control_copy != NULL)
2327 m_freem(control_copy);
2328
2329 /*
2330 * One write has been done. This was enough. Get back to "normal"
2331 * behavior.
2332 */
2333 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2334 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2335
2336 if (en_tracing) {
2337 /* resid passed here is the bytes left in uio */
2338 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2339 VM_KERNEL_ADDRPERM(so),
2340 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2341 (int64_t)(orig_resid - resid));
2342 }
2343 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2344 so->so_snd.sb_cc, space, error);
2345
2346 return (error);
2347 }
2348
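/*
 * A hedged userspace sketch of two behaviors documented in the header
 * comment of sosend(): MSG_OOB takes the PRUS_OOB branch, and an atomic
 * (datagram) send larger than the send buffer high-water mark fails with
 * EMSGSIZE in sosendcheck(). The descriptors and the buffer size are
 * illustrative assumptions: tcp_fd is a connected SOCK_STREAM socket and
 * udp_fd a connected SOCK_DGRAM socket.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void
sosend_contract_demo(int tcp_fd, int udp_fd)
{
	static char big[64 * 1024];

	/* One out-of-band byte on the stream socket. */
	if (send(tcp_fd, "!", 1, MSG_OOB) < 0)
		perror("send MSG_OOB");

	/* A datagram exceeding sb_hiwat is rejected up front. */
	memset(big, 'x', sizeof (big));
	if (send(udp_fd, big, sizeof (big), 0) < 0 && errno == EMSGSIZE)
		printf("oversized datagram rejected with EMSGSIZE\n");
}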
2349 /*
2350 * Supported only for connected sockets (no address) without ancillary
2351 * data (control mbuf), and only for atomic protocols.
2352 */
2353 int
2354 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2355 {
2356 struct mbuf *m, *freelist = NULL;
2357 user_ssize_t len, resid;
2358 int error, dontroute, mlen;
2359 int atomic = sosendallatonce(so);
2360 int sblocked = 0;
2361 struct proc *p = current_proc();
2362 u_int uiofirst = 0;
2363 u_int uiolast = 0;
2364 struct mbuf *top = NULL;
2365 uint16_t headroom = 0;
2366 boolean_t bigcl;
2367
2368 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2369 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2370
2371 if (so->so_type != SOCK_DGRAM) {
2372 error = EINVAL;
2373 goto out;
2374 }
2375 if (atomic == 0) {
2376 error = EINVAL;
2377 goto out;
2378 }
2379 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2380 error = EPROTONOSUPPORT;
2381 goto out;
2382 }
2383 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2384 error = EINVAL;
2385 goto out;
2386 }
2387 resid = uio_array_resid(uioarray, uiocnt);
2388
2389 /*
2390 * In theory resid should be unsigned.
2391 * However, space must be signed, as it might be less than 0
2392 * if we over-committed, and we must use a signed comparison
2393 * of space and resid. On the other hand, a negative resid
2394 * causes us to loop sending 0-length segments to the protocol.
2395 *
2396 * Note: We limit resid to be a positive int value as we use
2397 * imin() to set bytes_to_copy -- radr://14558484
2398 */
2399 if (resid < 0 || resid > INT_MAX) {
2400 error = EINVAL;
2401 goto out;
2402 }
2403
2404 socket_lock(so, 1);
2405 so_update_last_owner_locked(so, p);
2406 so_update_policy(so);
2407
2408 #if NECP
2409 so_update_necp_policy(so, NULL, NULL);
2410 #endif /* NECP */
2411
2412 dontroute = (flags & MSG_DONTROUTE) &&
2413 (so->so_options & SO_DONTROUTE) == 0 &&
2414 (so->so_proto->pr_flags & PR_ATOMIC);
2415 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2416
2417 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2418 &sblocked, NULL);
2419 if (error)
2420 goto release;
2421
2422 /*
2423 * Use big 4 KB clusters when the outgoing interface does not prefer
2424 * 2 KB clusters
2425 */
2426 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2427
2428 if (soreserveheadroom != 0)
2429 headroom = so->so_pktheadroom;
2430
2431 do {
2432 int i;
2433 int num_needed = 0;
2434 int chainlength;
2435 size_t maxpktlen = 0;
2436 int bytes_to_alloc;
2437
2438 if (sosendminchain > 0)
2439 chainlength = 0;
2440 else
2441 chainlength = sosendmaxchain;
2442
2443 socket_unlock(so, 0);
2444
2445 /*
2446 * Find a set of uio that fit in a reasonable number
2447 * of mbuf packets
2448 */
2449 for (i = uiofirst; i < uiocnt; i++) {
2450 struct uio *auio = uioarray[i];
2451
2452 len = uio_resid(auio);
2453
2454 /* Do nothing for empty messages */
2455 if (len == 0)
2456 continue;
2457
2458 num_needed += 1;
2459 uiolast += 1;
2460
2461 if (len > maxpktlen)
2462 maxpktlen = len;
2463
2464 chainlength += len;
2465 if (chainlength > sosendmaxchain)
2466 break;
2467 }
2468 /*
2469 * Nothing left to send
2470 */
2471 if (num_needed == 0) {
2472 socket_lock(so, 0);
2473 break;
2474 }
2475 /*
2476 * Allocate a buffer large enough to include headroom space
2477 * for the network and link headers.
2478 *
2479 */
2480 bytes_to_alloc = maxpktlen + headroom;
2481
2482 /*
2483 * Allocate a single contiguous buffer of the smallest available
2484 * size when possible
2485 */
2486 if (bytes_to_alloc > MCLBYTES &&
2487 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2488 freelist = m_getpackets_internal(
2489 (unsigned int *)&num_needed,
2490 num_needed, M_WAIT, 1,
2491 MBIGCLBYTES);
2492 } else if (bytes_to_alloc > _MHLEN &&
2493 bytes_to_alloc <= MCLBYTES) {
2494 freelist = m_getpackets_internal(
2495 (unsigned int *)&num_needed,
2496 num_needed, M_WAIT, 1,
2497 MCLBYTES);
2498 } else {
2499 freelist = m_allocpacket_internal(
2500 (unsigned int *)&num_needed,
2501 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2502 }
2503
2504 if (freelist == NULL) {
2505 socket_lock(so, 0);
2506 error = ENOMEM;
2507 goto release;
2508 }
2509 /*
2510 * Copy each uio of the set into its own mbuf packet
2511 */
2512 for (i = uiofirst, m = freelist;
2513 i < uiolast && m != NULL;
2514 i++) {
2515 int bytes_to_copy;
2516 struct mbuf *n;
2517 struct uio *auio = uioarray[i];
2518
2519 bytes_to_copy = uio_resid(auio);
2520
2521 /* Do nothing for empty messages */
2522 if (bytes_to_copy == 0)
2523 continue;
2524 /*
2525 * Leave headroom for protocol headers
2526 * in the first mbuf of the chain
2527 */
2528 m->m_data += headroom;
2529
2530 for (n = m; n != NULL; n = n->m_next) {
2531 if ((m->m_flags & M_EXT))
2532 mlen = m->m_ext.ext_size -
2533 m_leadingspace(m);
2534 else if ((m->m_flags & M_PKTHDR))
2535 mlen =
2536 MHLEN - m_leadingspace(m);
2537 else
2538 mlen = MLEN - m_leadingspace(m);
2539 len = imin(mlen, bytes_to_copy);
2540
2541 /*
2542 * Note: uiomove() decrements the iovec
2543 * length
2544 */
2545 error = uiomove(mtod(n, caddr_t),
2546 len, auio);
2547 if (error != 0)
2548 break;
2549 n->m_len = len;
2550 m->m_pkthdr.len += len;
2551
2552 VERIFY(m->m_pkthdr.len <= maxpktlen);
2553
2554 bytes_to_copy -= len;
2555 resid -= len;
2556 }
2557 if (m->m_pkthdr.len == 0) {
2558 printf(
2559 "%s:%d so %llx pkt %llx type %u len null\n",
2560 __func__, __LINE__,
2561 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2562 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2563 m->m_type);
2564 }
2565 if (error != 0)
2566 break;
2567 m = m->m_nextpkt;
2568 }
2569
2570 socket_lock(so, 0);
2571
2572 if (error)
2573 goto release;
2574 top = freelist;
2575 freelist = NULL;
2576
2577 if (dontroute)
2578 so->so_options |= SO_DONTROUTE;
2579
2580 if ((flags & MSG_SKIPCFIL) == 0) {
2581 struct mbuf **prevnextp = NULL;
2582
2583 for (i = uiofirst, m = top;
2584 i < uiolast && m != NULL;
2585 i++) {
2586 struct mbuf *nextpkt = m->m_nextpkt;
2587
2588 /*
2589 * Socket filter processing
2590 */
2591 error = sflt_data_out(so, NULL, &m,
2592 NULL, 0);
2593 if (error != 0 && error != EJUSTRETURN)
2594 goto release;
2595
2596 #if CONTENT_FILTER
2597 if (error == 0) {
2598 /*
2599 * Content filter processing
2600 */
2601 error = cfil_sock_data_out(so, NULL, m,
2602 NULL, 0);
2603 if (error != 0 && error != EJUSTRETURN)
2604 goto release;
2605 }
2606 #endif /* CONTENT_FILTER */
2607 /*
2608 * Remove packet from the list when
2609 * swallowed by a filter
2610 */
2611 if (error == EJUSTRETURN) {
2612 error = 0;
2613 if (prevnextp != NULL)
2614 *prevnextp = nextpkt;
2615 else
2616 top = nextpkt;
2617 }
2618
2619 m = nextpkt;
2620 if (m != NULL)
2621 prevnextp = &m->m_nextpkt;
2622 }
2623 }
2624 if (top != NULL)
2625 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2626 (so, 0, top, NULL, NULL, p);
2627
2628 if (dontroute)
2629 so->so_options &= ~SO_DONTROUTE;
2630
2631 top = NULL;
2632 uiofirst = uiolast;
2633 } while (resid > 0 && error == 0);
2634 release:
2635 if (sblocked)
2636 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2637 else
2638 socket_unlock(so, 1);
2639 out:
2640 if (top != NULL)
2641 m_freem(top);
2642 if (freelist != NULL)
2643 m_freem_list(freelist);
2644
2645 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2646 so->so_snd.sb_cc, 0, error);
2647
2648 return (error);
2649 }
2650
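/*
 * sosend_list() is driven by a batched send interface (on Darwin, the
 * private sendmsg_x() system call) so that several datagrams can be
 * queued to the protocol in one pass. As a rough application-level
 * approximation, each uio in the array corresponds to one independent
 * datagram, as in this hypothetical helper (the name and error handling
 * are assumptions, not the actual entry point):
 */
#include <sys/socket.h>
#include <sys/uio.h>

static int
send_datagram_batch(int fd, const struct iovec *iov, int cnt)
{
	int i;

	/* fd is assumed to be a connected SOCK_DGRAM socket. */
	for (i = 0; i < cnt; i++) {
		if (send(fd, iov[i].iov_base, iov[i].iov_len, 0) < 0)
			return (i > 0 ? i : -1);
	}
	return (i);
}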
2651 /*
2652 * May return ERESTART when packet is dropped by MAC policy check
2653 */
2654 static int
2655 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2656 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2657 {
2658 int error = 0;
2659 struct mbuf *m = *mp;
2660 struct mbuf *nextrecord = *nextrecordp;
2661
2662 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2663 #if CONFIG_MACF_SOCKET_SUBSET
2664 /*
2665 * Call the MAC framework for policy checking if we're in
2666 * the user process context and the socket isn't connected.
2667 */
2668 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2669 struct mbuf *m0 = m;
2670 /*
2671 * Dequeue this record (temporarily) from the receive
2672 * list since we're about to drop the socket's lock
2673 * where a new record may arrive and be appended to
2674 * the list. Upon MAC policy failure, the record
2675 * will be freed. Otherwise, we'll add it back to
2676 * the head of the list. We cannot rely on SB_LOCK
2677 * because append operation uses the socket's lock.
2678 */
2679 do {
2680 m->m_nextpkt = NULL;
2681 sbfree(&so->so_rcv, m);
2682 m = m->m_next;
2683 } while (m != NULL);
2684 m = m0;
2685 so->so_rcv.sb_mb = nextrecord;
2686 SB_EMPTY_FIXUP(&so->so_rcv);
2687 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2688 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2689 socket_unlock(so, 0);
2690
2691 if (mac_socket_check_received(proc_ucred(p), so,
2692 mtod(m, struct sockaddr *)) != 0) {
2693 /*
2694 * MAC policy failure; free this record and
2695 * process the next record (or block until
2696 * one is available). We have adjusted sb_cc
2697 * and sb_mbcnt above so there is no need to
2698 * call sbfree() again.
2699 */
2700 m_freem(m);
2701 /*
2702 * Clear SB_LOCK but don't unlock the socket.
2703 * Process the next record or wait for one.
2704 */
2705 socket_lock(so, 0);
2706 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2707 error = ERESTART;
2708 goto done;
2709 }
2710 socket_lock(so, 0);
2711 /*
2712 * If the socket has been defunct'd, drop it.
2713 */
2714 if (so->so_flags & SOF_DEFUNCT) {
2715 m_freem(m);
2716 error = ENOTCONN;
2717 goto done;
2718 }
2719 /*
2720 * Re-adjust the socket receive list and re-enqueue
2721 * the record in front of any packets which may have
2722 * been appended while we dropped the lock.
2723 */
2724 for (m = m0; m->m_next != NULL; m = m->m_next)
2725 sballoc(&so->so_rcv, m);
2726 sballoc(&so->so_rcv, m);
2727 if (so->so_rcv.sb_mb == NULL) {
2728 so->so_rcv.sb_lastrecord = m0;
2729 so->so_rcv.sb_mbtail = m;
2730 }
2731 m = m0;
2732 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2733 so->so_rcv.sb_mb = m;
2734 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2735 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2736 }
2737 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2738 if (psa != NULL) {
2739 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2740 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2741 error = EWOULDBLOCK;
2742 goto done;
2743 }
2744 }
2745 if (flags & MSG_PEEK) {
2746 m = m->m_next;
2747 } else {
2748 sbfree(&so->so_rcv, m);
2749 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2750 panic("%s: about to create invalid socketbuf",
2751 __func__);
2752 /* NOTREACHED */
2753 }
2754 MFREE(m, so->so_rcv.sb_mb);
2755 m = so->so_rcv.sb_mb;
2756 if (m != NULL) {
2757 m->m_nextpkt = nextrecord;
2758 } else {
2759 so->so_rcv.sb_mb = nextrecord;
2760 SB_EMPTY_FIXUP(&so->so_rcv);
2761 }
2762 }
2763 done:
2764 *mp = m;
2765 *nextrecordp = nextrecord;
2766
2767 return (error);
2768 }
2769
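/*
 * The MT_SONAME mbuf consumed by soreceive_addr() above is what a
 * datagram receiver sees as the source address. A minimal userspace
 * sketch (fd is assumed to be a bound SOCK_DGRAM socket; the helper
 * name is illustrative):
 */
#include <stdio.h>
#include <sys/socket.h>

static void
recv_with_source(int fd)
{
	char buf[2048];
	struct sockaddr_storage from;
	socklen_t fromlen = sizeof (from);

	ssize_t n = recvfrom(fd, buf, sizeof (buf), 0,
	    (struct sockaddr *)&from, &fromlen);
	if (n >= 0)
		printf("got %zd bytes, address family %d\n", n, from.ss_family);
}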
2770 /*
2771 * Process one or more MT_CONTROL mbufs present before any data mbufs
2772 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2773 * just copy the data; if !MSG_PEEK, we call into the protocol to
2774 * perform externalization.
2775 */
2776 static int
2777 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2778 struct mbuf **mp, struct mbuf **nextrecordp)
2779 {
2780 int error = 0;
2781 struct mbuf *cm = NULL, *cmn;
2782 struct mbuf **cme = &cm;
2783 struct sockbuf *sb_rcv = &so->so_rcv;
2784 struct mbuf **msgpcm = NULL;
2785 struct mbuf *m = *mp;
2786 struct mbuf *nextrecord = *nextrecordp;
2787 struct protosw *pr = so->so_proto;
2788
2789 /*
2790 * Externalizing the control messages would require us to
2791 * drop the socket's lock below. Once we re-acquire the
2792 * lock, the mbuf chain might change. In order to preserve
2793 * consistency, we unlink all control messages from the
2794 * first mbuf chain in one shot and link them separately
2795 * onto a different chain.
2796 */
2797 do {
2798 if (flags & MSG_PEEK) {
2799 if (controlp != NULL) {
2800 if (*controlp == NULL) {
2801 msgpcm = controlp;
2802 }
2803 *controlp = m_copy(m, 0, m->m_len);
2804
2805 /*
2806 * If we failed to allocate an mbuf,
2807 * release any previously allocated
2808 * mbufs for control data, and return
2809 * an error. Keep the original mbufs
2810 * in the socket since the MSG_PEEK
2811 * flag is set.
2812 */
2813 if (*controlp == NULL) {
2814 m_freem(*msgpcm);
2815 error = ENOBUFS;
2816 goto done;
2817 }
2818 controlp = &(*controlp)->m_next;
2819 }
2820 m = m->m_next;
2821 } else {
2822 m->m_nextpkt = NULL;
2823 sbfree(sb_rcv, m);
2824 sb_rcv->sb_mb = m->m_next;
2825 m->m_next = NULL;
2826 *cme = m;
2827 cme = &(*cme)->m_next;
2828 m = sb_rcv->sb_mb;
2829 }
2830 } while (m != NULL && m->m_type == MT_CONTROL);
2831
2832 if (!(flags & MSG_PEEK)) {
2833 if (sb_rcv->sb_mb != NULL) {
2834 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2835 } else {
2836 sb_rcv->sb_mb = nextrecord;
2837 SB_EMPTY_FIXUP(sb_rcv);
2838 }
2839 if (nextrecord == NULL)
2840 sb_rcv->sb_lastrecord = m;
2841 }
2842
2843 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2844 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2845
2846 while (cm != NULL) {
2847 int cmsg_type;
2848
2849 cmn = cm->m_next;
2850 cm->m_next = NULL;
2851 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2852
2853 /*
2854 * Call the protocol to externalize SCM_RIGHTS message
2855 * and return the modified message to the caller upon
2856 * success. Otherwise, all other control messages are
2857 * returned unmodified to the caller. Note that we
2858 * only get into this loop if MSG_PEEK is not set.
2859 */
2860 if (pr->pr_domain->dom_externalize != NULL &&
2861 cmsg_type == SCM_RIGHTS) {
2862 /*
2863 * Release socket lock: see 3903171. This
2864 * would also allow more records to be appended
2865 * to the socket buffer. We still have SB_LOCK
2866 * set on it, so we can be sure that the head
2867 * of the mbuf chain won't change.
2868 */
2869 socket_unlock(so, 0);
2870 error = (*pr->pr_domain->dom_externalize)(cm);
2871 socket_lock(so, 0);
2872 } else {
2873 error = 0;
2874 }
2875
2876 if (controlp != NULL && error == 0) {
2877 *controlp = cm;
2878 controlp = &(*controlp)->m_next;
2879 } else {
2880 (void) m_free(cm);
2881 }
2882 cm = cmn;
2883 }
2884 /*
2885 * Update the value of nextrecord in case we received new
2886 * records when the socket was unlocked above for
2887 * externalizing SCM_RIGHTS.
2888 */
2889 if (m != NULL)
2890 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2891 else
2892 nextrecord = sb_rcv->sb_mb;
2893
2894 done:
2895 *mp = m;
2896 *nextrecordp = nextrecord;
2897
2898 return (error);
2899 }
2900
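/*
 * The SCM_RIGHTS case handled by soreceive_ctl() above is what lets a
 * process receive a file descriptor over an AF_UNIX socket. A minimal
 * userspace sketch of the receiving side (sock is assumed to be a
 * connected AF_UNIX stream socket; the helper name is illustrative):
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int
recv_fd(int sock)
{
	char data;
	char cbuf[CMSG_SPACE(sizeof (int))];
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int fd = -1;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(sock, &msg, 0) < 0)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cmsg), sizeof (fd));
			break;
		}
	}
	return (fd);
}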
2901 /*
2902 * Implement receive operations on a socket.
2903 * We depend on the way that records are added to the sockbuf
2904 * by sbappend*. In particular, each record (mbufs linked through m_next)
2905 * must begin with an address if the protocol so specifies,
2906 * followed by an optional mbuf or mbufs containing ancillary data,
2907 * and then zero or more mbufs of data.
2908 * In order to avoid blocking network interrupts for the entire time here,
2909 * we splx() while doing the actual copy to user space.
2910 * Although the sockbuf is locked, new data may still be appended,
2911 * and thus we must maintain consistency of the sockbuf during that time.
2912 *
2913 * The caller may receive the data as a single mbuf chain by supplying
2914 * an mbuf **mp0 for use in returning the chain. The uio is then used
2915 * only for the count in uio_resid.
2916 *
2917 * Returns: 0 Success
2918 * ENOBUFS
2919 * ENOTCONN
2920 * EWOULDBLOCK
2921 * uiomove:EFAULT
2922 * sblock:EWOULDBLOCK
2923 * sblock:EINTR
2924 * sbwait:EBADF
2925 * sbwait:EINTR
2926 * sodelayed_copy:EFAULT
2927 * <pru_rcvoob>:EINVAL[TCP]
2928 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2929 * <pru_rcvoob>:???
2930 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2931 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2932 * <pr_domain->dom_externalize>:???
2933 *
2934 * Notes: Additional return values from calls through <pru_rcvoob> and
2935 * <pr_domain->dom_externalize> depend on protocols other than
2936 * TCP or AF_UNIX, which are documented above.
2937 */
2938 int
2939 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2940 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2941 {
2942 struct mbuf *m, **mp, *ml = NULL;
2943 struct mbuf *nextrecord, *free_list;
2944 int flags, error, offset;
2945 user_ssize_t len;
2946 struct protosw *pr = so->so_proto;
2947 int moff, type = 0;
2948 user_ssize_t orig_resid = uio_resid(uio);
2949 user_ssize_t delayed_copy_len;
2950 int can_delay;
2951 int need_event;
2952 struct proc *p = current_proc();
2953 boolean_t en_tracing = FALSE;
2954
2955 /*
2956 * Sanity check on the length passed by caller as we are making 'int'
2957 * comparisons
2958 */
2959 if (orig_resid < 0 || orig_resid > INT_MAX)
2960 return (EINVAL);
2961
2962 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
2963 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
2964 so->so_rcv.sb_hiwat);
2965
2966 socket_lock(so, 1);
2967 so_update_last_owner_locked(so, p);
2968 so_update_policy(so);
2969
2970 #ifdef MORE_LOCKING_DEBUG
2971 if (so->so_usecount == 1) {
2972 panic("%s: so=%x no other reference on socket\n", __func__, so);
2973 /* NOTREACHED */
2974 }
2975 #endif
2976 mp = mp0;
2977 if (psa != NULL)
2978 *psa = NULL;
2979 if (controlp != NULL)
2980 *controlp = NULL;
2981 if (flagsp != NULL)
2982 flags = *flagsp &~ MSG_EOR;
2983 else
2984 flags = 0;
2985
2986 /*
2987 * If a recv attempt is made on a previously-accepted socket
2988 * that has been marked as inactive (disconnected), reject
2989 * the request.
2990 */
2991 if (so->so_flags & SOF_DEFUNCT) {
2992 struct sockbuf *sb = &so->so_rcv;
2993
2994 error = ENOTCONN;
2995 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2996 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2997 SOCK_DOM(so), SOCK_TYPE(so), error));
2998 /*
2999 * This socket should have been disconnected and flushed
3000 * prior to being returned from sodefunct(); there should
3001 * be no data on its receive list, so panic otherwise.
3002 */
3003 if (so->so_state & SS_DEFUNCT)
3004 sb_empty_assert(sb, __func__);
3005 socket_unlock(so, 1);
3006 return (error);
3007 }
3008
3009 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3010 pr->pr_usrreqs->pru_preconnect) {
3011 /*
3012 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but never
3013 * call write() after that. *If* the app then calls read(), we do
3014 * not want to block that read indefinitely. Thus, we trigger a
3015 * connect so that the session gets initiated.
3016 */
3017 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3018
3019 if (error) {
3020 socket_unlock(so, 1);
3021 return (error);
3022 }
3023 }
3024
3025 if (ENTR_SHOULDTRACE &&
3026 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3027 /*
3028 * enable energy tracing for inet sockets that go over
3029 * non-loopback interfaces only.
3030 */
3031 struct inpcb *inp = sotoinpcb(so);
3032 if (inp->inp_last_outifp != NULL &&
3033 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3034 en_tracing = TRUE;
3035 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3036 VM_KERNEL_ADDRPERM(so),
3037 ((so->so_state & SS_NBIO) ?
3038 kEnTrFlagNonBlocking : 0),
3039 (int64_t)orig_resid);
3040 }
3041 }
3042
3043 /*
3044 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3045 * regardless of the flags argument. Here is the case where
3046 * out-of-band data is not inline.
3047 */
3048 if ((flags & MSG_OOB) ||
3049 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3050 (so->so_options & SO_OOBINLINE) == 0 &&
3051 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3052 m = m_get(M_WAIT, MT_DATA);
3053 if (m == NULL) {
3054 socket_unlock(so, 1);
3055 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3056 ENOBUFS, 0, 0, 0, 0);
3057 return (ENOBUFS);
3058 }
3059 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3060 if (error)
3061 goto bad;
3062 socket_unlock(so, 0);
3063 do {
3064 error = uiomove(mtod(m, caddr_t),
3065 imin(uio_resid(uio), m->m_len), uio);
3066 m = m_free(m);
3067 } while (uio_resid(uio) && error == 0 && m != NULL);
3068 socket_lock(so, 0);
3069 bad:
3070 if (m != NULL)
3071 m_freem(m);
3072
3073 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3074 if (error == EWOULDBLOCK || error == EINVAL) {
3075 /*
3076 * Let's try to get normal data:
3077 * EWOULDBLOCK: out-of-band data not
3078 * received yet. EINVAL: out-of-band data
3079 * already read.
3080 */
3081 error = 0;
3082 goto nooob;
3083 } else if (error == 0 && flagsp != NULL) {
3084 *flagsp |= MSG_OOB;
3085 }
3086 }
3087 socket_unlock(so, 1);
3088 if (en_tracing) {
3089 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3090 VM_KERNEL_ADDRPERM(so), 0,
3091 (int64_t)(orig_resid - uio_resid(uio)));
3092 }
3093 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3094 0, 0, 0, 0);
3095
3096 return (error);
3097 }
3098 nooob:
3099 if (mp != NULL)
3100 *mp = NULL;
3101
3102 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3103 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3104 }
3105
3106 free_list = NULL;
3107 delayed_copy_len = 0;
3108 restart:
3109 #ifdef MORE_LOCKING_DEBUG
3110 if (so->so_usecount <= 1)
3111 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3112 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3113 #endif
3114 /*
3115 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3116 * and if so just return to the caller. This could happen when
3117 * soreceive() is called by a socket upcall function during the
3118 * time the socket is freed. The socket buffer would have been
3119 * locked across the upcall, therefore we cannot put this thread
3120 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3121 * we may livelock), because the lock on the socket buffer will
3122 * only be released when the upcall routine returns to its caller.
3123 * Because the socket has been officially closed, there can be
3124 * no further read on it.
3125 *
3126 * A multipath subflow socket would have its SS_NOFDREF set by
3127 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3128 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3129 */
3130 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3131 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3132 socket_unlock(so, 1);
3133 return (0);
3134 }
3135
3136 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3137 if (error) {
3138 socket_unlock(so, 1);
3139 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3140 0, 0, 0, 0);
3141 if (en_tracing) {
3142 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3143 VM_KERNEL_ADDRPERM(so), 0,
3144 (int64_t)(orig_resid - uio_resid(uio)));
3145 }
3146 return (error);
3147 }
3148
3149 m = so->so_rcv.sb_mb;
3150 /*
3151 * If we have less data than requested, block awaiting more
3152 * (subject to any timeout) if:
3153 * 1. the current count is less than the low water mark, or
3154 * 2. MSG_WAITALL is set, and it is possible to do the entire
3155 * receive operation at once if we block (resid <= hiwat).
3156 * 3. MSG_DONTWAIT is not set
3157 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3158 * we have to do the receive in sections, and thus risk returning
3159 * a short count if a timeout or signal occurs after we start.
3160 */
3161 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3162 so->so_rcv.sb_cc < uio_resid(uio)) &&
3163 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3164 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3165 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3166 /*
3167 * Panic if we notice inconsistencies in the socket's
3168 * receive list; both sb_mb and sb_cc should correctly
3169 * reflect the contents of the list, otherwise we may
3170 * end up with false positives during select() or poll()
3171 * which could put the application in a bad state.
3172 */
3173 SB_MB_CHECK(&so->so_rcv);
3174
3175 if (so->so_error) {
3176 if (m != NULL)
3177 goto dontblock;
3178 error = so->so_error;
3179 if ((flags & MSG_PEEK) == 0)
3180 so->so_error = 0;
3181 goto release;
3182 }
3183 if (so->so_state & SS_CANTRCVMORE) {
3184 #if CONTENT_FILTER
3185 /*
3186 * Deal with half closed connections
3187 */
3188 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3189 cfil_sock_data_pending(&so->so_rcv) != 0)
3190 CFIL_LOG(LOG_INFO,
3191 "so %llx ignore SS_CANTRCVMORE",
3192 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3193 else
3194 #endif /* CONTENT_FILTER */
3195 if (m != NULL)
3196 goto dontblock;
3197 else
3198 goto release;
3199 }
3200 for (; m != NULL; m = m->m_next)
3201 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3202 m = so->so_rcv.sb_mb;
3203 goto dontblock;
3204 }
3205 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3206 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3207 error = ENOTCONN;
3208 goto release;
3209 }
3210 if (uio_resid(uio) == 0)
3211 goto release;
3212
3213 if ((so->so_state & SS_NBIO) ||
3214 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3215 error = EWOULDBLOCK;
3216 goto release;
3217 }
3218 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3219 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3220 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3221 #if EVEN_MORE_LOCKING_DEBUG
3222 if (socket_debug)
3223 printf("Waiting for socket data\n");
3224 #endif
3225
3226 error = sbwait(&so->so_rcv);
3227 #if EVEN_MORE_LOCKING_DEBUG
3228 if (socket_debug)
3229 printf("SORECEIVE - sbwait returned %d\n", error);
3230 #endif
3231 if (so->so_usecount < 1) {
3232 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3233 __func__, so, so->so_usecount);
3234 /* NOTREACHED */
3235 }
3236 if (error) {
3237 socket_unlock(so, 1);
3238 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3239 0, 0, 0, 0);
3240 if (en_tracing) {
3241 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3242 VM_KERNEL_ADDRPERM(so), 0,
3243 (int64_t)(orig_resid - uio_resid(uio)));
3244 }
3245 return (error);
3246 }
3247 goto restart;
3248 }
3249 dontblock:
3250 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3251 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3252 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3253 nextrecord = m->m_nextpkt;
3254
3255 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3256 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3257 mp0 == NULL);
3258 if (error == ERESTART)
3259 goto restart;
3260 else if (error != 0)
3261 goto release;
3262 orig_resid = 0;
3263 }
3264
3265 /*
3266 * Process one or more MT_CONTROL mbufs present before any data mbufs
3267 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3268 * just copy the data; if !MSG_PEEK, we call into the protocol to
3269 * perform externalization.
3270 */
3271 if (m != NULL && m->m_type == MT_CONTROL) {
3272 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3273 if (error != 0)
3274 goto release;
3275 orig_resid = 0;
3276 }
3277
3278 /*
3279 * If the socket is a TCP socket with message delivery
3280 * enabled, then create a control msg to deliver the
3281 * relative TCP sequence number for this data. Waiting
3282 * until this point will protect against failures to
3283 * allocate an mbuf for control msgs.
3284 */
3285 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3286 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3287 struct mbuf *seq_cm;
3288
3289 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3290 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3291 if (seq_cm == NULL) {
3292 /* unable to allocate a control mbuf */
3293 error = ENOBUFS;
3294 goto release;
3295 }
3296 *controlp = seq_cm;
3297 controlp = &seq_cm->m_next;
3298 }
3299
3300 if (m != NULL) {
3301 if (!(flags & MSG_PEEK)) {
3302 /*
3303 * We get here because m points to an mbuf following
3304 * any MT_SONAME or MT_CONTROL mbufs which have been
3305 * processed above. In any case, m should be pointing
3306 * to the head of the mbuf chain, and the nextrecord
3307 * should be either NULL or equal to m->m_nextpkt.
3308 * See comments above about SB_LOCK.
3309 */
3310 if (m != so->so_rcv.sb_mb ||
3311 m->m_nextpkt != nextrecord) {
3312 panic("%s: post-control !sync so=%p m=%p "
3313 "nextrecord=%p\n", __func__, so, m,
3314 nextrecord);
3315 /* NOTREACHED */
3316 }
3317 if (nextrecord == NULL)
3318 so->so_rcv.sb_lastrecord = m;
3319 }
3320 type = m->m_type;
3321 if (type == MT_OOBDATA)
3322 flags |= MSG_OOB;
3323 } else {
3324 if (!(flags & MSG_PEEK)) {
3325 SB_EMPTY_FIXUP(&so->so_rcv);
3326 }
3327 }
3328 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3329 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3330
3331 moff = 0;
3332 offset = 0;
3333
3334 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3335 can_delay = 1;
3336 else
3337 can_delay = 0;
3338
3339 need_event = 0;
3340
3341 while (m != NULL &&
3342 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3343 if (m->m_type == MT_OOBDATA) {
3344 if (type != MT_OOBDATA)
3345 break;
3346 } else if (type == MT_OOBDATA) {
3347 break;
3348 }
3349 /*
3350 * Make sure to always set the MSG_OOB flag when getting
3351 * out-of-band data inline.
3352 */
3353 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3354 (so->so_options & SO_OOBINLINE) != 0 &&
3355 (so->so_state & SS_RCVATMARK) != 0) {
3356 flags |= MSG_OOB;
3357 }
3358 so->so_state &= ~SS_RCVATMARK;
3359 len = uio_resid(uio) - delayed_copy_len;
3360 if (so->so_oobmark && len > so->so_oobmark - offset)
3361 len = so->so_oobmark - offset;
3362 if (len > m->m_len - moff)
3363 len = m->m_len - moff;
3364 /*
3365 * If mp is set, just pass back the mbufs.
3366 * Otherwise copy them out via the uio, then free.
3367 * Sockbuf must be consistent here (points to current mbuf,
3368 * it points to next record) when we drop priority;
3369 * we must note any additions to the sockbuf when we
3370 * block interrupts again.
3371 */
3372 if (mp == NULL) {
3373 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3374 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3375 if (can_delay && len == m->m_len) {
3376 /*
3377 * Only delay the copy if we're consuming the
3378 * mbuf, we're NOT in MSG_PEEK mode, and we
3379 * have enough data to make it worthwhile to
3380 * drop and retake the lock... can_delay
3381 * reflects the state of the two latter
3382 * constraints; moff should always be zero
3383 * in these cases.
3384 */
3385 delayed_copy_len += len;
3386 } else {
3387 if (delayed_copy_len) {
3388 error = sodelayed_copy(so, uio,
3389 &free_list, &delayed_copy_len);
3390
3391 if (error) {
3392 goto release;
3393 }
3394 /*
3395 * We can only get here if MSG_PEEK is
3396 * not set; therefore, m should point at
3397 * the head of the rcv queue. If it
3398 * doesn't, something drastically
3399 * changed while we were out from behind
3400 * the lock in sodelayed_copy, perhaps
3401 * a RST on the stream. In any event,
3402 * the stream has been interrupted; it's
3403 * probably best just to return whatever
3404 * data we've moved and let the caller
3405 * sort it out...
3406 */
3407 if (m != so->so_rcv.sb_mb) {
3408 break;
3409 }
3410 }
3411 socket_unlock(so, 0);
3412 error = uiomove(mtod(m, caddr_t) + moff,
3413 (int)len, uio);
3414 socket_lock(so, 0);
3415
3416 if (error)
3417 goto release;
3418 }
3419 } else {
3420 uio_setresid(uio, (uio_resid(uio) - len));
3421 }
3422 if (len == m->m_len - moff) {
3423 if (m->m_flags & M_EOR)
3424 flags |= MSG_EOR;
3425 if (flags & MSG_PEEK) {
3426 m = m->m_next;
3427 moff = 0;
3428 } else {
3429 nextrecord = m->m_nextpkt;
3430 sbfree(&so->so_rcv, m);
3431 m->m_nextpkt = NULL;
3432
3433 /*
3434 * If this packet is an unordered packet
3435 * (indicated by M_UNORDERED_DATA flag), remove
3436 * the additional bytes added to the
3437 * receive socket buffer size.
3438 */
3439 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3440 m->m_len &&
3441 (m->m_flags & M_UNORDERED_DATA) &&
3442 sbreserve(&so->so_rcv,
3443 so->so_rcv.sb_hiwat - m->m_len)) {
3444 if (so->so_msg_state->msg_uno_bytes >
3445 m->m_len) {
3446 so->so_msg_state->
3447 msg_uno_bytes -= m->m_len;
3448 } else {
3449 so->so_msg_state->
3450 msg_uno_bytes = 0;
3451 }
3452 m->m_flags &= ~M_UNORDERED_DATA;
3453 }
3454
3455 if (mp != NULL) {
3456 *mp = m;
3457 mp = &m->m_next;
3458 so->so_rcv.sb_mb = m = m->m_next;
3459 *mp = NULL;
3460 } else {
3461 if (free_list == NULL)
3462 free_list = m;
3463 else
3464 ml->m_next = m;
3465 ml = m;
3466 so->so_rcv.sb_mb = m = m->m_next;
3467 ml->m_next = NULL;
3468 }
3469 if (m != NULL) {
3470 m->m_nextpkt = nextrecord;
3471 if (nextrecord == NULL)
3472 so->so_rcv.sb_lastrecord = m;
3473 } else {
3474 so->so_rcv.sb_mb = nextrecord;
3475 SB_EMPTY_FIXUP(&so->so_rcv);
3476 }
3477 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3478 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3479 }
3480 } else {
3481 if (flags & MSG_PEEK) {
3482 moff += len;
3483 } else {
3484 if (mp != NULL) {
3485 int copy_flag;
3486
3487 if (flags & MSG_DONTWAIT)
3488 copy_flag = M_DONTWAIT;
3489 else
3490 copy_flag = M_WAIT;
3491 *mp = m_copym(m, 0, len, copy_flag);
3492 /*
3493 * Failed to allocate an mbuf?
3494 * Adjust uio_resid back, it was
3495 * adjusted down by len bytes which
3496 * we didn't copy over.
3497 */
3498 if (*mp == NULL) {
3499 uio_setresid(uio,
3500 (uio_resid(uio) + len));
3501 break;
3502 }
3503 }
3504 m->m_data += len;
3505 m->m_len -= len;
3506 so->so_rcv.sb_cc -= len;
3507 }
3508 }
3509 if (so->so_oobmark) {
3510 if ((flags & MSG_PEEK) == 0) {
3511 so->so_oobmark -= len;
3512 if (so->so_oobmark == 0) {
3513 so->so_state |= SS_RCVATMARK;
3514 /*
3515 * delay posting the actual event until
3516 * after any delayed copy processing
3517 * has finished
3518 */
3519 need_event = 1;
3520 break;
3521 }
3522 } else {
3523 offset += len;
3524 if (offset == so->so_oobmark)
3525 break;
3526 }
3527 }
3528 if (flags & MSG_EOR)
3529 break;
3530 /*
3531 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3532 * (for non-atomic socket), we must not quit until
3533 * "uio->uio_resid == 0" or an error termination.
3534 * If a signal/timeout occurs, return with a short
3535 * count but without error. Keep sockbuf locked
3536 * against other readers.
3537 */
3538 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3539 (uio_resid(uio) - delayed_copy_len) > 0 &&
3540 !sosendallatonce(so) && !nextrecord) {
3541 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3542 #if CONTENT_FILTER
3543 && cfil_sock_data_pending(&so->so_rcv) == 0
3544 #endif /* CONTENT_FILTER */
3545 ))
3546 goto release;
3547
3548 /*
3549 * Depending on the protocol (e.g. TCP), the following
3550 * might cause the socket lock to be dropped and later
3551 * be reacquired, and more data could have arrived and
3552 * have been appended to the receive socket buffer by
3553 * the time it returns. Therefore, we only sleep in
3554 * sbwait() below if and only if the socket buffer is
3555 * empty, in order to avoid a false sleep.
3556 */
3557 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3558 (((struct inpcb *)so->so_pcb)->inp_state !=
3559 INPCB_STATE_DEAD))
3560 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3561
3562 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3563 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3564
3565 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3566 error = 0;
3567 goto release;
3568 }
3569 /*
3570 * We have to wait until after we get back from the sbwait
3571 * to do the copy, because we will drop the lock if we
3572 * have enough data that has been delayed. By dropping
3573 * the lock we open up a window allowing the netisr
3574 * thread to process the incoming packets and to change
3575 * the state of this socket. We're issuing the sbwait
3576 * because the socket is empty and we're expecting the
3577 * netisr thread to wake us up when more packets arrive;
3578 * if we allow that processing to happen and then sbwait,
3579 * we could stall forever with packets sitting in the
3580 * socket if no further packets arrive from the remote
3581 * side.
3582 *
3583 * We want to copy before we've collected all the data
3584 * needed to satisfy this request, to allow the copy to
3585 * overlap the incoming packet processing on an MP system.
3586 */
3587 if (delayed_copy_len > sorecvmincopy &&
3588 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3589 error = sodelayed_copy(so, uio,
3590 &free_list, &delayed_copy_len);
3591
3592 if (error)
3593 goto release;
3594 }
3595 m = so->so_rcv.sb_mb;
3596 if (m != NULL) {
3597 nextrecord = m->m_nextpkt;
3598 }
3599 SB_MB_CHECK(&so->so_rcv);
3600 }
3601 }
3602 #ifdef MORE_LOCKING_DEBUG
3603 if (so->so_usecount <= 1) {
3604 panic("%s: after big while so=%p ref=%d on socket\n",
3605 __func__, so, so->so_usecount);
3606 /* NOTREACHED */
3607 }
3608 #endif
3609
3610 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3611 if (so->so_options & SO_DONTTRUNC) {
3612 flags |= MSG_RCVMORE;
3613 } else {
3614 flags |= MSG_TRUNC;
3615 if ((flags & MSG_PEEK) == 0)
3616 (void) sbdroprecord(&so->so_rcv);
3617 }
3618 }
3619
3620 /*
3621 * pru_rcvd below (for TCP) may cause more data to be received
3622 * if the socket lock is dropped prior to sending the ACK; some
3623 * legacy OpenTransport applications don't handle this well
3624 * (if it receives less data than requested while MSG_HAVEMORE
3625 * is set), and so we set the flag now based on what we know
3626 * prior to calling pru_rcvd.
3627 */
3628 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3629 flags |= MSG_HAVEMORE;
3630
3631 if ((flags & MSG_PEEK) == 0) {
3632 if (m == NULL) {
3633 so->so_rcv.sb_mb = nextrecord;
3634 /*
3635 * First part is an inline SB_EMPTY_FIXUP(). Second
3636 * part makes sure sb_lastrecord is up-to-date if
3637 * there is still data in the socket buffer.
3638 */
3639 if (so->so_rcv.sb_mb == NULL) {
3640 so->so_rcv.sb_mbtail = NULL;
3641 so->so_rcv.sb_lastrecord = NULL;
3642 } else if (nextrecord->m_nextpkt == NULL) {
3643 so->so_rcv.sb_lastrecord = nextrecord;
3644 }
3645 SB_MB_CHECK(&so->so_rcv);
3646 }
3647 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3648 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3649 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3650 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3651 }
3652
3653 if (delayed_copy_len) {
3654 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3655 if (error)
3656 goto release;
3657 }
3658 if (free_list != NULL) {
3659 m_freem_list(free_list);
3660 free_list = NULL;
3661 }
3662 if (need_event)
3663 postevent(so, 0, EV_OOB);
3664
3665 if (orig_resid == uio_resid(uio) && orig_resid &&
3666 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3667 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3668 goto restart;
3669 }
3670
3671 if (flagsp != NULL)
3672 *flagsp |= flags;
3673 release:
3674 #ifdef MORE_LOCKING_DEBUG
3675 if (so->so_usecount <= 1) {
3676 panic("%s: release so=%p ref=%d on socket\n", __func__,
3677 so, so->so_usecount);
3678 /* NOTREACHED */
3679 }
3680 #endif
3681 if (delayed_copy_len)
3682 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3683
3684 if (free_list != NULL)
3685 m_freem_list(free_list);
3686
3687 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3688
3689 if (en_tracing) {
3690 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3691 VM_KERNEL_ADDRPERM(so),
3692 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3693 (int64_t)(orig_resid - uio_resid(uio)));
3694 }
3695 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3696 so->so_rcv.sb_cc, 0, error);
3697
3698 return (error);
3699 }
3700
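/*
 * A minimal userspace sketch of two flags documented above: MSG_PEEK
 * leaves the record queued (soreceive() walks the chain without freeing
 * it), so a later read sees the same bytes, while MSG_WAITALL keeps
 * soreceive() looping in sbwait() until the full count arrives, the
 * connection closes, or an error/signal occurs. fd is assumed to be a
 * connected stream socket; the helper name is illustrative.
 */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
peek_then_read_all(int fd, void *buf, size_t len)
{
	ssize_t peeked = recv(fd, buf, len, MSG_PEEK);
	if (peeked <= 0)
		return (peeked);
	return (recv(fd, buf, len, MSG_WAITALL));
}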
3701 /*
3702 * Returns: 0 Success
3703 * uiomove:EFAULT
3704 */
3705 static int
3706 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3707 user_ssize_t *resid)
3708 {
3709 int error = 0;
3710 struct mbuf *m;
3711
3712 m = *free_list;
3713
3714 socket_unlock(so, 0);
3715
3716 while (m != NULL && error == 0) {
3717 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3718 m = m->m_next;
3719 }
3720 m_freem_list(*free_list);
3721
3722 *free_list = NULL;
3723 *resid = 0;
3724
3725 socket_lock(so, 0);
3726
3727 return (error);
3728 }
3729
3730 static int
3731 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3732 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3733 {
3734 #pragma unused(so)
3735 int error = 0;
3736 struct mbuf *ml, *m;
3737 int i = 0;
3738 struct uio *auio;
3739
3740 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3741 ml = ml->m_nextpkt, i++) {
3742 auio = msgarray[i].uio;
3743 for (m = ml; m != NULL; m = m->m_next) {
3744 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3745 if (error != 0)
3746 goto out;
3747 }
3748 }
3749 out:
3750 m_freem_list(*free_list);
3751
3752 *free_list = NULL;
3753 *resid = 0;
3754
3755 return (error);
3756 }
3757
3758 int
3759 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3760 int *flagsp)
3761 {
3762 struct mbuf *m;
3763 struct mbuf *nextrecord;
3764 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3765 int error;
3766 user_ssize_t len, pktlen, delayed_copy_len = 0;
3767 struct protosw *pr = so->so_proto;
3768 user_ssize_t resid;
3769 struct proc *p = current_proc();
3770 struct uio *auio = NULL;
3771 int npkts = 0;
3772 int sblocked = 0;
3773 struct sockaddr **psa = NULL;
3774 struct mbuf **controlp = NULL;
3775 int can_delay;
3776 int flags;
3777 struct mbuf *free_others = NULL;
3778
3779 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3780 so, uiocnt,
3781 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3782
3783 /*
3784 * Sanity checks:
3785 * - Only supports don't wait flags
3786 * - Only support datagram sockets (could be extended to raw)
3787 * - Must be atomic
3788 * - Protocol must support packet chains
3789 * - The uio array is NULL (should we panic?)
3790 */
3791 if (flagsp != NULL)
3792 flags = *flagsp;
3793 else
3794 flags = 0;
3795 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3796 MSG_NBIO)) {
3797 printf("%s invalid flags 0x%x\n", __func__, flags);
3798 error = EINVAL;
3799 goto out;
3800 }
3801 if (so->so_type != SOCK_DGRAM) {
3802 error = EINVAL;
3803 goto out;
3804 }
3805 if (sosendallatonce(so) == 0) {
3806 error = EINVAL;
3807 goto out;
3808 }
3809 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3810 error = EPROTONOSUPPORT;
3811 goto out;
3812 }
3813 if (msgarray == NULL) {
3814 printf("%s uioarray is NULL\n", __func__);
3815 error = EINVAL;
3816 goto out;
3817 }
3818 if (uiocnt == 0) {
3819 printf("%s uiocnt is 0\n", __func__);
3820 error = EINVAL;
3821 goto out;
3822 }
3823 /*
3824 * Sanity check on the length passed by caller as we are making 'int'
3825 * comparisons
3826 */
3827 resid = recv_msg_array_resid(msgarray, uiocnt);
3828 if (resid < 0 || resid > INT_MAX) {
3829 error = EINVAL;
3830 goto out;
3831 }
3832
3833 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3834 can_delay = 1;
3835 else
3836 can_delay = 0;
3837
3838 socket_lock(so, 1);
3839 so_update_last_owner_locked(so, p);
3840 so_update_policy(so);
3841
3842 #if NECP
3843 so_update_necp_policy(so, NULL, NULL);
3844 #endif /* NECP */
3845
3846 /*
3847 * If a recv attempt is made on a previously-accepted socket
3848 * that has been marked as inactive (disconnected), reject
3849 * the request.
3850 */
3851 if (so->so_flags & SOF_DEFUNCT) {
3852 struct sockbuf *sb = &so->so_rcv;
3853
3854 error = ENOTCONN;
3855 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3856 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3857 SOCK_DOM(so), SOCK_TYPE(so), error));
3858 /*
3859 * This socket should have been disconnected and flushed
3860 * prior to being returned from sodefunct(); there should
3861 * be no data on its receive list, so panic otherwise.
3862 */
3863 if (so->so_state & SS_DEFUNCT)
3864 sb_empty_assert(sb, __func__);
3865 goto release;
3866 }
3867
3868 next:
3869 /*
3870 * The uio may be empty
3871 */
3872 if (npkts >= uiocnt) {
3873 error = 0;
3874 goto release;
3875 }
3876 restart:
3877 /*
3878 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3879 * and if so just return to the caller. This could happen when
3880 * soreceive() is called by a socket upcall function during the
3881 * time the socket is freed. The socket buffer would have been
3882 * locked across the upcall, therefore we cannot put this thread
3883 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3884 * we may livelock), because the lock on the socket buffer will
3885 * only be released when the upcall routine returns to its caller.
3886 * Because the socket has been officially closed, there can be
3887 * no further read on it.
3888 */
3889 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3890 (SS_NOFDREF | SS_CANTRCVMORE)) {
3891 error = 0;
3892 goto release;
3893 }
3894
3895 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3896 if (error) {
3897 goto release;
3898 }
3899 sblocked = 1;
3900
3901 m = so->so_rcv.sb_mb;
3902 /*
3903 * Block awaiting more datagrams if needed
3904 */
3905 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3906 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3907 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3908 /*
3909 * Panic if we notice inconsistencies in the socket's
3910 * receive list; both sb_mb and sb_cc should correctly
3911 * reflect the contents of the list, otherwise we may
3912 * end up with false positives during select() or poll()
3913 * which could put the application in a bad state.
3914 */
3915 SB_MB_CHECK(&so->so_rcv);
3916
3917 if (so->so_error) {
3918 error = so->so_error;
3919 if ((flags & MSG_PEEK) == 0)
3920 so->so_error = 0;
3921 goto release;
3922 }
3923 if (so->so_state & SS_CANTRCVMORE) {
3924 goto release;
3925 }
3926 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3927 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3928 error = ENOTCONN;
3929 goto release;
3930 }
3931 if ((so->so_state & SS_NBIO) ||
3932 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3933 error = EWOULDBLOCK;
3934 goto release;
3935 }
3936 /*
3937 * Do not block if we got some data
3938 */
3939 if (free_list != NULL) {
3940 error = 0;
3941 goto release;
3942 }
3943
3944 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3945 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3946
3947 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3948 sblocked = 0;
3949
3950 error = sbwait(&so->so_rcv);
3951 if (error) {
3952 goto release;
3953 }
3954 goto restart;
3955 }
3956
3957 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3958 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3959 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3960
3961 /*
3962 * Consume the current uio index as we have a datagram
3963 */
3964 auio = msgarray[npkts].uio;
3965 resid = uio_resid(auio);
3966 msgarray[npkts].which |= SOCK_MSG_DATA;
3967 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
3968 &msgarray[npkts].psa : NULL;
3969 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
3970 &msgarray[npkts].controlp : NULL;
3971 npkts += 1;
3972 nextrecord = m->m_nextpkt;
3973
3974 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3975 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
3976 if (error == ERESTART)
3977 goto restart;
3978 else if (error != 0)
3979 goto release;
3980 }
3981
3982 if (m != NULL && m->m_type == MT_CONTROL) {
3983 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3984 if (error != 0)
3985 goto release;
3986 }
3987
3988 if (m->m_pkthdr.len == 0) {
3989 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
3990 __func__, __LINE__,
3991 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3992 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
3993 m->m_type);
3994 }
3995
3996 /*
3997 * Loop to copy the mbufs of the current record
3998 * Support zero length packets
3999 */
4000 ml = NULL;
4001 pktlen = 0;
4002 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4003 if (m->m_len == 0)
4004 panic("%p m_len zero", m);
4005 if (m->m_type == 0)
4006 panic("%p m_type zero", m);
4007 /*
4008 * Clip to the residual length
4009 */
4010 if (len > m->m_len)
4011 len = m->m_len;
4012 pktlen += len;
4013 /*
4014 * Copy the mbufs via the uio or delay the copy.
4015 * The sockbuf must be consistent here (sb_mb points to the
4016 * current mbuf, m_nextpkt to the next record) when we drop
4017 * priority; we must note any additions to the sockbuf when
4018 * we block interrupts again.
4019 */
4020 if (len > 0 && can_delay == 0) {
4021 socket_unlock(so, 0);
4022 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4023 socket_lock(so, 0);
4024 if (error)
4025 goto release;
4026 } else {
4027 delayed_copy_len += len;
4028 }
4029
4030 if (len == m->m_len) {
4031 /*
4032 * m was entirely copied
4033 */
4034 sbfree(&so->so_rcv, m);
4035 nextrecord = m->m_nextpkt;
4036 m->m_nextpkt = NULL;
4037
4038 /*
4039 * Set the first packet to the head of the free list
4040 */
4041 if (free_list == NULL)
4042 free_list = m;
4043 /*
4044 * Link current packet to tail of free list
4045 */
4046 if (ml == NULL) {
4047 if (free_tail != NULL)
4048 free_tail->m_nextpkt = m;
4049 free_tail = m;
4050 }
4051 /*
4052 * Link current mbuf to last mbuf of current packet
4053 */
4054 if (ml != NULL)
4055 ml->m_next = m;
4056 ml = m;
4057
4058 /*
4059 * Move next buf to head of socket buffer
4060 */
4061 so->so_rcv.sb_mb = m = ml->m_next;
4062 ml->m_next = NULL;
4063
4064 if (m != NULL) {
4065 m->m_nextpkt = nextrecord;
4066 if (nextrecord == NULL)
4067 so->so_rcv.sb_lastrecord = m;
4068 } else {
4069 so->so_rcv.sb_mb = nextrecord;
4070 SB_EMPTY_FIXUP(&so->so_rcv);
4071 }
4072 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4073 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4074 } else {
4075 /*
4076 * Stop the loop on partial copy
4077 */
4078 break;
4079 }
4080 }
4081 #ifdef MORE_LOCKING_DEBUG
4082 if (so->so_usecount <= 1) {
4083 panic("%s: after big while so=%llx ref=%d on socket\n",
4084 __func__,
4085 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4086 /* NOTREACHED */
4087 }
4088 #endif
4089 /*
4090 * Tell the caller we made a partial copy
4091 */
4092 if (m != NULL) {
4093 if (so->so_options & SO_DONTTRUNC) {
4094 /*
4095 * Copyout first the freelist then the partial mbuf
4096 */
4097 socket_unlock(so, 0);
4098 if (delayed_copy_len)
4099 error = sodelayed_copy_list(so, msgarray,
4100 uiocnt, &free_list, &delayed_copy_len);
4101
4102 if (error == 0) {
4103 error = uiomove(mtod(m, caddr_t), (int)len,
4104 auio);
4105 }
4106 socket_lock(so, 0);
4107 if (error)
4108 goto release;
4109
4110 m->m_data += len;
4111 m->m_len -= len;
4112 so->so_rcv.sb_cc -= len;
4113 flags |= MSG_RCVMORE;
4114 } else {
4115 (void) sbdroprecord(&so->so_rcv);
4116 nextrecord = so->so_rcv.sb_mb;
4117 m = NULL;
4118 flags |= MSG_TRUNC;
4119 }
4120 }
4121
4122 if (m == NULL) {
4123 so->so_rcv.sb_mb = nextrecord;
4124 /*
4125 * First part is an inline SB_EMPTY_FIXUP(). Second
4126 * part makes sure sb_lastrecord is up-to-date if
4127 * there is still data in the socket buffer.
4128 */
4129 if (so->so_rcv.sb_mb == NULL) {
4130 so->so_rcv.sb_mbtail = NULL;
4131 so->so_rcv.sb_lastrecord = NULL;
4132 } else if (nextrecord->m_nextpkt == NULL) {
4133 so->so_rcv.sb_lastrecord = nextrecord;
4134 }
4135 SB_MB_CHECK(&so->so_rcv);
4136 }
4137 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4138 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4139
4140 /*
4141 * We can continue to the next packet as long as:
4142 * - We haven't exhausted the uio array
4143 * - There was no error
4144 * - A packet was not truncated
4145 * - We can still receive more data
4146 */
4147 if (npkts < uiocnt && error == 0 &&
4148 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4149 (so->so_state & SS_CANTRCVMORE) == 0) {
4150 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4151 sblocked = 0;
4152
4153 goto next;
4154 }
4155 if (flagsp != NULL)
4156 *flagsp |= flags;
4157
4158 release:
4159 /*
4160 * pru_rcvd may cause more data to be received if the socket lock
4161 * is dropped so we set MSG_HAVEMORE now based on what we know.
4162 * That way the caller won't be surprised if it receives less data
4163 * than requested.
4164 */
4165 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4166 flags |= MSG_HAVEMORE;
4167
4168 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4169 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4170
4171 if (sblocked)
4172 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4173 else
4174 socket_unlock(so, 1);
4175
4176 if (delayed_copy_len)
4177 error = sodelayed_copy_list(so, msgarray, uiocnt,
4178 &free_list, &delayed_copy_len);
4179 out:
4180 /*
4181 * Amortize the cost of freeing the mbufs
4182 */
4183 if (free_list != NULL)
4184 m_freem_list(free_list);
4185 if (free_others != NULL)
4186 m_freem_list(free_others);
4187
4188 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4189 0, 0, 0, 0);
4190 return (error);
4191 }
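
/*
 * Illustrative sketch, not part of the build: soreceive_list() above
 * consumes so_rcv one record (datagram) at a time; records are chained
 * through m_nextpkt and the mbufs of a record through m_next, optionally
 * led by MT_SONAME/MT_CONTROL mbufs.  The helper below is hypothetical
 * (example_dump_rcv_records is not a real XNU function) and assumes the
 * socket is locked and the receive sockbuf is held.
 */
#if 0
static void
example_dump_rcv_records(struct socket *so)
{
	struct mbuf *rec, *m;
	int n = 0;

	for (rec = so->so_rcv.sb_mb; rec != NULL; rec = rec->m_nextpkt) {
		long bytes = 0;

		for (m = rec; m != NULL; m = m->m_next) {
			/* skip address/control mbufs, count only data */
			if (m->m_type == MT_DATA ||
			    m->m_type == MT_HEADER ||
			    m->m_type == MT_OOBDATA)
				bytes += m->m_len;
		}
		printf("record %d: %ld data bytes\n", n++, bytes);
	}
}
#endif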
4192
4193 /*
4194 * Returns: 0 Success
4195 * EINVAL
4196 * ENOTCONN
4197 * <pru_shutdown>:EINVAL
4198 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4199 * <pru_shutdown>:ENOBUFS[TCP]
4200 * <pru_shutdown>:EMSGSIZE[TCP]
4201 * <pru_shutdown>:EHOSTUNREACH[TCP]
4202 * <pru_shutdown>:ENETUNREACH[TCP]
4203 * <pru_shutdown>:ENETDOWN[TCP]
4204 * <pru_shutdown>:ENOMEM[TCP]
4205 * <pru_shutdown>:EACCES[TCP]
4206 * <pru_shutdown>:EMSGSIZE[TCP]
4207 * <pru_shutdown>:ENOBUFS[TCP]
4208 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4209 * <pru_shutdown>:??? [other protocol families]
4210 */
4211 int
4212 soshutdown(struct socket *so, int how)
4213 {
4214 int error;
4215
4216 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4217
4218 switch (how) {
4219 case SHUT_RD:
4220 case SHUT_WR:
4221 case SHUT_RDWR:
4222 socket_lock(so, 1);
4223 if ((so->so_state &
4224 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4225 error = ENOTCONN;
4226 } else {
4227 error = soshutdownlock(so, how);
4228 }
4229 socket_unlock(so, 1);
4230 break;
4231 default:
4232 error = EINVAL;
4233 break;
4234 }
4235
4236 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4237
4238 return (error);
4239 }
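
/*
 * Illustrative user-land sketch, not part of the build: it exercises the
 * three "how" values accepted by soshutdown() above and the ENOTCONN case
 * for sockets that were never connected.  The descriptor name "fd" and the
 * error handling are illustrative assumptions.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>

static void
example_shutdown_usage(int fd)
{
	/* stop further sends (a FIN is sent on a connected TCP socket) */
	if (shutdown(fd, SHUT_WR) == -1 && errno == ENOTCONN)
		printf("socket was not connected\n");

	/* stop further receives only */
	(void) shutdown(fd, SHUT_RD);

	/* or both directions at once; any other "how" yields EINVAL */
	(void) shutdown(fd, SHUT_RDWR);
}
#endif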
4240
4241 int
4242 soshutdownlock_final(struct socket *so, int how)
4243 {
4244 struct protosw *pr = so->so_proto;
4245 int error = 0;
4246
4247 sflt_notify(so, sock_evt_shutdown, &how);
4248
4249 if (how != SHUT_WR) {
4250 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4251 /* read already shut down */
4252 error = ENOTCONN;
4253 goto done;
4254 }
4255 sorflush(so);
4256 postevent(so, 0, EV_RCLOSED);
4257 }
4258 if (how != SHUT_RD) {
4259 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4260 /* write already shut down */
4261 error = ENOTCONN;
4262 goto done;
4263 }
4264 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4265 postevent(so, 0, EV_WCLOSED);
4266 }
4267 done:
4268 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4269 return (error);
4270 }
4271
4272 int
4273 soshutdownlock(struct socket *so, int how)
4274 {
4275 int error = 0;
4276
4277 #if CONTENT_FILTER
4278 /*
4279 * A content filter may delay the actual shutdown until it
4280 * has processed the pending data
4281 */
4282 if (so->so_flags & SOF_CONTENT_FILTER) {
4283 error = cfil_sock_shutdown(so, &how);
4284 if (error == EJUSTRETURN) {
4285 error = 0;
4286 goto done;
4287 } else if (error != 0) {
4288 goto done;
4289 }
4290 }
4291 #endif /* CONTENT_FILTER */
4292
4293 error = soshutdownlock_final(so, how);
4294
4295 done:
4296 return (error);
4297 }
4298
4299 void
4300 sowflush(struct socket *so)
4301 {
4302 struct sockbuf *sb = &so->so_snd;
4303 #ifdef notyet
4304 lck_mtx_t *mutex_held;
4305 /*
4306 * XXX: This code is currently commented out, because we may get here
4307 * as part of sofreelastref(), and at that time, pr_getlock() may no
4308 * longer be able to return us the lock; this will be fixed in future.
4309 */
4310 if (so->so_proto->pr_getlock != NULL)
4311 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4312 else
4313 mutex_held = so->so_proto->pr_domain->dom_mtx;
4314
4315 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4316 #endif /* notyet */
4317
4318 /*
4319 * Obtain lock on the socket buffer (SB_LOCK). This is required
4320 * to prevent the socket buffer from being unexpectedly altered
4321 * while it is used by another thread in socket send/receive.
4322 *
4323 * sblock() must not fail here, hence the assertion.
4324 */
4325 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4326 VERIFY(sb->sb_flags & SB_LOCK);
4327
4328 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4329 sb->sb_flags |= SB_DROP;
4330 sb->sb_upcall = NULL;
4331 sb->sb_upcallarg = NULL;
4332
4333 sbunlock(sb, TRUE); /* keep socket locked */
4334
4335 selthreadclear(&sb->sb_sel);
4336 sbrelease(sb);
4337 }
4338
4339 void
4340 sorflush(struct socket *so)
4341 {
4342 struct sockbuf *sb = &so->so_rcv;
4343 struct protosw *pr = so->so_proto;
4344 struct sockbuf asb;
4345 #ifdef notyet
4346 lck_mtx_t *mutex_held;
4347 /*
4348 * XXX: This code is currently commented out, because we may get here
4349 * as part of sofreelastref(), and at that time, pr_getlock() may no
4350 * longer be able to return us the lock; this will be fixed in future.
4351 */
4352 if (so->so_proto->pr_getlock != NULL)
4353 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4354 else
4355 mutex_held = so->so_proto->pr_domain->dom_mtx;
4356
4357 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4358 #endif /* notyet */
4359
4360 sflt_notify(so, sock_evt_flush_read, NULL);
4361
4362 socantrcvmore(so);
4363
4364 /*
4365 * Obtain lock on the socket buffer (SB_LOCK). This is required
4366 * to prevent the socket buffer from being unexpectedly altered
4367 * while it is used by another thread in socket send/receive.
4368 *
4369 * sblock() must not fail here, hence the assertion.
4370 */
4371 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4372 VERIFY(sb->sb_flags & SB_LOCK);
4373
4374 /*
4375 * Copy only the relevant fields from "sb" to "asb" which we
4376 * need for sbrelease() to function. In particular, skip
4377 * sb_sel as it contains the wait queue linkage, which would
4378 * wreak havoc if we were to issue selthreadclear() on "asb".
4379 * Make sure to not carry over SB_LOCK in "asb", as we need
4380 * to acquire it later as part of sbrelease().
4381 */
4382 bzero(&asb, sizeof (asb));
4383 asb.sb_cc = sb->sb_cc;
4384 asb.sb_hiwat = sb->sb_hiwat;
4385 asb.sb_mbcnt = sb->sb_mbcnt;
4386 asb.sb_mbmax = sb->sb_mbmax;
4387 asb.sb_ctl = sb->sb_ctl;
4388 asb.sb_lowat = sb->sb_lowat;
4389 asb.sb_mb = sb->sb_mb;
4390 asb.sb_mbtail = sb->sb_mbtail;
4391 asb.sb_lastrecord = sb->sb_lastrecord;
4392 asb.sb_so = sb->sb_so;
4393 asb.sb_flags = sb->sb_flags;
4394 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4395 asb.sb_flags |= SB_DROP;
4396
4397 /*
4398 * Ideally we'd bzero() these and preserve the ones we need;
4399 * but to do that we'd need to shuffle things around in the
4400 * sockbuf, and we can't do it now because there are KEXTS
4401 * that are directly referring to the socket structure.
4402 *
4403 * Setting SB_DROP acts as a barrier to prevent further appends.
4404 * Clearing SB_SEL is done for selthreadclear() below.
4405 */
4406 sb->sb_cc = 0;
4407 sb->sb_hiwat = 0;
4408 sb->sb_mbcnt = 0;
4409 sb->sb_mbmax = 0;
4410 sb->sb_ctl = 0;
4411 sb->sb_lowat = 0;
4412 sb->sb_mb = NULL;
4413 sb->sb_mbtail = NULL;
4414 sb->sb_lastrecord = NULL;
4415 sb->sb_timeo.tv_sec = 0;
4416 sb->sb_timeo.tv_usec = 0;
4417 sb->sb_upcall = NULL;
4418 sb->sb_upcallarg = NULL;
4419 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4420 sb->sb_flags |= SB_DROP;
4421
4422 sbunlock(sb, TRUE); /* keep socket locked */
4423
4424 /*
4425 * Note that selthreadclear() is called on the original "sb" and
4426 * not the local "asb" because of the way wait queue linkage is
4427 * implemented. Given that selwakeup() may be triggered, SB_SEL
4428 * should no longer be set (cleared above.)
4429 */
4430 selthreadclear(&sb->sb_sel);
4431
4432 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4433 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4434
4435 sbrelease(&asb);
4436 }
4437
4438 /*
4439 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4440 * an additional variant to handle the case where the option value needs
4441 * to be some kind of integer, but not a specific size.
4442 * In addition to their use here, these functions are also called by the
4443 * protocol-level pr_ctloutput() routines.
4444 *
4445 * Returns: 0 Success
4446 * EINVAL
4447 * copyin:EFAULT
4448 */
4449 int
4450 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4451 {
4452 size_t valsize;
4453
4454 /*
4455 * If the user gives us more than we wanted, we ignore it,
4456 * but if we don't get the minimum length the caller
4457 * wants, we return EINVAL. On success, sopt->sopt_valsize
4458 * is set to however much we actually retrieved.
4459 */
4460 if ((valsize = sopt->sopt_valsize) < minlen)
4461 return (EINVAL);
4462 if (valsize > len)
4463 sopt->sopt_valsize = valsize = len;
4464
4465 if (sopt->sopt_p != kernproc)
4466 return (copyin(sopt->sopt_val, buf, valsize));
4467
4468 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4469 return (0);
4470 }
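
/*
 * Illustrative sketch, not part of the build: this is the usual way a
 * protocol pr_ctloutput() handler consumes an integer-sized option with
 * sooptcopyin().  The option name SO_EXAMPLE_OPTION and the handler name
 * are hypothetical; only sooptcopyin() and its error contract come from
 * the code above.
 */
#if 0
static int
example_pr_ctloutput(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	int error, optval;

	if (sopt->sopt_dir != SOPT_SET)
		return (ENOPROTOOPT);

	switch (sopt->sopt_name) {
	case SO_EXAMPLE_OPTION:		/* hypothetical option */
		/* copy exactly sizeof (int); short input yields EINVAL */
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		    sizeof (optval));
		if (error != 0)
			break;
		/* optval now holds the value supplied by the caller */
		error = 0;
		break;
	default:
		error = ENOPROTOOPT;
		break;
	}
	return (error);
}
#endif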
4471
4472 /*
4473 * sooptcopyin_timeval
4474 * Copy in a timeval value into tv_p, and take into account whether
4475 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4476 * code here so that we can verify the 64-bit tv_sec value before we lose
4477 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4478 */
4479 static int
4480 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4481 {
4482 int error;
4483
4484 if (proc_is64bit(sopt->sopt_p)) {
4485 struct user64_timeval tv64;
4486
4487 if (sopt->sopt_valsize < sizeof (tv64))
4488 return (EINVAL);
4489
4490 sopt->sopt_valsize = sizeof (tv64);
4491 if (sopt->sopt_p != kernproc) {
4492 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4493 if (error != 0)
4494 return (error);
4495 } else {
4496 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4497 sizeof (tv64));
4498 }
4499 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4500 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4501 return (EDOM);
4502
4503 tv_p->tv_sec = tv64.tv_sec;
4504 tv_p->tv_usec = tv64.tv_usec;
4505 } else {
4506 struct user32_timeval tv32;
4507
4508 if (sopt->sopt_valsize < sizeof (tv32))
4509 return (EINVAL);
4510
4511 sopt->sopt_valsize = sizeof (tv32);
4512 if (sopt->sopt_p != kernproc) {
4513 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4514 if (error != 0) {
4515 return (error);
4516 }
4517 } else {
4518 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4519 sizeof (tv32));
4520 }
4521 #ifndef __LP64__
4522 /*
4523 * K64todo "comparison is always false due to
4524 * limited range of data type"
4525 */
4526 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4527 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4528 return (EDOM);
4529 #endif
4530 tv_p->tv_sec = tv32.tv_sec;
4531 tv_p->tv_usec = tv32.tv_usec;
4532 }
4533 return (0);
4534 }
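
/*
 * Illustrative user-land sketch, not part of the build: sooptcopyin_timeval()
 * above is what ends up validating the timeval passed to SO_RCVTIMEO and
 * SO_SNDTIMEO -- tv_sec must be non-negative and tv_usec must be in
 * [0, 1000000), otherwise setsockopt() fails with EDOM.  The five-second
 * value is just an example.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
example_set_recv_timeout(int fd)
{
	struct timeval tv;

	tv.tv_sec = 5;		/* five seconds */
	tv.tv_usec = 0;		/* must be < 1000000 or EDOM is returned */

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif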
4535
4536 /*
4537 * Returns: 0 Success
4538 * EINVAL
4539 * ENOPROTOOPT
4540 * ENOBUFS
4541 * EDOM
4542 * sooptcopyin:EINVAL
4543 * sooptcopyin:EFAULT
4544 * sooptcopyin_timeval:EINVAL
4545 * sooptcopyin_timeval:EFAULT
4546 * sooptcopyin_timeval:EDOM
4547 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4548 * <pr_ctloutput>:???
4549 * sflt_attach_private:??? [whatever a filter author chooses]
4550 * <sf_setoption>:??? [whatever a filter author chooses]
4551 *
4552 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4553 * <sf_setoption> returns depend on what the filter author causes
4554 * their filter to return.
4555 */
4556 int
4557 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4558 {
4559 int error, optval;
4560 struct linger l;
4561 struct timeval tv;
4562 #if CONFIG_MACF_SOCKET
4563 struct mac extmac;
4564 #endif /* MAC_SOCKET */
4565
4566 if (sopt->sopt_dir != SOPT_SET)
4567 sopt->sopt_dir = SOPT_SET;
4568
4569 if (dolock)
4570 socket_lock(so, 1);
4571
4572 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4573 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4574 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4575 /* the socket has been shutdown, no more sockopt's */
4576 error = EINVAL;
4577 goto out;
4578 }
4579
4580 error = sflt_setsockopt(so, sopt);
4581 if (error != 0) {
4582 if (error == EJUSTRETURN)
4583 error = 0;
4584 goto out;
4585 }
4586
4587 if (sopt->sopt_level != SOL_SOCKET) {
4588 if (so->so_proto != NULL &&
4589 so->so_proto->pr_ctloutput != NULL) {
4590 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4591 goto out;
4592 }
4593 error = ENOPROTOOPT;
4594 } else {
4595 /*
4596 * Allow socket-level (SOL_SOCKET) options to be filtered by
4597 * the protocol layer, if needed. A zero value returned from
4598 * the handler means use default socket-level processing as
4599 * done by the rest of this routine. Otherwise, any other
4600 * return value indicates that the option is unsupported.
4601 */
4602 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4603 pru_socheckopt(so, sopt)) != 0)
4604 goto out;
4605
4606 error = 0;
4607 switch (sopt->sopt_name) {
4608 case SO_LINGER:
4609 case SO_LINGER_SEC:
4610 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4611 if (error != 0)
4612 goto out;
4613
4614 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4615 l.l_linger : l.l_linger * hz;
4616 if (l.l_onoff != 0)
4617 so->so_options |= SO_LINGER;
4618 else
4619 so->so_options &= ~SO_LINGER;
4620 break;
4621
4622 case SO_DEBUG:
4623 case SO_KEEPALIVE:
4624 case SO_DONTROUTE:
4625 case SO_USELOOPBACK:
4626 case SO_BROADCAST:
4627 case SO_REUSEADDR:
4628 case SO_REUSEPORT:
4629 case SO_OOBINLINE:
4630 case SO_TIMESTAMP:
4631 case SO_TIMESTAMP_MONOTONIC:
4632 case SO_DONTTRUNC:
4633 case SO_WANTMORE:
4634 case SO_WANTOOBFLAG:
4635 case SO_NOWAKEFROMSLEEP:
4636 error = sooptcopyin(sopt, &optval, sizeof (optval),
4637 sizeof (optval));
4638 if (error != 0)
4639 goto out;
4640 if (optval)
4641 so->so_options |= sopt->sopt_name;
4642 else
4643 so->so_options &= ~sopt->sopt_name;
4644 break;
4645
4646 case SO_SNDBUF:
4647 case SO_RCVBUF:
4648 case SO_SNDLOWAT:
4649 case SO_RCVLOWAT:
4650 error = sooptcopyin(sopt, &optval, sizeof (optval),
4651 sizeof (optval));
4652 if (error != 0)
4653 goto out;
4654
4655 /*
4656 * Values < 1 make no sense for any of these
4657 * options, so disallow them.
4658 */
4659 if (optval < 1) {
4660 error = EINVAL;
4661 goto out;
4662 }
4663
4664 switch (sopt->sopt_name) {
4665 case SO_SNDBUF:
4666 case SO_RCVBUF: {
4667 struct sockbuf *sb =
4668 (sopt->sopt_name == SO_SNDBUF) ?
4669 &so->so_snd : &so->so_rcv;
4670 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4671 error = ENOBUFS;
4672 goto out;
4673 }
4674 sb->sb_flags |= SB_USRSIZE;
4675 sb->sb_flags &= ~SB_AUTOSIZE;
4676 sb->sb_idealsize = (u_int32_t)optval;
4677 break;
4678 }
4679 /*
4680 * Make sure the low-water is never greater than
4681 * the high-water.
4682 */
4683 case SO_SNDLOWAT: {
4684 int space = sbspace(&so->so_snd);
4685 u_int32_t hiwat = so->so_snd.sb_hiwat;
4686
4687 if (so->so_snd.sb_flags & SB_UNIX) {
4688 struct unpcb *unp =
4689 (struct unpcb *)(so->so_pcb);
4690 if (unp != NULL &&
4691 unp->unp_conn != NULL) {
4692 hiwat += unp->unp_conn->unp_cc;
4693 }
4694 }
4695
4696 so->so_snd.sb_lowat =
4697 (optval > hiwat) ?
4698 hiwat : optval;
4699
4700 if (space >= so->so_snd.sb_lowat) {
4701 sowwakeup(so);
4702 }
4703 break;
4704 }
4705 case SO_RCVLOWAT: {
4706 int64_t data_len;
4707 so->so_rcv.sb_lowat =
4708 (optval > so->so_rcv.sb_hiwat) ?
4709 so->so_rcv.sb_hiwat : optval;
4710 data_len = so->so_rcv.sb_cc
4711 - so->so_rcv.sb_ctl;
4712 if (data_len >= so->so_rcv.sb_lowat)
4713 sorwakeup(so);
4714 break;
4715 }
4716 }
4717 break;
4718
4719 case SO_SNDTIMEO:
4720 case SO_RCVTIMEO:
4721 error = sooptcopyin_timeval(sopt, &tv);
4722 if (error != 0)
4723 goto out;
4724
4725 switch (sopt->sopt_name) {
4726 case SO_SNDTIMEO:
4727 so->so_snd.sb_timeo = tv;
4728 break;
4729 case SO_RCVTIMEO:
4730 so->so_rcv.sb_timeo = tv;
4731 break;
4732 }
4733 break;
4734
4735 case SO_NKE: {
4736 struct so_nke nke;
4737
4738 error = sooptcopyin(sopt, &nke, sizeof (nke),
4739 sizeof (nke));
4740 if (error != 0)
4741 goto out;
4742
4743 error = sflt_attach_internal(so, nke.nke_handle);
4744 break;
4745 }
4746
4747 case SO_NOSIGPIPE:
4748 error = sooptcopyin(sopt, &optval, sizeof (optval),
4749 sizeof (optval));
4750 if (error != 0)
4751 goto out;
4752 if (optval != 0)
4753 so->so_flags |= SOF_NOSIGPIPE;
4754 else
4755 so->so_flags &= ~SOF_NOSIGPIPE;
4756 break;
4757
4758 case SO_NOADDRERR:
4759 error = sooptcopyin(sopt, &optval, sizeof (optval),
4760 sizeof (optval));
4761 if (error != 0)
4762 goto out;
4763 if (optval != 0)
4764 so->so_flags |= SOF_NOADDRAVAIL;
4765 else
4766 so->so_flags &= ~SOF_NOADDRAVAIL;
4767 break;
4768
4769 case SO_REUSESHAREUID:
4770 error = sooptcopyin(sopt, &optval, sizeof (optval),
4771 sizeof (optval));
4772 if (error != 0)
4773 goto out;
4774 if (optval != 0)
4775 so->so_flags |= SOF_REUSESHAREUID;
4776 else
4777 so->so_flags &= ~SOF_REUSESHAREUID;
4778 break;
4779
4780 case SO_NOTIFYCONFLICT:
4781 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4782 error = EPERM;
4783 goto out;
4784 }
4785 error = sooptcopyin(sopt, &optval, sizeof (optval),
4786 sizeof (optval));
4787 if (error != 0)
4788 goto out;
4789 if (optval != 0)
4790 so->so_flags |= SOF_NOTIFYCONFLICT;
4791 else
4792 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4793 break;
4794
4795 case SO_RESTRICTIONS:
4796 error = sooptcopyin(sopt, &optval, sizeof (optval),
4797 sizeof (optval));
4798 if (error != 0)
4799 goto out;
4800
4801 error = so_set_restrictions(so, optval);
4802 break;
4803
4804 case SO_AWDL_UNRESTRICTED:
4805 if (SOCK_DOM(so) != PF_INET &&
4806 SOCK_DOM(so) != PF_INET6) {
4807 error = EOPNOTSUPP;
4808 goto out;
4809 }
4810 error = sooptcopyin(sopt, &optval, sizeof(optval),
4811 sizeof(optval));
4812 if (error != 0)
4813 goto out;
4814 if (optval != 0) {
4815 kauth_cred_t cred = NULL;
4816 proc_t ep = PROC_NULL;
4817
4818 if (so->so_flags & SOF_DELEGATED) {
4819 ep = proc_find(so->e_pid);
4820 if (ep)
4821 cred = kauth_cred_proc_ref(ep);
4822 }
4823 error = priv_check_cred(
4824 cred ? cred : so->so_cred,
4825 PRIV_NET_RESTRICTED_AWDL, 0);
4826 if (error == 0)
4827 inp_set_awdl_unrestricted(
4828 sotoinpcb(so));
4829 if (cred)
4830 kauth_cred_unref(&cred);
4831 if (ep != PROC_NULL)
4832 proc_rele(ep);
4833 } else
4834 inp_clear_awdl_unrestricted(sotoinpcb(so));
4835 break;
4836
4837 case SO_LABEL:
4838 #if CONFIG_MACF_SOCKET
4839 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4840 sizeof (extmac))) != 0)
4841 goto out;
4842
4843 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4844 so, &extmac);
4845 #else
4846 error = EOPNOTSUPP;
4847 #endif /* MAC_SOCKET */
4848 break;
4849
4850 case SO_UPCALLCLOSEWAIT:
4851 error = sooptcopyin(sopt, &optval, sizeof (optval),
4852 sizeof (optval));
4853 if (error != 0)
4854 goto out;
4855 if (optval != 0)
4856 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4857 else
4858 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4859 break;
4860
4861 case SO_RANDOMPORT:
4862 error = sooptcopyin(sopt, &optval, sizeof (optval),
4863 sizeof (optval));
4864 if (error != 0)
4865 goto out;
4866 if (optval != 0)
4867 so->so_flags |= SOF_BINDRANDOMPORT;
4868 else
4869 so->so_flags &= ~SOF_BINDRANDOMPORT;
4870 break;
4871
4872 case SO_NP_EXTENSIONS: {
4873 struct so_np_extensions sonpx;
4874
4875 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4876 sizeof (sonpx));
4877 if (error != 0)
4878 goto out;
4879 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4880 error = EINVAL;
4881 goto out;
4882 }
4883 /*
4884 * Only one bit defined for now
4885 */
4886 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4887 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4888 so->so_flags |= SOF_NPX_SETOPTSHUT;
4889 else
4890 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4891 }
4892 break;
4893 }
4894
4895 case SO_TRAFFIC_CLASS: {
4896 error = sooptcopyin(sopt, &optval, sizeof (optval),
4897 sizeof (optval));
4898 if (error != 0)
4899 goto out;
4900 error = so_set_traffic_class(so, optval);
4901 if (error != 0)
4902 goto out;
4903 break;
4904 }
4905
4906 case SO_RECV_TRAFFIC_CLASS: {
4907 error = sooptcopyin(sopt, &optval, sizeof (optval),
4908 sizeof (optval));
4909 if (error != 0)
4910 goto out;
4911 if (optval == 0)
4912 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4913 else
4914 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4915 break;
4916 }
4917
4918 case SO_TRAFFIC_CLASS_DBG: {
4919 struct so_tcdbg so_tcdbg;
4920
4921 error = sooptcopyin(sopt, &so_tcdbg,
4922 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4923 if (error != 0)
4924 goto out;
4925 error = so_set_tcdbg(so, &so_tcdbg);
4926 if (error != 0)
4927 goto out;
4928 break;
4929 }
4930
4931 case SO_PRIVILEGED_TRAFFIC_CLASS:
4932 error = priv_check_cred(kauth_cred_get(),
4933 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4934 if (error != 0)
4935 goto out;
4936 error = sooptcopyin(sopt, &optval, sizeof (optval),
4937 sizeof (optval));
4938 if (error != 0)
4939 goto out;
4940 if (optval == 0)
4941 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4942 else
4943 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4944 break;
4945
4946 case SO_DEFUNCTOK:
4947 error = sooptcopyin(sopt, &optval, sizeof (optval),
4948 sizeof (optval));
4949 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4950 if (error == 0)
4951 error = EBADF;
4952 goto out;
4953 }
4954 /*
4955 * Any process can set SO_DEFUNCTOK (clear
4956 * SOF_NODEFUNCT), but only root can clear
4957 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4958 */
4959 if (optval == 0 &&
4960 kauth_cred_issuser(kauth_cred_get()) == 0) {
4961 error = EPERM;
4962 goto out;
4963 }
4964 if (optval)
4965 so->so_flags &= ~SOF_NODEFUNCT;
4966 else
4967 so->so_flags |= SOF_NODEFUNCT;
4968
4969 if (SOCK_DOM(so) == PF_INET ||
4970 SOCK_DOM(so) == PF_INET6) {
4971 char s[MAX_IPv6_STR_LEN];
4972 char d[MAX_IPv6_STR_LEN];
4973 struct inpcb *inp = sotoinpcb(so);
4974
4975 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4976 "%s:%d] is now marked as %seligible for "
4977 "defunct\n", __func__, proc_selfpid(),
4978 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4979 (SOCK_TYPE(so) == SOCK_STREAM) ?
4980 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4981 ((SOCK_DOM(so) == PF_INET) ?
4982 (void *)&inp->inp_laddr.s_addr :
4983 (void *)&inp->in6p_laddr), s, sizeof (s)),
4984 ntohs(inp->in6p_lport),
4985 inet_ntop(SOCK_DOM(so),
4986 (SOCK_DOM(so) == PF_INET) ?
4987 (void *)&inp->inp_faddr.s_addr :
4988 (void *)&inp->in6p_faddr, d, sizeof (d)),
4989 ntohs(inp->in6p_fport),
4990 (so->so_flags & SOF_NODEFUNCT) ?
4991 "not " : ""));
4992 } else {
4993 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4994 "now marked as %seligible for defunct\n",
4995 __func__, proc_selfpid(),
4996 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4997 SOCK_DOM(so), SOCK_TYPE(so),
4998 (so->so_flags & SOF_NODEFUNCT) ?
4999 "not " : ""));
5000 }
5001 break;
5002
5003 case SO_ISDEFUNCT:
5004 /* This option is not settable */
5005 error = EINVAL;
5006 break;
5007
5008 case SO_OPPORTUNISTIC:
5009 error = sooptcopyin(sopt, &optval, sizeof (optval),
5010 sizeof (optval));
5011 if (error == 0)
5012 error = so_set_opportunistic(so, optval);
5013 break;
5014
5015 case SO_FLUSH:
5016 /* This option is handled by lower layer(s) */
5017 error = 0;
5018 break;
5019
5020 case SO_RECV_ANYIF:
5021 error = sooptcopyin(sopt, &optval, sizeof (optval),
5022 sizeof (optval));
5023 if (error == 0)
5024 error = so_set_recv_anyif(so, optval);
5025 break;
5026
5027 case SO_TRAFFIC_MGT_BACKGROUND: {
5028 /* This option is handled by lower layer(s) */
5029 error = 0;
5030 break;
5031 }
5032
5033 #if FLOW_DIVERT
5034 case SO_FLOW_DIVERT_TOKEN:
5035 error = flow_divert_token_set(so, sopt);
5036 break;
5037 #endif /* FLOW_DIVERT */
5038
5039
5040 case SO_DELEGATED:
5041 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5042 sizeof (optval))) != 0)
5043 break;
5044
5045 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5046 break;
5047
5048 case SO_DELEGATED_UUID: {
5049 uuid_t euuid;
5050
5051 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5052 sizeof (euuid))) != 0)
5053 break;
5054
5055 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5056 break;
5057 }
5058
5059 #if NECP
5060 case SO_NECP_ATTRIBUTES:
5061 error = necp_set_socket_attributes(so, sopt);
5062 break;
5063 #endif /* NECP */
5064
5065 #if MPTCP
5066 case SO_MPTCP_FASTJOIN:
5067 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5068 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5069 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5070 error = ENOPROTOOPT;
5071 break;
5072 }
5073
5074 error = sooptcopyin(sopt, &optval, sizeof (optval),
5075 sizeof (optval));
5076 if (error != 0)
5077 goto out;
5078 if (optval == 0)
5079 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5080 else
5081 so->so_flags |= SOF_MPTCP_FASTJOIN;
5082 break;
5083 #endif /* MPTCP */
5084
5085 case SO_EXTENDED_BK_IDLE:
5086 error = sooptcopyin(sopt, &optval, sizeof (optval),
5087 sizeof (optval));
5088 if (error == 0)
5089 error = so_set_extended_bk_idle(so, optval);
5090 break;
5091
5092 case SO_MARK_CELLFALLBACK:
5093 error = sooptcopyin(sopt, &optval, sizeof(optval),
5094 sizeof(optval));
5095 if (error != 0)
5096 goto out;
5097 if (optval < 0) {
5098 error = EINVAL;
5099 goto out;
5100 }
5101 if (optval == 0)
5102 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5103 else
5104 so->so_flags1 |= SOF1_CELLFALLBACK;
5105 break;
5106 default:
5107 error = ENOPROTOOPT;
5108 break;
5109 }
5110 if (error == 0 && so->so_proto != NULL &&
5111 so->so_proto->pr_ctloutput != NULL) {
5112 (void) so->so_proto->pr_ctloutput(so, sopt);
5113 }
5114 }
5115 out:
5116 if (dolock)
5117 socket_unlock(so, 1);
5118 return (error);
5119 }
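
/*
 * Illustrative user-land sketch, not part of the build: every socket-level
 * setsockopt() lands in sosetoptlock() above; the Darwin-specific
 * SO_NOSIGPIPE case simply toggles SOF_NOSIGPIPE so a write on a broken
 * connection returns EPIPE instead of raising SIGPIPE.  The helper name is
 * an illustrative assumption.
 */
#if 0
#include <sys/socket.h>

static int
example_disable_sigpipe(int fd)
{
	int on = 1;

	/* maps to the SO_NOSIGPIPE case in sosetoptlock() */
	return (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof (on)));
}
#endif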
5120
5121 /* Helper routines for getsockopt */
5122 int
5123 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5124 {
5125 int error;
5126 size_t valsize;
5127
5128 error = 0;
5129
5130 /*
5131 * Documented get behavior is that we always return a value,
5132 * possibly truncated to fit in the user's buffer.
5133 * Traditional behavior is that we always tell the user
5134 * precisely how much we copied, rather than something useful
5135 * like the total amount we had available for her.
5136 * Note that this interface is not idempotent; the entire answer must
5137 * be generated ahead of time.
5138 */
5139 valsize = min(len, sopt->sopt_valsize);
5140 sopt->sopt_valsize = valsize;
5141 if (sopt->sopt_val != USER_ADDR_NULL) {
5142 if (sopt->sopt_p != kernproc)
5143 error = copyout(buf, sopt->sopt_val, valsize);
5144 else
5145 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5146 }
5147 return (error);
5148 }
5149
5150 static int
5151 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5152 {
5153 int error;
5154 size_t len;
5155 struct user64_timeval tv64;
5156 struct user32_timeval tv32;
5157 const void * val;
5158 size_t valsize;
5159
5160 error = 0;
5161 if (proc_is64bit(sopt->sopt_p)) {
5162 len = sizeof (tv64);
5163 tv64.tv_sec = tv_p->tv_sec;
5164 tv64.tv_usec = tv_p->tv_usec;
5165 val = &tv64;
5166 } else {
5167 len = sizeof (tv32);
5168 tv32.tv_sec = tv_p->tv_sec;
5169 tv32.tv_usec = tv_p->tv_usec;
5170 val = &tv32;
5171 }
5172 valsize = min(len, sopt->sopt_valsize);
5173 sopt->sopt_valsize = valsize;
5174 if (sopt->sopt_val != USER_ADDR_NULL) {
5175 if (sopt->sopt_p != kernproc)
5176 error = copyout(val, sopt->sopt_val, valsize);
5177 else
5178 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5179 }
5180 return (error);
5181 }
5182
5183 /*
5184 * Return: 0 Success
5185 * ENOPROTOOPT
5186 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5187 * <pr_ctloutput>:???
5188 * <sf_getoption>:???
5189 */
5190 int
5191 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5192 {
5193 int error, optval;
5194 struct linger l;
5195 struct timeval tv;
5196 #if CONFIG_MACF_SOCKET
5197 struct mac extmac;
5198 #endif /* MAC_SOCKET */
5199
5200 if (sopt->sopt_dir != SOPT_GET)
5201 sopt->sopt_dir = SOPT_GET;
5202
5203 if (dolock)
5204 socket_lock(so, 1);
5205
5206 error = sflt_getsockopt(so, sopt);
5207 if (error != 0) {
5208 if (error == EJUSTRETURN)
5209 error = 0;
5210 goto out;
5211 }
5212
5213 if (sopt->sopt_level != SOL_SOCKET) {
5214 if (so->so_proto != NULL &&
5215 so->so_proto->pr_ctloutput != NULL) {
5216 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5217 goto out;
5218 }
5219 error = ENOPROTOOPT;
5220 } else {
5221 /*
5222 * Allow socket-level (SOL_SOCKET) options to be filtered by
5223 * the protocol layer, if needed. A zero value returned from
5224 * the handler means use default socket-level processing as
5225 * done by the rest of this routine. Otherwise, any other
5226 * return value indicates that the option is unsupported.
5227 */
5228 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5229 pru_socheckopt(so, sopt)) != 0)
5230 goto out;
5231
5232 error = 0;
5233 switch (sopt->sopt_name) {
5234 case SO_LINGER:
5235 case SO_LINGER_SEC:
5236 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5237 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5238 so->so_linger : so->so_linger / hz;
5239 error = sooptcopyout(sopt, &l, sizeof (l));
5240 break;
5241
5242 case SO_USELOOPBACK:
5243 case SO_DONTROUTE:
5244 case SO_DEBUG:
5245 case SO_KEEPALIVE:
5246 case SO_REUSEADDR:
5247 case SO_REUSEPORT:
5248 case SO_BROADCAST:
5249 case SO_OOBINLINE:
5250 case SO_TIMESTAMP:
5251 case SO_TIMESTAMP_MONOTONIC:
5252 case SO_DONTTRUNC:
5253 case SO_WANTMORE:
5254 case SO_WANTOOBFLAG:
5255 case SO_NOWAKEFROMSLEEP:
5256 optval = so->so_options & sopt->sopt_name;
5257 integer:
5258 error = sooptcopyout(sopt, &optval, sizeof (optval));
5259 break;
5260
5261 case SO_TYPE:
5262 optval = so->so_type;
5263 goto integer;
5264
5265 case SO_NREAD:
5266 if (so->so_proto->pr_flags & PR_ATOMIC) {
5267 int pkt_total;
5268 struct mbuf *m1;
5269
5270 pkt_total = 0;
5271 m1 = so->so_rcv.sb_mb;
5272 while (m1 != NULL) {
5273 if (m1->m_type == MT_DATA ||
5274 m1->m_type == MT_HEADER ||
5275 m1->m_type == MT_OOBDATA)
5276 pkt_total += m1->m_len;
5277 m1 = m1->m_next;
5278 }
5279 optval = pkt_total;
5280 } else {
5281 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5282 }
5283 goto integer;
5284
5285 case SO_NUMRCVPKT:
5286 if (so->so_proto->pr_flags & PR_ATOMIC) {
5287 int cnt = 0;
5288 struct mbuf *m1;
5289
5290 m1 = so->so_rcv.sb_mb;
5291 while (m1 != NULL) {
5292 if (m1->m_type == MT_DATA ||
5293 m1->m_type == MT_HEADER ||
5294 m1->m_type == MT_OOBDATA)
5295 cnt += 1;
5296 m1 = m1->m_nextpkt;
5297 }
5298 optval = cnt;
5299 goto integer;
5300 } else {
5301 error = EINVAL;
5302 break;
5303 }
5304
5305 case SO_NWRITE:
5306 optval = so->so_snd.sb_cc;
5307 goto integer;
5308
5309 case SO_ERROR:
5310 optval = so->so_error;
5311 so->so_error = 0;
5312 goto integer;
5313
5314 case SO_SNDBUF: {
5315 u_int32_t hiwat = so->so_snd.sb_hiwat;
5316
5317 if (so->so_snd.sb_flags & SB_UNIX) {
5318 struct unpcb *unp =
5319 (struct unpcb *)(so->so_pcb);
5320 if (unp != NULL && unp->unp_conn != NULL) {
5321 hiwat += unp->unp_conn->unp_cc;
5322 }
5323 }
5324
5325 optval = hiwat;
5326 goto integer;
5327 }
5328 case SO_RCVBUF:
5329 optval = so->so_rcv.sb_hiwat;
5330 goto integer;
5331
5332 case SO_SNDLOWAT:
5333 optval = so->so_snd.sb_lowat;
5334 goto integer;
5335
5336 case SO_RCVLOWAT:
5337 optval = so->so_rcv.sb_lowat;
5338 goto integer;
5339
5340 case SO_SNDTIMEO:
5341 case SO_RCVTIMEO:
5342 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5343 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5344
5345 error = sooptcopyout_timeval(sopt, &tv);
5346 break;
5347
5348 case SO_NOSIGPIPE:
5349 optval = (so->so_flags & SOF_NOSIGPIPE);
5350 goto integer;
5351
5352 case SO_NOADDRERR:
5353 optval = (so->so_flags & SOF_NOADDRAVAIL);
5354 goto integer;
5355
5356 case SO_REUSESHAREUID:
5357 optval = (so->so_flags & SOF_REUSESHAREUID);
5358 goto integer;
5359
5360
5361 case SO_NOTIFYCONFLICT:
5362 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5363 goto integer;
5364
5365 case SO_RESTRICTIONS:
5366 optval = so_get_restrictions(so);
5367 goto integer;
5368
5369 case SO_AWDL_UNRESTRICTED:
5370 if (SOCK_DOM(so) == PF_INET ||
5371 SOCK_DOM(so) == PF_INET6) {
5372 optval = inp_get_awdl_unrestricted(
5373 sotoinpcb(so));
5374 goto integer;
5375 } else
5376 error = EOPNOTSUPP;
5377 break;
5378
5379 case SO_LABEL:
5380 #if CONFIG_MACF_SOCKET
5381 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5382 sizeof (extmac))) != 0 ||
5383 (error = mac_socket_label_get(proc_ucred(
5384 sopt->sopt_p), so, &extmac)) != 0)
5385 break;
5386
5387 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5388 #else
5389 error = EOPNOTSUPP;
5390 #endif /* MAC_SOCKET */
5391 break;
5392
5393 case SO_PEERLABEL:
5394 #if CONFIG_MACF_SOCKET
5395 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5396 sizeof (extmac))) != 0 ||
5397 (error = mac_socketpeer_label_get(proc_ucred(
5398 sopt->sopt_p), so, &extmac)) != 0)
5399 break;
5400
5401 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5402 #else
5403 error = EOPNOTSUPP;
5404 #endif /* MAC_SOCKET */
5405 break;
5406
5407 #ifdef __APPLE_API_PRIVATE
5408 case SO_UPCALLCLOSEWAIT:
5409 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5410 goto integer;
5411 #endif
5412 case SO_RANDOMPORT:
5413 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5414 goto integer;
5415
5416 case SO_NP_EXTENSIONS: {
5417 struct so_np_extensions sonpx;
5418
5419 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5420 SONPX_SETOPTSHUT : 0;
5421 sonpx.npx_mask = SONPX_MASK_VALID;
5422
5423 error = sooptcopyout(sopt, &sonpx,
5424 sizeof (struct so_np_extensions));
5425 break;
5426 }
5427
5428 case SO_TRAFFIC_CLASS:
5429 optval = so->so_traffic_class;
5430 goto integer;
5431
5432 case SO_RECV_TRAFFIC_CLASS:
5433 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5434 goto integer;
5435
5436 case SO_TRAFFIC_CLASS_STATS:
5437 error = sooptcopyout(sopt, &so->so_tc_stats,
5438 sizeof (so->so_tc_stats));
5439 break;
5440
5441 case SO_TRAFFIC_CLASS_DBG:
5442 error = sogetopt_tcdbg(so, sopt);
5443 break;
5444
5445 case SO_PRIVILEGED_TRAFFIC_CLASS:
5446 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5447 goto integer;
5448
5449 case SO_DEFUNCTOK:
5450 optval = !(so->so_flags & SOF_NODEFUNCT);
5451 goto integer;
5452
5453 case SO_ISDEFUNCT:
5454 optval = (so->so_flags & SOF_DEFUNCT);
5455 goto integer;
5456
5457 case SO_OPPORTUNISTIC:
5458 optval = so_get_opportunistic(so);
5459 goto integer;
5460
5461 case SO_FLUSH:
5462 /* This option is not gettable */
5463 error = EINVAL;
5464 break;
5465
5466 case SO_RECV_ANYIF:
5467 optval = so_get_recv_anyif(so);
5468 goto integer;
5469
5470 case SO_TRAFFIC_MGT_BACKGROUND:
5471 /* This option is handled by lower layer(s) */
5472 if (so->so_proto != NULL &&
5473 so->so_proto->pr_ctloutput != NULL) {
5474 (void) so->so_proto->pr_ctloutput(so, sopt);
5475 }
5476 break;
5477
5478 #if FLOW_DIVERT
5479 case SO_FLOW_DIVERT_TOKEN:
5480 error = flow_divert_token_get(so, sopt);
5481 break;
5482 #endif /* FLOW_DIVERT */
5483
5484 #if NECP
5485 case SO_NECP_ATTRIBUTES:
5486 error = necp_get_socket_attributes(so, sopt);
5487 break;
5488 #endif /* NECP */
5489
5490 #if CONTENT_FILTER
5491 case SO_CFIL_SOCK_ID: {
5492 cfil_sock_id_t sock_id;
5493
5494 sock_id = cfil_sock_id_from_socket(so);
5495
5496 error = sooptcopyout(sopt, &sock_id,
5497 sizeof(cfil_sock_id_t));
5498 break;
5499 }
5500 #endif /* CONTENT_FILTER */
5501
5502 #if MPTCP
5503 case SO_MPTCP_FASTJOIN:
5504 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5505 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5506 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5507 error = ENOPROTOOPT;
5508 break;
5509 }
5510 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5511 /* Fixed along with rdar://19391339 */
5512 goto integer;
5513 #endif /* MPTCP */
5514
5515 case SO_EXTENDED_BK_IDLE:
5516 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5517 goto integer;
5518 case SO_MARK_CELLFALLBACK:
5519 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5520 ? 1 : 0;
5521 goto integer;
5522 default:
5523 error = ENOPROTOOPT;
5524 break;
5525 }
5526 }
5527 out:
5528 if (dolock)
5529 socket_unlock(so, 1);
5530 return (error);
5531 }
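
/*
 * Illustrative user-land sketch, not part of the build: it exercises the
 * SO_NREAD case of sogetoptlock() above.  For atomic (datagram) protocols
 * the kernel sums the data mbufs of the first record, i.e. the size of the
 * next datagram; for stream protocols it reports sb_cc minus control bytes.
 * The helper name is an illustrative assumption.
 */
#if 0
#include <sys/socket.h>

static int
example_bytes_readable(int fd, int *bytes)
{
	socklen_t len = sizeof (*bytes);

	/* maps to the SO_NREAD case in sogetoptlock() */
	return (getsockopt(fd, SOL_SOCKET, SO_NREAD, bytes, &len));
}
#endif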
5532
5533 /*
5534 * The size limit on our soopt_getm() is different from the one on FreeBSD.
5535 * We limit the size of options to MCLBYTES. This will have to change
5536 * if we need to define options that need more space than MCLBYTES.
5537 */
5538 int
5539 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5540 {
5541 struct mbuf *m, *m_prev;
5542 int sopt_size = sopt->sopt_valsize;
5543 int how;
5544
5545 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5546 return (EMSGSIZE);
5547
5548 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5549 MGET(m, how, MT_DATA);
5550 if (m == NULL)
5551 return (ENOBUFS);
5552 if (sopt_size > MLEN) {
5553 MCLGET(m, how);
5554 if ((m->m_flags & M_EXT) == 0) {
5555 m_free(m);
5556 return (ENOBUFS);
5557 }
5558 m->m_len = min(MCLBYTES, sopt_size);
5559 } else {
5560 m->m_len = min(MLEN, sopt_size);
5561 }
5562 sopt_size -= m->m_len;
5563 *mp = m;
5564 m_prev = m;
5565
5566 while (sopt_size > 0) {
5567 MGET(m, how, MT_DATA);
5568 if (m == NULL) {
5569 m_freem(*mp);
5570 return (ENOBUFS);
5571 }
5572 if (sopt_size > MLEN) {
5573 MCLGET(m, how);
5574 if ((m->m_flags & M_EXT) == 0) {
5575 m_freem(*mp);
5576 m_freem(m);
5577 return (ENOBUFS);
5578 }
5579 m->m_len = min(MCLBYTES, sopt_size);
5580 } else {
5581 m->m_len = min(MLEN, sopt_size);
5582 }
5583 sopt_size -= m->m_len;
5584 m_prev->m_next = m;
5585 m_prev = m;
5586 }
5587 return (0);
5588 }
5589
5590 /* copyin sopt data into mbuf chain */
5591 int
5592 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5593 {
5594 struct mbuf *m0 = m;
5595
5596 if (sopt->sopt_val == USER_ADDR_NULL)
5597 return (0);
5598 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5599 if (sopt->sopt_p != kernproc) {
5600 int error;
5601
5602 error = copyin(sopt->sopt_val, mtod(m, char *),
5603 m->m_len);
5604 if (error != 0) {
5605 m_freem(m0);
5606 return (error);
5607 }
5608 } else {
5609 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5610 mtod(m, char *), m->m_len);
5611 }
5612 sopt->sopt_valsize -= m->m_len;
5613 sopt->sopt_val += m->m_len;
5614 m = m->m_next;
5615 }
5616 /* should have been allocated with enough space at ip6_sooptmcopyin() */
5617 if (m != NULL) {
5618 panic("soopt_mcopyin");
5619 /* NOTREACHED */
5620 }
5621 return (0);
5622 }
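
/*
 * Illustrative sketch, not part of the build: the usual pairing of
 * soopt_getm() and soopt_mcopyin() is to size an mbuf chain to
 * sopt_valsize and then fill it from the caller's buffer, as
 * ip6_sooptmcopyin() does.  The helper below is hypothetical; note that
 * soopt_mcopyin() frees the chain itself on failure, per the code above.
 */
#if 0
static int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	/* allocate a chain large enough for sopt->sopt_valsize bytes */
	error = soopt_getm(sopt, mp);
	if (error != 0)
		return (error);

	/* copy the option data into the chain (chain is freed on error) */
	error = soopt_mcopyin(sopt, *mp);
	if (error != 0)
		*mp = NULL;
	return (error);
}
#endif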
5623
5624 /* copyout mbuf chain data into soopt */
5625 int
5626 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5627 {
5628 struct mbuf *m0 = m;
5629 size_t valsize = 0;
5630
5631 if (sopt->sopt_val == USER_ADDR_NULL)
5632 return (0);
5633 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5634 if (sopt->sopt_p != kernproc) {
5635 int error;
5636
5637 error = copyout(mtod(m, char *), sopt->sopt_val,
5638 m->m_len);
5639 if (error != 0) {
5640 m_freem(m0);
5641 return (error);
5642 }
5643 } else {
5644 bcopy(mtod(m, char *),
5645 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5646 }
5647 sopt->sopt_valsize -= m->m_len;
5648 sopt->sopt_val += m->m_len;
5649 valsize += m->m_len;
5650 m = m->m_next;
5651 }
5652 if (m != NULL) {
5653 /* a large enough soopt buffer should have been provided by user-land */
5654 m_freem(m0);
5655 return (EINVAL);
5656 }
5657 sopt->sopt_valsize = valsize;
5658 return (0);
5659 }
5660
5661 void
5662 sohasoutofband(struct socket *so)
5663 {
5664 if (so->so_pgid < 0)
5665 gsignal(-so->so_pgid, SIGURG);
5666 else if (so->so_pgid > 0)
5667 proc_signal(so->so_pgid, SIGURG);
5668 selwakeup(&so->so_rcv.sb_sel);
5669 }
5670
5671 int
5672 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5673 {
5674 #pragma unused(cred)
5675 struct proc *p = current_proc();
5676 int revents = 0;
5677
5678 socket_lock(so, 1);
5679 so_update_last_owner_locked(so, PROC_NULL);
5680 so_update_policy(so);
5681
5682 if (events & (POLLIN | POLLRDNORM))
5683 if (soreadable(so))
5684 revents |= events & (POLLIN | POLLRDNORM);
5685
5686 if (events & (POLLOUT | POLLWRNORM))
5687 if (sowriteable(so))
5688 revents |= events & (POLLOUT | POLLWRNORM);
5689
5690 if (events & (POLLPRI | POLLRDBAND))
5691 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5692 revents |= events & (POLLPRI | POLLRDBAND);
5693
5694 if (revents == 0) {
5695 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5696 /*
5697 * Darwin sets the flag first,
5698 * BSD calls selrecord first
5699 */
5700 so->so_rcv.sb_flags |= SB_SEL;
5701 selrecord(p, &so->so_rcv.sb_sel, wql);
5702 }
5703
5704 if (events & (POLLOUT | POLLWRNORM)) {
5705 /*
5706 * Darwin sets the flag first,
5707 * BSD calls selrecord first
5708 */
5709 so->so_snd.sb_flags |= SB_SEL;
5710 selrecord(p, &so->so_snd.sb_sel, wql);
5711 }
5712 }
5713
5714 socket_unlock(so, 1);
5715 return (revents);
5716 }
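
/*
 * Illustrative user-land sketch, not part of the build: sopoll() above maps
 * POLLIN/POLLRDNORM to soreadable(), POLLOUT/POLLWRNORM to sowriteable(),
 * and POLLPRI/POLLRDBAND to the out-of-band mark.  The one-second timeout
 * is an arbitrary example value.
 */
#if 0
#include <poll.h>

static int
example_wait_readable(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLPRI;	/* data or out-of-band mark */
	pfd.revents = 0;

	/* returns > 0 when sopoll() reports a matching event */
	return (poll(&pfd, 1, 1000));
}
#endif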
5717
5718 int
5719 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5720 {
5721 #pragma unused(fp)
5722 #if !CONFIG_MACF_SOCKET
5723 #pragma unused(ctx)
5724 #endif /* MAC_SOCKET */
5725 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5726 struct klist *skl;
5727
5728 socket_lock(so, 1);
5729 so_update_last_owner_locked(so, PROC_NULL);
5730 so_update_policy(so);
5731
5732 #if CONFIG_MACF_SOCKET
5733 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5734 kn, so) != 0) {
5735 socket_unlock(so, 1);
5736 return (1);
5737 }
5738 #endif /* MAC_SOCKET */
5739
5740 switch (kn->kn_filter) {
5741 case EVFILT_READ:
5742 kn->kn_fop = &soread_filtops;
5743 /*
5744 * If the caller explicitly asked for OOB results (e.g. poll()),
5745 * save that off in the hookid field and reserve the kn_flags
5746 * EV_OOBAND bit for output only.
5747 */
5748 if (kn->kn_flags & EV_OOBAND) {
5749 kn->kn_flags &= ~EV_OOBAND;
5750 kn->kn_hookid = EV_OOBAND;
5751 } else {
5752 kn->kn_hookid = 0;
5753 }
5754 skl = &so->so_rcv.sb_sel.si_note;
5755 break;
5756 case EVFILT_WRITE:
5757 kn->kn_fop = &sowrite_filtops;
5758 skl = &so->so_snd.sb_sel.si_note;
5759 break;
5760 case EVFILT_SOCK:
5761 kn->kn_fop = &sock_filtops;
5762 skl = &so->so_klist;
5763 kn->kn_hookid = 0;
5764 kn->kn_status |= KN_TOUCH;
5765 break;
5766 default:
5767 socket_unlock(so, 1);
5768 return (1);
5769 }
5770
5771 if (KNOTE_ATTACH(skl, kn)) {
5772 switch (kn->kn_filter) {
5773 case EVFILT_READ:
5774 so->so_rcv.sb_flags |= SB_KNOTE;
5775 break;
5776 case EVFILT_WRITE:
5777 so->so_snd.sb_flags |= SB_KNOTE;
5778 break;
5779 case EVFILT_SOCK:
5780 so->so_flags |= SOF_KNOTE;
5781 break;
5782 default:
5783 socket_unlock(so, 1);
5784 return (1);
5785 }
5786 }
5787 socket_unlock(so, 1);
5788 return (0);
5789 }
5790
5791 static void
5792 filt_sordetach(struct knote *kn)
5793 {
5794 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5795
5796 socket_lock(so, 1);
5797 if (so->so_rcv.sb_flags & SB_KNOTE)
5798 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5799 so->so_rcv.sb_flags &= ~SB_KNOTE;
5800 socket_unlock(so, 1);
5801 }
5802
5803 /*ARGSUSED*/
5804 static int
5805 filt_soread(struct knote *kn, long hint)
5806 {
5807 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5808
5809 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5810 socket_lock(so, 1);
5811
5812 if (so->so_options & SO_ACCEPTCONN) {
5813 int is_not_empty;
5814
5815 /*
5816 * Radar 6615193: handle the listen case dynamically
5817 * for the kqueue read filter. This allows listen() to be
5818 * called after registering the kqueue EVFILT_READ filter.
5819 */
5820
5821 kn->kn_data = so->so_qlen;
5822 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
5823
5824 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5825 socket_unlock(so, 1);
5826
5827 return (is_not_empty);
5828 }
5829
5830 /* socket isn't a listener */
5831 /*
5832 * NOTE_LOWAT specifies new low water mark in data, i.e.
5833 * the bytes of protocol data. We therefore exclude any
5834 * control bytes.
5835 */
5836 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5837
5838 /*
5839 * Clear out EV_OOBAND that filt_soread may have set in the
5840 * past.
5841 */
5842 kn->kn_flags &= ~EV_OOBAND;
5843 if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)) {
5844 kn->kn_flags |= EV_OOBAND;
5845 /*
5846 * If caller registered explicit interest in OOB data,
5847 * return immediately (data == amount beyond mark, for
5848 * legacy reasons - that should be changed later).
5849 */
5850 if (kn->kn_hookid == EV_OOBAND) {
5851 /*
5852 * When so_state is SS_RCVATMARK, so_oobmark
5853 * is 0.
5854 */
5855 kn->kn_data -= so->so_oobmark;
5856 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5857 socket_unlock(so, 1);
5858 return (1);
5859 }
5860 }
5861
5862 if ((so->so_state & SS_CANTRCVMORE)
5863 #if CONTENT_FILTER
5864 && cfil_sock_data_pending(&so->so_rcv) == 0
5865 #endif /* CONTENT_FILTER */
5866 ) {
5867 kn->kn_flags |= EV_EOF;
5868 kn->kn_fflags = so->so_error;
5869 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5870 socket_unlock(so, 1);
5871 return (1);
5872 }
5873
5874 if (so->so_error) { /* temporary udp error */
5875 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5876 socket_unlock(so, 1);
5877 return (1);
5878 }
5879
5880 int64_t lowwat = so->so_rcv.sb_lowat;
5881 /*
5882 * Ensure that when NOTE_LOWAT is used, the derived
5883 * low water mark is bounded by socket's rcv buf's
5884 * high and low water mark values.
5885 */
5886 if (kn->kn_sfflags & NOTE_LOWAT) {
5887 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5888 lowwat = so->so_rcv.sb_hiwat;
5889 else if (kn->kn_sdata > lowwat)
5890 lowwat = kn->kn_sdata;
5891 }
5892
5893 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5894 socket_unlock(so, 1);
5895
5896 /*
5897 * The order below is important. Since NOTE_LOWAT
5898 * overrides sb_lowat, check for NOTE_LOWAT case
5899 * first.
5900 */
5901 if (kn->kn_sfflags & NOTE_LOWAT)
5902 return (kn->kn_data >= lowwat);
5903
5904 return (so->so_rcv.sb_cc >= lowwat);
5905 }
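
/*
 * Illustrative user-land sketch, not part of the build: filt_soread() above
 * honors NOTE_LOWAT from kn_sfflags with the threshold taken from kn_sdata
 * (clamped to the receive buffer's high water mark), so a registration like
 * the one below only fires once at least "lowat" bytes are queued.  The
 * helper name is an illustrative assumption.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>

static int
example_wait_lowat(int kq, int fd, int lowat)
{
	struct kevent kev;

	/* fflags = NOTE_LOWAT, data = low-water threshold in bytes */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);

	/* block until filt_soread() reports kn_data >= lowat */
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif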
5906
5907 static void
5908 filt_sowdetach(struct knote *kn)
5909 {
5910 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5911 socket_lock(so, 1);
5912
5913 if (so->so_snd.sb_flags & SB_KNOTE)
5914 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5915 so->so_snd.sb_flags &= ~SB_KNOTE;
5916 socket_unlock(so, 1);
5917 }
5918
5919 int
5920 so_wait_for_if_feedback(struct socket *so)
5921 {
5922 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5923 (so->so_state & SS_ISCONNECTED)) {
5924 struct inpcb *inp = sotoinpcb(so);
5925 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5926 return (1);
5927 }
5928 return (0);
5929 }
5930
5931 /*ARGSUSED*/
5932 static int
5933 filt_sowrite(struct knote *kn, long hint)
5934 {
5935 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5936 int ret = 0;
5937
5938 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5939 socket_lock(so, 1);
5940
5941 kn->kn_data = sbspace(&so->so_snd);
5942 if (so->so_state & SS_CANTSENDMORE) {
5943 kn->kn_flags |= EV_EOF;
5944 kn->kn_fflags = so->so_error;
5945 ret = 1;
5946 goto out;
5947 }
5948 if (so->so_error) { /* temporary udp error */
5949 ret = 1;
5950 goto out;
5951 }
5952 if (!socanwrite(so)) {
5953 ret = 0;
5954 goto out;
5955 }
5956 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
5957 ret = 1;
5958 goto out;
5959 }
5960 int64_t lowwat = so->so_snd.sb_lowat;
5961 if (kn->kn_sfflags & NOTE_LOWAT) {
5962 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5963 lowwat = so->so_snd.sb_hiwat;
5964 else if (kn->kn_sdata > lowwat)
5965 lowwat = kn->kn_sdata;
5966 }
5967 if (kn->kn_data >= lowwat) {
5968 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5969 if ((SOCK_DOM(so) == PF_INET
5970 || SOCK_DOM(so) == PF_INET6)
5971 && so->so_type == SOCK_STREAM) {
5972 ret = tcp_notsent_lowat_check(so);
5973 }
5974 #if MPTCP
5975 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5976 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5977 ret = mptcp_notsent_lowat_check(so);
5978 }
5979 #endif
5980 else {
5981 ret = 1;
5982 goto out;
5983 }
5984 } else {
5985 ret = 1;
5986 }
5987 }
5988 if (so_wait_for_if_feedback(so))
5989 ret = 0;
5990 out:
5991 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5992 socket_unlock(so, 1);
5993 return (ret);
5994 }
5995
5996 static void
5997 filt_sockdetach(struct knote *kn)
5998 {
5999 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6000 socket_lock(so, 1);
6001
6002 if ((so->so_flags & SOF_KNOTE) != 0)
6003 if (KNOTE_DETACH(&so->so_klist, kn))
6004 so->so_flags &= ~SOF_KNOTE;
6005 socket_unlock(so, 1);
6006 }
6007
6008 static int
6009 filt_sockev(struct knote *kn, long hint)
6010 {
6011 int ret = 0, locked = 0;
6012 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6013 long ev_hint = (hint & SO_FILT_HINT_EV);
6014 uint32_t level_trigger = 0;
6015
6016 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6017 socket_lock(so, 1);
6018 locked = 1;
6019 }
6020
6021 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6022 kn->kn_fflags |= NOTE_CONNRESET;
6023 }
6024 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6025 kn->kn_fflags |= NOTE_TIMEOUT;
6026 }
6027 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6028 kn->kn_fflags |= NOTE_NOSRCADDR;
6029 }
6030 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6031 kn->kn_fflags |= NOTE_IFDENIED;
6032 }
6033 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6034 kn->kn_fflags |= NOTE_KEEPALIVE;
6035 }
6036 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6037 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6038 }
6039 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6040 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6041 }
6042 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6043 (so->so_state & SS_ISCONNECTED)) {
6044 kn->kn_fflags |= NOTE_CONNECTED;
6045 level_trigger |= NOTE_CONNECTED;
6046 }
6047 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6048 (so->so_state & SS_ISDISCONNECTED)) {
6049 kn->kn_fflags |= NOTE_DISCONNECTED;
6050 level_trigger |= NOTE_DISCONNECTED;
6051 }
6052 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6053 if (so->so_proto != NULL &&
6054 (so->so_proto->pr_flags & PR_EVCONNINFO))
6055 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6056 }
6057
6058 if ((so->so_state & SS_CANTRCVMORE)
6059 #if CONTENT_FILTER
6060 && cfil_sock_data_pending(&so->so_rcv) == 0
6061 #endif /* CONTENT_FILTER */
6062 ) {
6063 kn->kn_fflags |= NOTE_READCLOSED;
6064 level_trigger |= NOTE_READCLOSED;
6065 }
6066
6067 if (so->so_state & SS_CANTSENDMORE) {
6068 kn->kn_fflags |= NOTE_WRITECLOSED;
6069 level_trigger |= NOTE_WRITECLOSED;
6070 }
6071
6072 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6073 (so->so_flags & SOF_SUSPENDED)) {
6074 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6075
6076 /* If resume event was delivered before, reset it */
6077 kn->kn_hookid &= ~NOTE_RESUME;
6078
6079 kn->kn_fflags |= NOTE_SUSPEND;
6080 level_trigger |= NOTE_SUSPEND;
6081 }
6082
6083 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6084 (so->so_flags & SOF_SUSPENDED) == 0) {
6085 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6086
6087 /* If suspend event was delivered before, reset it */
6088 kn->kn_hookid &= ~NOTE_SUSPEND;
6089
6090 kn->kn_fflags |= NOTE_RESUME;
6091 level_trigger |= NOTE_RESUME;
6092 }
6093
6094 if (so->so_error != 0) {
6095 ret = 1;
6096 kn->kn_data = so->so_error;
6097 kn->kn_flags |= EV_EOF;
6098 } else {
6099 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6100 }
6101
6102 /* Reset any events that are not requested on this knote */
6103 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6104 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6105
6106 /* Find the level triggered events that are already delivered */
6107 level_trigger &= kn->kn_hookid;
6108 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6109
6110 /* Do not deliver level triggered events more than once */
6111 if ((kn->kn_fflags & ~level_trigger) != 0)
6112 ret = 1;
6113
6114 if (locked)
6115 socket_unlock(so, 1);
6116
6117 return (ret);
6118 }
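/*
 * Illustrative sketch, not part of this file: a client of this socket
 * event filter (EVFILT_SOCK) registers for a subset of the NOTE_* events
 * handled above, for example
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock_fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
 *	kevent(kq_fd, &kev, 1, NULL, 0, NULL);
 *
 * kn_sfflags then masks kn_fflags as shown above, and the level-triggered
 * events are reported at most once per transition thanks to the kn_hookid
 * bookkeeping in filt_socktouch() below. sock_fd and kq_fd are
 * placeholders.
 */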
6119
6120 static void
6121 filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type)
6122 {
6123 #pragma unused(kev)
6124 switch (type) {
6125 case EVENT_REGISTER:
6126 {
6127 uint32_t changed_flags;
6128 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6129
6130 /*
6131 * Since we keep track of events that are already
6132 * delivered, if any of those events are not requested
6133 * anymore, the state related to them can be reset
6134 */
6135 kn->kn_hookid &=
6136 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6137 break;
6138 }
6139 case EVENT_PROCESS:
6140 /*
6141 * Store the state of the events being delivered. This
6142 * state can be used to deliver level triggered events
6143 * at least once and still avoid waking up the application
6144 * multiple times as long as the event is active.
6145 */
6146 if (kn->kn_fflags != 0)
6147 kn->kn_hookid |= (kn->kn_fflags &
6148 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6149
6150 /*
6151 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6152 * only one of them, and remember which one was
6153 * delivered last
6154 */
6155 if (kn->kn_fflags & NOTE_SUSPEND)
6156 kn->kn_hookid &= ~NOTE_RESUME;
6157 if (kn->kn_fflags & NOTE_RESUME)
6158 kn->kn_hookid &= ~NOTE_SUSPEND;
6159 break;
6160 default:
6161 break;
6162 }
6163 }
6164
6165 void
6166 get_sockev_state(struct socket *so, u_int32_t *statep)
6167 {
6168 u_int32_t state = *(statep);
6169
6170 if (so->so_state & SS_ISCONNECTED)
6171 state |= SOCKEV_CONNECTED;
6172 else
6173 state &= ~(SOCKEV_CONNECTED);
6174 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6175 *(statep) = state;
6176 }
6177
6178 #define SO_LOCK_HISTORY_STR_LEN \
6179 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6180
6181 __private_extern__ const char *
6182 solockhistory_nr(struct socket *so)
6183 {
6184 size_t n = 0;
6185 int i;
6186 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6187
6188 bzero(lock_history_str, sizeof (lock_history_str));
6189 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6190 n += snprintf(lock_history_str + n,
6191 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6192 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6193 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6194 }
6195 return (lock_history_str);
6196 }
6197
6198 int
6199 socket_lock(struct socket *so, int refcount)
6200 {
6201 int error = 0;
6202 void *lr_saved;
6203
6204 lr_saved = __builtin_return_address(0);
6205
6206 if (so->so_proto->pr_lock) {
6207 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6208 } else {
6209 #ifdef MORE_LOCKING_DEBUG
6210 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6211 LCK_MTX_ASSERT_NOTOWNED);
6212 #endif
6213 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6214 if (refcount)
6215 so->so_usecount++;
6216 so->lock_lr[so->next_lock_lr] = lr_saved;
6217 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6218 }
6219
6220 return (error);
6221 }
6222
6223 int
6224 socket_unlock(struct socket *so, int refcount)
6225 {
6226 int error = 0;
6227 void *lr_saved;
6228 lck_mtx_t *mutex_held;
6229
6230 lr_saved = __builtin_return_address(0);
6231
6232 if (so->so_proto == NULL) {
6233 panic("%s: null so_proto so=%p\n", __func__, so);
6234 /* NOTREACHED */
6235 }
6236
6237 if (so->so_proto->pr_unlock) {
6238 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6239 } else {
6240 mutex_held = so->so_proto->pr_domain->dom_mtx;
6241 #ifdef MORE_LOCKING_DEBUG
6242 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6243 #endif
6244 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6245 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6246
6247 if (refcount) {
6248 if (so->so_usecount <= 0) {
6249 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6250 "lrh=%s", __func__, so->so_usecount, so,
6251 SOCK_DOM(so), so->so_type,
6252 SOCK_PROTO(so), solockhistory_nr(so));
6253 /* NOTREACHED */
6254 }
6255
6256 so->so_usecount--;
6257 if (so->so_usecount == 0)
6258 sofreelastref(so, 1);
6259 }
6260 lck_mtx_unlock(mutex_held);
6261 }
6262
6263 return (error);
6264 }
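/*
 * Usage sketch for the pair above, derived from the code itself rather
 * than any additional contract: callers that need the socket to survive
 * the critical section pass refcount == 1 so that so_usecount is bumped
 * on lock and dropped on unlock, with sofreelastref() run when the last
 * use count goes away:
 *
 *	socket_lock(so, 1);
 *	... operate on so ...
 *	socket_unlock(so, 1);
 *
 * Passing refcount == 0 takes or releases only the lock, as
 * soreference() and sodereference() below illustrate.
 */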
6265
6266 /* Called with socket locked, will unlock socket */
6267 void
6268 sofree(struct socket *so)
6269 {
6270 lck_mtx_t *mutex_held;
6271
6272 if (so->so_proto->pr_getlock != NULL)
6273 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6274 else
6275 mutex_held = so->so_proto->pr_domain->dom_mtx;
6276 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6277
6278 sofreelastref(so, 0);
6279 }
6280
6281 void
6282 soreference(struct socket *so)
6283 {
6284 socket_lock(so, 1); /* lock socket and take one reference */
6285 socket_unlock(so, 0); /* unlock only */
6286 }
6287
6288 void
6289 sodereference(struct socket *so)
6290 {
6291 socket_lock(so, 0);
6292 socket_unlock(so, 1);
6293 }
6294
6295 /*
6296 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6297 * possibility of using jumbo clusters. The caller must hold
6298 * the socket lock.
6299 */
6300 void
6301 somultipages(struct socket *so, boolean_t set)
6302 {
6303 if (set)
6304 so->so_flags |= SOF_MULTIPAGES;
6305 else
6306 so->so_flags &= ~SOF_MULTIPAGES;
6307 }
6308
6309 void
6310 soif2kcl(struct socket *so, boolean_t set)
6311 {
6312 if (set)
6313 so->so_flags1 |= SOF1_IF_2KCL;
6314 else
6315 so->so_flags1 &= ~SOF1_IF_2KCL;
6316 }
6317
6318 int
6319 so_isdstlocal(struct socket *so)
6320 {
6321 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6322
6323 if (SOCK_DOM(so) == PF_INET)
6324 return (inaddr_local(inp->inp_faddr));
6325 else if (SOCK_DOM(so) == PF_INET6)
6326 return (in6addr_local(&inp->in6p_faddr));
6327
6328 return (0);
6329 }
6330
6331 int
6332 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6333 {
6334 struct sockbuf *rcv, *snd;
6335 int err = 0, defunct;
6336
6337 rcv = &so->so_rcv;
6338 snd = &so->so_snd;
6339
6340 defunct = (so->so_flags & SOF_DEFUNCT);
6341 if (defunct) {
6342 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6343 panic("%s: SB_DROP not set", __func__);
6344 /* NOTREACHED */
6345 }
6346 goto done;
6347 }
6348
6349 if (so->so_flags & SOF_NODEFUNCT) {
6350 if (noforce) {
6351 err = EOPNOTSUPP;
6352 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6353 "so 0x%llx [%d,%d] is not eligible for defunct "
6354 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
6355 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6356 SOCK_DOM(so), SOCK_TYPE(so), err));
6357 return (err);
6358 }
6359 so->so_flags &= ~SOF_NODEFUNCT;
6360 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6361 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
6362 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6363 SOCK_DOM(so), SOCK_TYPE(so)));
6364 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6365 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6366 struct ifnet *ifp = inp->inp_last_outifp;
6367
6368 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6369 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6370 } else if (so->so_flags & SOF_DELEGATED) {
6371 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6372 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6373 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6374 } else if (noforce) {
6375 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6376
6377 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6378 so->so_extended_bk_start = net_uptime();
6379 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6380
6381 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6382
6383 err = EOPNOTSUPP;
6384 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6385 "extend bk idle "
6386 "so 0x%llx rcv hw %d cc %d\n",
6387 __func__, proc_selfpid(), proc_pid(p),
6388 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6389 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc));
6390 return (err);
6391 } else {
6392 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6393 }
6394 }
6395
6396 so->so_flags |= SOF_DEFUNCT;
6397
6398 /* Prevent further data from being appended to the socket buffers */
6399 snd->sb_flags |= SB_DROP;
6400 rcv->sb_flags |= SB_DROP;
6401
6402 /* Flush any existing data in the socket buffers */
6403 if (rcv->sb_cc != 0) {
6404 rcv->sb_flags &= ~SB_SEL;
6405 selthreadclear(&rcv->sb_sel);
6406 sbrelease(rcv);
6407 }
6408 if (snd->sb_cc != 0) {
6409 snd->sb_flags &= ~SB_SEL;
6410 selthreadclear(&snd->sb_sel);
6411 sbrelease(snd);
6412 }
6413
6414 done:
6415 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
6416 "defunct%s\n", __func__, proc_selfpid(), proc_pid(p), level,
6417 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
6418 defunct ? "is already" : "marked as",
6419 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : ""));
6420
6421 return (err);
6422 }
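/*
 * Descriptive note on the defunct flow: sosetdefunct() above only decides
 * whether the socket may be defuncted and marks it (SOF_DEFUNCT plus
 * SB_DROP on both buffers), while sodefunct() below performs the actual
 * teardown (shutdown of both directions, disconnect, buffer release and
 * SS_DEFUNCT). Callers run them back to back, as
 * so_stop_extended_bk_idle() does later in this file:
 *
 *	sosetdefunct(current_proc(), so,
 *	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
 *	if (so->so_flags & SOF_DEFUNCT)
 *		sodefunct(current_proc(), so,
 *		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
 */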
6423
6424 int
6425 sodefunct(struct proc *p, struct socket *so, int level)
6426 {
6427 struct sockbuf *rcv, *snd;
6428
6429 if (!(so->so_flags & SOF_DEFUNCT)) {
6430 panic("%s improperly called", __func__);
6431 /* NOTREACHED */
6432 }
6433 if (so->so_state & SS_DEFUNCT)
6434 goto done;
6435
6436 rcv = &so->so_rcv;
6437 snd = &so->so_snd;
6438
6439 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6440 char s[MAX_IPv6_STR_LEN];
6441 char d[MAX_IPv6_STR_LEN];
6442 struct inpcb *inp = sotoinpcb(so);
6443
6444 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
6445 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6446 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6447 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6448 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6449 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6450 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6451 s, sizeof (s)), ntohs(inp->in6p_lport),
6452 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6453 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6454 d, sizeof (d)), ntohs(inp->in6p_fport),
6455 (uint32_t)rcv->sb_sel.si_flags,
6456 (uint32_t)snd->sb_sel.si_flags,
6457 rcv->sb_flags, snd->sb_flags));
6458 } else {
6459 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6460 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6461 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6462 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6463 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
6464 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6465 snd->sb_flags));
6466 }
6467
6468 /*
6469 * Unwedge threads blocked on sbwait() and sb_lock().
6470 */
6471 sbwakeup(rcv);
6472 sbwakeup(snd);
6473
6474 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6475 if (rcv->sb_flags & SB_LOCK)
6476 sbunlock(rcv, TRUE); /* keep socket locked */
6477 if (snd->sb_flags & SB_LOCK)
6478 sbunlock(snd, TRUE); /* keep socket locked */
6479
6480 /*
6481 * Flush the buffers and disconnect. We explicitly call shutdown
6482 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6483 * states are set for the socket. This would also flush out data
6484 * hanging off the receive list of this socket.
6485 */
6486 (void) soshutdownlock_final(so, SHUT_RD);
6487 (void) soshutdownlock_final(so, SHUT_WR);
6488 (void) sodisconnectlocked(so);
6489
6490 /*
6491 * Explicitly handle connectionless-protocol disconnection
6492 * and release any remaining data in the socket buffers.
6493 */
6494 if (!(so->so_state & SS_ISDISCONNECTED))
6495 (void) soisdisconnected(so);
6496
6497 if (so->so_error == 0)
6498 so->so_error = EBADF;
6499
6500 if (rcv->sb_cc != 0) {
6501 rcv->sb_flags &= ~SB_SEL;
6502 selthreadclear(&rcv->sb_sel);
6503 sbrelease(rcv);
6504 }
6505 if (snd->sb_cc != 0) {
6506 snd->sb_flags &= ~SB_SEL;
6507 selthreadclear(&snd->sb_sel);
6508 sbrelease(snd);
6509 }
6510 so->so_state |= SS_DEFUNCT;
6511
6512 done:
6513 return (0);
6514 }
6515
6516 int
6517 soresume(struct proc *p, struct socket *so, int locked)
6518 {
6519 if (locked == 0)
6520 socket_lock(so, 1);
6521
6522 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6523 SODEFUNCTLOG(("%s[%d]: (target pid %d) so 0x%llx [%d,%d] "
6524 "resumed from bk idle\n",
6525 __func__, proc_selfpid(), proc_pid(p),
6526 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6527 SOCK_DOM(so), SOCK_TYPE(so)));
6528
6529 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6530 so->so_extended_bk_start = 0;
6531 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6532
6533 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6534 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6535 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6536 }
6537 if (locked == 0)
6538 socket_unlock(so, 1);
6539
6540 return (0);
6541 }
6542
6543 /*
6544 * Does not attempt to account for sockets that are delegated from
6545 * the current process
6546 */
6547 int
6548 so_set_extended_bk_idle(struct socket *so, int optval)
6549 {
6550 int error = 0;
6551
6552 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6553 SOCK_PROTO(so) != IPPROTO_TCP) {
6554 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6555 error = EOPNOTSUPP;
6556 } else if (optval == 0) {
6557 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6558
6559 soresume(current_proc(), so, 1);
6560 } else {
6561 struct proc *p = current_proc();
6562 int i;
6563 struct filedesc *fdp;
6564 int count = 0;
6565
6566 proc_fdlock(p);
6567
6568 fdp = p->p_fd;
6569 for (i = 0; i < fdp->fd_nfiles; i++) {
6570 struct fileproc *fp = fdp->fd_ofiles[i];
6571 struct socket *so2;
6572
6573 if (fp == NULL ||
6574 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6575 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6576 continue;
6577
6578 so2 = (struct socket *)fp->f_fglob->fg_data;
6579 if (so != so2 &&
6580 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6581 count++;
6582 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6583 break;
6584 }
6585 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6586 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6587 error = EBUSY;
6588 } else if (so->so_flags & SOF_DELEGATED) {
6589 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6590 error = EBUSY;
6591 } else {
6592 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6593 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6594 }
6595 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] "
6596 "%s marked for extended bk idle\n",
6597 __func__, proc_selfpid(),
6598 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6599 SOCK_DOM(so), SOCK_TYPE(so),
6600 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6601 "is" : "not"));
6602
6603 proc_fdunlock(p);
6604 }
6605
6606 return (error);
6607 }
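/*
 * Illustrative sketch; the SO_EXTENDED_BK_IDLE option name is an
 * assumption and not taken from this section. An application would
 * typically reach so_set_extended_bk_idle() through a SOL_SOCKET-level
 * setsockopt such as
 *
 *	int on = 1;
 *	setsockopt(sock_fd, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
 *	    &on, sizeof (on));
 *
 * Per the checks above, only TCP sockets over PF_INET/PF_INET6 qualify,
 * at most so_xbkidle_maxperproc sockets per process may be marked, and
 * delegated sockets are refused. sock_fd is a placeholder.
 */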
6608
6609 static void
6610 so_stop_extended_bk_idle(struct socket *so)
6611 {
6612 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6613 so->so_extended_bk_start = 0;
6614
6615 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6616 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6617 /*
6618 * Force defunct
6619 */
6620 sosetdefunct(current_proc(), so,
6621 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6622 if (so->so_flags & SOF_DEFUNCT) {
6623 sodefunct(current_proc(), so,
6624 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6625 }
6626 }
6627
6628 void
6629 so_drain_extended_bk_idle(struct socket *so)
6630 {
6631 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6632 /*
6633 * Only penalize sockets that have outstanding data
6634 */
6635 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6636 so_stop_extended_bk_idle(so);
6637
6638 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6639 }
6640 }
6641 }
6642
6643 /*
6644 * Return value tells whether the socket is still in extended background idle
6645 */
6646 int
6647 so_check_extended_bk_idle_time(struct socket *so)
6648 {
6649 int ret = 1;
6650
6651 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6652 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d]\n",
6653 __func__, proc_selfpid(),
6654 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6655 SOCK_DOM(so), SOCK_TYPE(so)));
6656 if (net_uptime() - so->so_extended_bk_start >
6657 soextbkidlestat.so_xbkidle_time) {
6658 so_stop_extended_bk_idle(so);
6659
6660 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
6661
6662 ret = 0;
6663 } else {
6664 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6665
6666 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6667 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
6668 }
6669 }
6670
6671 return (ret);
6672 }
6673
6674 void
6675 resume_proc_sockets(proc_t p)
6676 {
6677 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
6678 struct filedesc *fdp;
6679 int i;
6680
6681 proc_fdlock(p);
6682 fdp = p->p_fd;
6683 for (i = 0; i < fdp->fd_nfiles; i++) {
6684 struct fileproc *fp;
6685 struct socket *so;
6686
6687 fp = fdp->fd_ofiles[i];
6688 if (fp == NULL ||
6689 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6690 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6691 continue;
6692
6693 so = (struct socket *)fp->f_fglob->fg_data;
6694 (void) soresume(p, so, 0);
6695 }
6696 proc_fdunlock(p);
6697
6698 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6699 }
6700 }
6701
6702 __private_extern__ int
6703 so_set_recv_anyif(struct socket *so, int optval)
6704 {
6705 int ret = 0;
6706
6707 #if INET6
6708 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6709 #else
6710 if (SOCK_DOM(so) == PF_INET) {
6711 #endif /* !INET6 */
6712 if (optval)
6713 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6714 else
6715 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6716 }
6717
6718 return (ret);
6719 }
6720
6721 __private_extern__ int
6722 so_get_recv_anyif(struct socket *so)
6723 {
6724 int ret = 0;
6725
6726 #if INET6
6727 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6728 #else
6729 if (SOCK_DOM(so) == PF_INET) {
6730 #endif /* !INET6 */
6731 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6732 }
6733
6734 return (ret);
6735 }
6736
6737 int
6738 so_set_restrictions(struct socket *so, uint32_t vals)
6739 {
6740 int nocell_old, nocell_new;
6741 int noexpensive_old, noexpensive_new;
6742
6743 /*
6744 * Deny-type restrictions are trapdoors; once set they cannot be
6745 * unset for the lifetime of the socket. This allows them to be
6746 * issued by a framework on behalf of the application without
6747 * having to worry that they can be undone.
6748 *
6749 * Note here that socket-level restrictions override any protocol-
6750 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
6751 * restriction issued on the socket takes precedence over
6752 * INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6753 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6754 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6755 */
6756 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6757 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6758 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6759 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6760 SO_RESTRICT_DENY_EXPENSIVE));
6761 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6762 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6763
6764 /* we can only set, not clear restrictions */
6765 if ((nocell_new - nocell_old) == 0 &&
6766 (noexpensive_new - noexpensive_old) == 0)
6767 return (0);
6768 #if INET6
6769 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6770 #else
6771 if (SOCK_DOM(so) == PF_INET) {
6772 #endif /* !INET6 */
6773 if (nocell_new - nocell_old != 0) {
6774 /*
6775 * if deny cellular is now set, do what's needed
6776 * for INPCB
6777 */
6778 inp_set_nocellular(sotoinpcb(so));
6779 }
6780 if (noexpensive_new - noexpensive_old != 0) {
6781 inp_set_noexpensive(sotoinpcb(so));
6782 }
6783 }
6784
6785 return (0);
6786 }
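/*
 * Illustrative sketch; the SO_RESTRICTIONS option name is an assumption,
 * while the restriction bits are the ones handled above. A framework
 * acting on behalf of an application might pin a socket off cellular and
 * expensive interfaces with something like
 *
 *	uint32_t restrictions =
 *	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;
 *	setsockopt(sock_fd, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &restrictions, sizeof (restrictions));
 *
 * Because deny-type restrictions are trapdoors, later calls can add bits
 * but never clear them for the lifetime of the socket. sock_fd is a
 * placeholder.
 */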
6787
6788 uint32_t
6789 so_get_restrictions(struct socket *so)
6790 {
6791 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6792 SO_RESTRICT_DENY_OUT |
6793 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6794 }
6795
6796 struct sockaddr_entry *
6797 sockaddrentry_alloc(int how)
6798 {
6799 struct sockaddr_entry *se;
6800
6801 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6802 if (se != NULL)
6803 bzero(se, se_zone_size);
6804
6805 return (se);
6806 }
6807
6808 void
6809 sockaddrentry_free(struct sockaddr_entry *se)
6810 {
6811 if (se->se_addr != NULL) {
6812 FREE(se->se_addr, M_SONAME);
6813 se->se_addr = NULL;
6814 }
6815 zfree(se_zone, se);
6816 }
6817
6818 struct sockaddr_entry *
6819 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6820 {
6821 struct sockaddr_entry *dst_se;
6822
6823 dst_se = sockaddrentry_alloc(how);
6824 if (dst_se != NULL) {
6825 int len = src_se->se_addr->sa_len;
6826
6827 MALLOC(dst_se->se_addr, struct sockaddr *,
6828 len, M_SONAME, how | M_ZERO);
6829 if (dst_se->se_addr != NULL) {
6830 bcopy(src_se->se_addr, dst_se->se_addr, len);
6831 } else {
6832 sockaddrentry_free(dst_se);
6833 dst_se = NULL;
6834 }
6835 }
6836
6837 return (dst_se);
6838 }
6839
6840 struct sockaddr_list *
6841 sockaddrlist_alloc(int how)
6842 {
6843 struct sockaddr_list *sl;
6844
6845 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6846 if (sl != NULL) {
6847 bzero(sl, sl_zone_size);
6848 TAILQ_INIT(&sl->sl_head);
6849 }
6850 return (sl);
6851 }
6852
6853 void
6854 sockaddrlist_free(struct sockaddr_list *sl)
6855 {
6856 struct sockaddr_entry *se, *tse;
6857
6858 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6859 sockaddrlist_remove(sl, se);
6860 sockaddrentry_free(se);
6861 }
6862 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6863 zfree(sl_zone, sl);
6864 }
6865
6866 void
6867 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6868 {
6869 VERIFY(!(se->se_flags & SEF_ATTACHED));
6870 se->se_flags |= SEF_ATTACHED;
6871 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6872 sl->sl_cnt++;
6873 VERIFY(sl->sl_cnt != 0);
6874 }
6875
6876 void
6877 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6878 {
6879 VERIFY(se->se_flags & SEF_ATTACHED);
6880 se->se_flags &= ~SEF_ATTACHED;
6881 VERIFY(sl->sl_cnt != 0);
6882 sl->sl_cnt--;
6883 TAILQ_REMOVE(&sl->sl_head, se, se_link);
6884 }
6885
6886 struct sockaddr_list *
6887 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6888 {
6889 struct sockaddr_entry *src_se, *tse;
6890 struct sockaddr_list *dst_sl;
6891
6892 dst_sl = sockaddrlist_alloc(how);
6893 if (dst_sl == NULL)
6894 return (NULL);
6895
6896 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6897 struct sockaddr_entry *dst_se;
6898
6899 if (src_se->se_addr == NULL)
6900 continue;
6901
6902 dst_se = sockaddrentry_dup(src_se, how);
6903 if (dst_se == NULL) {
6904 sockaddrlist_free(dst_sl);
6905 return (NULL);
6906 }
6907
6908 sockaddrlist_insert(dst_sl, dst_se);
6909 }
6910 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6911
6912 return (dst_sl);
6913 }
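/*
 * Usage sketch for the sockaddr_list helpers above, derived from the
 * functions themselves:
 *
 *	struct sockaddr_list *sl = sockaddrlist_alloc(M_WAITOK);
 *	struct sockaddr_entry *se = sockaddrentry_alloc(M_WAITOK);
 *	... MALLOC and fill se->se_addr (M_SONAME memory, as
 *	    sockaddrentry_free() and sockaddrentry_dup() expect) ...
 *	sockaddrlist_insert(sl, se);
 *	struct sockaddr_list *copy = sockaddrlist_dup(sl, M_WAITOK);
 *	sockaddrlist_free(copy);
 *	sockaddrlist_free(sl);
 *
 * sockaddrlist_free() also frees each entry and its se_addr, and entries
 * whose se_addr is NULL are skipped by sockaddrlist_dup().
 */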
6914
6915 int
6916 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6917 {
6918 struct proc *ep = PROC_NULL;
6919 int error = 0;
6920
6921 /* pid 0 is reserved for kernel */
6922 if (epid == 0) {
6923 error = EINVAL;
6924 goto done;
6925 }
6926
6927 /*
6928 * If this is an in-kernel socket, prevent its delegate
6929 * association from changing unless the socket option is
6930 * coming from within the kernel itself.
6931 */
6932 if (so->last_pid == 0 && p != kernproc) {
6933 error = EACCES;
6934 goto done;
6935 }
6936
6937 /*
6938 * If this is issued by a process that's recorded as the
6939 * real owner of the socket, or if the pid is the same as
6940 * the process's own pid, then proceed. Otherwise ensure
6941 * that the issuing process has the necessary privileges.
6942 */
6943 if (epid != so->last_pid || epid != proc_pid(p)) {
6944 if ((error = priv_check_cred(kauth_cred_get(),
6945 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6946 error = EACCES;
6947 goto done;
6948 }
6949 }
6950
6951 /* Find the process that corresponds to the effective pid */
6952 if ((ep = proc_find(epid)) == PROC_NULL) {
6953 error = ESRCH;
6954 goto done;
6955 }
6956
6957 /*
6958 * If a process tries to delegate the socket to itself, then
6959 * there's really nothing to do; treat it as a way for the
6960 * delegate association to be cleared. Note that we check
6961 * the passed-in proc rather than calling proc_selfpid(),
6962 * as we need to check the process issuing the socket option
6963 * which could be kernproc. Given that we don't allow 0 for
6964 * effective pid, it means that a delegated in-kernel socket
6965 * stays delegated during its lifetime (which is probably OK.)
6966 */
6967 if (epid == proc_pid(p)) {
6968 so->so_flags &= ~SOF_DELEGATED;
6969 so->e_upid = 0;
6970 so->e_pid = 0;
6971 uuid_clear(so->e_uuid);
6972 } else {
6973 so->so_flags |= SOF_DELEGATED;
6974 so->e_upid = proc_uniqueid(ep);
6975 so->e_pid = proc_pid(ep);
6976 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6977 }
6978 done:
6979 if (error == 0 && net_io_policy_log) {
6980 uuid_string_t buf;
6981
6982 uuid_unparse(so->e_uuid, buf);
6983 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6984 "euuid %s%s\n", __func__, proc_name_address(p),
6985 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6986 SOCK_DOM(so), SOCK_TYPE(so),
6987 so->e_pid, proc_name_address(ep), buf,
6988 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6989 } else if (error != 0 && net_io_policy_log) {
6990 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6991 "ERROR (%d)\n", __func__, proc_name_address(p),
6992 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6993 SOCK_DOM(so), SOCK_TYPE(so),
6994 epid, (ep == PROC_NULL) ? "PROC_NULL" :
6995 proc_name_address(ep), error);
6996 }
6997
6998 /* Update this socket's policy upon success */
6999 if (error == 0) {
7000 so->so_policy_gencnt *= -1;
7001 so_update_policy(so);
7002 #if NECP
7003 so_update_necp_policy(so, NULL, NULL);
7004 #endif /* NECP */
7005 }
7006
7007 if (ep != PROC_NULL)
7008 proc_rele(ep);
7009
7010 return (error);
7011 }
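/*
 * Illustrative sketch; the SO_DELEGATED option name is an assumption,
 * while the privilege and behavior are those implemented above. A broker
 * process holding PRIV_NET_PRIVILEGED_SOCKET_DELEGATE can attribute
 * traffic on a socket it owns to another process by pid:
 *
 *	pid_t epid = client_pid;	// client_pid is a placeholder
 *	setsockopt(sock_fd, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid));
 *
 * Passing its own pid instead clears the delegation (SOF_DELEGATED and
 * the e_pid/e_upid/e_uuid fields are reset), and the socket's policy/NECP
 * state is re-evaluated on success.
 */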
7012
7013 int
7014 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7015 {
7016 uuid_string_t buf;
7017 uuid_t uuid;
7018 int error = 0;
7019
7020 /* UUID must not be all-zeroes (reserved for kernel) */
7021 if (uuid_is_null(euuid)) {
7022 error = EINVAL;
7023 goto done;
7024 }
7025
7026 /*
7027 * If this is an in-kernel socket, prevent its delegate
7028 * association from changing unless the socket option is
7029 * coming from within the kernel itself.
7030 */
7031 if (so->last_pid == 0 && p != kernproc) {
7032 error = EACCES;
7033 goto done;
7034 }
7035
7036 /* Get the UUID of the issuing process */
7037 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7038
7039 /*
7040 * If this is issued by a process that's recorded as the
7041 * real owner of the socket, or if the uuid is the same as
7042 * the process's own uuid, then proceed. Otherwise ensure
7043 * that the issuing process has the necessary privileges.
7044 */
7045 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7046 uuid_compare(euuid, uuid) != 0) {
7047 if ((error = priv_check_cred(kauth_cred_get(),
7048 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7049 error = EACCES;
7050 goto done;
7051 }
7052 }
7053
7054 /*
7055 * If a process tries to delegate the socket to itself, then
7056 * there's really nothing to do; treat it as a way for the
7057 * delegate association to be cleared. Note that we check
7058 * the uuid of the passed-in proc rather than that of the
7059 * current process, as we need to check the process issuing
7060 * the socket option which could be kernproc itself. Given
7061 * that we don't allow 0 for effective uuid, it means that
7062 * a delegated in-kernel socket stays delegated during its
7063 * lifetime (which is okay.)
7064 */
7065 if (uuid_compare(euuid, uuid) == 0) {
7066 so->so_flags &= ~SOF_DELEGATED;
7067 so->e_upid = 0;
7068 so->e_pid = 0;
7069 uuid_clear(so->e_uuid);
7070 } else {
7071 so->so_flags |= SOF_DELEGATED;
7072 /*
7073 * Unlike so_set_effective_pid(), we only have the UUID
7074 * here and the process ID is not known. Inherit the
7075 * real {pid,upid} of the socket.
7076 */
7077 so->e_upid = so->last_upid;
7078 so->e_pid = so->last_pid;
7079 uuid_copy(so->e_uuid, euuid);
7080 }
7081
7082 done:
7083 if (error == 0 && net_io_policy_log) {
7084 uuid_unparse(so->e_uuid, buf);
7085 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7086 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7087 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7088 SOCK_TYPE(so), so->e_pid, buf,
7089 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7090 } else if (error != 0 && net_io_policy_log) {
7091 uuid_unparse(euuid, buf);
7092 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7093 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7094 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7095 SOCK_TYPE(so), buf, error);
7096 }
7097
7098 /* Update this socket's policy upon success */
7099 if (error == 0) {
7100 so->so_policy_gencnt *= -1;
7101 so_update_policy(so);
7102 #if NECP
7103 so_update_necp_policy(so, NULL, NULL);
7104 #endif /* NECP */
7105 }
7106
7107 return (error);
7108 }
7109
7110 void
7111 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7112 uint32_t ev_datalen)
7113 {
7114 struct kev_msg ev_msg;
7115
7116 /*
7117 * A netpolicy event always starts with a netpolicy_event_data
7118 * structure, but the caller can provide for a longer event
7119 * structure to post, depending on the event code.
7120 */
7121 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7122
7123 bzero(&ev_msg, sizeof (ev_msg));
7124 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7125 ev_msg.kev_class = KEV_NETWORK_CLASS;
7126 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7127 ev_msg.event_code = ev_code;
7128
7129 ev_msg.dv[0].data_ptr = ev_data;
7130 ev_msg.dv[0].data_length = ev_datalen;
7131
7132 kev_post_msg(&ev_msg);
7133 }
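/*
 * Shape sketch for callers of netpolicy_post_msg(); the event structure
 * below is hypothetical and only illustrates the layout the VERIFY()
 * above checks for:
 *
 *	struct my_netpolicy_event {			// hypothetical
 *		struct netpolicy_event_data ev_data;	// must come first
 *		uint32_t extra_field;
 *	} ev;
 *	bzero(&ev, sizeof (ev));
 *	... fill in ev ...
 *	netpolicy_post_msg(ev_code, &ev.ev_data, sizeof (ev));
 *
 * i.e. the caller passes a pointer to the leading netpolicy_event_data
 * and the length of the full enclosing structure.
 */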
7134
7135 void
7136 socket_post_kev_msg(uint32_t ev_code,
7137 struct kev_socket_event_data *ev_data,
7138 uint32_t ev_datalen)
7139 {
7140 struct kev_msg ev_msg;
7141
7142 bzero(&ev_msg, sizeof(ev_msg));
7143 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7144 ev_msg.kev_class = KEV_NETWORK_CLASS;
7145 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7146 ev_msg.event_code = ev_code;
7147
7148 ev_msg.dv[0].data_ptr = ev_data;
7149 ev_msg.dv[0].data_length = ev_datalen;
7150
7151 kev_post_msg(&ev_msg);
7152 }
7153
7154 void
7155 socket_post_kev_msg_closed(struct socket *so)
7156 {
7157 struct kev_socket_closed ev;
7158 struct sockaddr *socksa = NULL, *peersa = NULL;
7159 int err;
7160 bzero(&ev, sizeof(ev));
7161 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7162 if (err == 0) {
7163 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7164 &peersa);
7165 if (err == 0) {
7166 memcpy(&ev.ev_data.kev_sockname, socksa,
7167 min(socksa->sa_len,
7168 sizeof (ev.ev_data.kev_sockname)));
7169 memcpy(&ev.ev_data.kev_peername, peersa,
7170 min(peersa->sa_len,
7171 sizeof (ev.ev_data.kev_peername)));
7172 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7173 &ev.ev_data, sizeof (ev));
7174 }
7175 }
7176 if (socksa != NULL)
7177 FREE(socksa, M_SONAME);
7178 if (peersa != NULL)
7179 FREE(peersa, M_SONAME);
7180 }