bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/ntstat.h>
 102 #include <net/content_filter.h>
 103 #include <netinet/in.h>
 104 #include <netinet/in_pcb.h>
 105 #include <netinet/in_tclass.h>
 106 #include <netinet/tcp_var.h>
 107 #include <netinet/ip6.h>
 108 #include <netinet6/ip6_var.h>
 109 #include <netinet/flow_divert.h>
 110 #include <kern/zalloc.h>
 111 #include <kern/locks.h>
 112 #include <machine/limits.h>
 113 #include <libkern/OSAtomic.h>
 114 #include <pexpert/pexpert.h>
 115 #include <kern/assert.h>
 116 #include <kern/task.h>
 117 #include <kern/policy_internal.h>
 118
 119 #include <sys/kpi_mbuf.h>
 120 #include <sys/mcache.h>
 121 #include <sys/unpcb.h>
 122
 123 #if CONFIG_MACF
 124 #include <security/mac.h>
 125 #include <security/mac_framework.h>
 126 #endif /* MAC */
 127
 128 #if MULTIPATH
 129 #include <netinet/mp_pcb.h>
 130 #include <netinet/mptcp_var.h>
 131 #endif /* MULTIPATH */
 132
 133 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 134
 135 #if DEBUG || DEVELOPMENT
 136 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 137 #else
 138 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 139 #endif
 140
 141 /* TODO: this should be in a header file somewhere */
 142 extern char *proc_name_address(void *p);
 143 extern char *proc_best_name(proc_t);
 144
 145 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 146 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 147 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 148 static u_int32_t        cached_sock_count = 0;
 149 STAILQ_HEAD(, socket)   so_cache_head;
 150 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 151 static u_int32_t        so_cache_time;
 152 static int              socketinit_done;
 153 static struct zone      *so_cache_zone;
 154
 155 static lck_grp_t        *so_cache_mtx_grp;
 156 static lck_attr_t       *so_cache_mtx_attr;
 157 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 158 static lck_mtx_t        *so_cache_mtx;
 159
 160 #include <machine/limits.h>
 161
 162 static int      filt_sorattach(struct knote *kn);
 163 static void     filt_sordetach(struct knote *kn);
 164 static int      filt_soread(struct knote *kn, long hint);
 165 static int      filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
 166 static int      filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 167
 168 static int      filt_sowattach(struct knote *kn);
 169 static void     filt_sowdetach(struct knote *kn);
 170 static int      filt_sowrite(struct knote *kn, long hint);
 171 static int      filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
 172 static int      filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 173
 174 static int      filt_sockattach(struct knote *kn);
 175 static void     filt_sockdetach(struct knote *kn);
 176 static int      filt_sockev(struct knote *kn, long hint);
 177 static int      filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
 178 static int      filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 179
 180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 182
 183 struct filterops soread_filtops = {
 184         .f_isfd = 1,
 185         .f_attach = filt_sorattach,
 186         .f_detach = filt_sordetach,
 187         .f_event = filt_soread,
 188         .f_touch = filt_sortouch,
 189         .f_process = filt_sorprocess,
 190 };
 191
 192 struct filterops sowrite_filtops = {
 193         .f_isfd = 1,
 194         .f_attach = filt_sowattach,
 195         .f_detach = filt_sowdetach,
 196         .f_event = filt_sowrite,
 197         .f_touch = filt_sowtouch,
 198         .f_process = filt_sowprocess,
 199 };
 200
 201 struct filterops sock_filtops = {
 202         .f_isfd = 1,
 203         .f_attach = filt_sockattach,
 204         .f_detach = filt_sockdetach,
 205         .f_event = filt_sockev,
 206         .f_touch = filt_socktouch,
 207         .f_process = filt_sockprocess,
 208 };
 209
 210 struct filterops soexcept_filtops = {
 211         .f_isfd = 1,
 212         .f_attach = filt_sorattach,
 213         .f_detach = filt_sordetach,
 214         .f_event = filt_soread,
 215         .f_touch = filt_sortouch,
 216         .f_process = filt_sorprocess,
 217 };
 218
 219 SYSCTL_DECL(_kern_ipc);
 220
 221 #define EVEN_MORE_LOCKING_DEBUG 0
 222
 223 int socket_debug = 0;
 224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 225         CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 226
 227 static unsigned long sodefunct_calls = 0;
 228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 229     &sodefunct_calls, "");
 230
 231 static int socket_zone = M_SOCKET;
 232 so_gen_t        so_gencnt;      /* generation count for sockets */
 233
 234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 236
 237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 246
 247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 248
 249 int somaxconn = SOMAXCONN;
 250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 251         CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 252
 253 /* Should we get a maximum also ??? */
 254 static int sosendmaxchain = 65536;
 255 static int sosendminchain = 16384;
 256 static int sorecvmincopy  = 16384;
 257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 258         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 260         CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 261
 262 /*
 263  * Set to enable jumbo clusters (if available) for large writes when
 264  * the socket is marked with SOF_MULTIPAGES; see below.
 265  */
 266 int sosendjcl = 1;
 267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 268         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 269
 270 /*
 271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 272  * writes on the socket for all protocols on any network interfaces,
 273  * depending upon sosendjcl above.  Be extra careful when setting this
 274  * to 1, because sending down packets that cross physical pages down to
 275  * broken drivers (those that falsely assume that the physical pages
 276  * are contiguous) might lead to system panics or silent data corruption.
 277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 279  * capable.  Set this to 1 only for testing/debugging purposes.
 280  */
 281 int sosendjcl_ignore_capab = 0;
 282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 283         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 284
 285 /*
 286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 287  * writes on the socket for all protocols on any network interfaces.
 288  * Be extra careful when setting this to 1, because sending down packets with
  289  * clusters larger than 2 KB might lead to system panics or data corruption.
 290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
  291  * on the outgoing interface.
  292  * Set this to 1 for testing/debugging purposes only.
 293  */
 294 int sosendbigcl_ignore_capab = 0;
 295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 296         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 297
 298 int sodefunctlog = 0;
 299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 300         &sodefunctlog, 0, "");
 301
 302 int sothrottlelog = 0;
 303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 304         &sothrottlelog, 0, "");
 305
 306 int sorestrictrecv = 1;
 307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 308         &sorestrictrecv, 0, "Enable inbound interface restrictions");
 309
 310 int sorestrictsend = 1;
 311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 312         &sorestrictsend, 0, "Enable outbound interface restrictions");
 313
 314 int soreserveheadroom = 1;
 315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 316         &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 317
 318 #if (DEBUG || DEVELOPMENT)
 319 int so_notsent_lowat_check = 1;
 320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
 321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 322 #endif /* DEBUG || DEVELOPMENT */
 323
 324 extern struct inpcbinfo tcbinfo;
 325
 326 /* TODO: these should be in header file */
 327 extern int get_inpcb_str_size(void);
 328 extern int get_tcp_str_size(void);
 329
 330 static unsigned int sl_zone_size;               /* size of sockaddr_list */
 331 static struct zone *sl_zone;                    /* zone for sockaddr_list */
 332
 333 static unsigned int se_zone_size;               /* size of sockaddr_entry */
 334 static struct zone *se_zone;                    /* zone for sockaddr_entry */
 335
 336 vm_size_t       so_cache_zone_element_size;
 337
 338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 339     user_ssize_t *);
 340 static void cached_sock_alloc(struct socket **, int);
 341 static void cached_sock_free(struct socket *);
 342
 343 /*
 344  * Maximum of extended background idle sockets per process
 345  * Set to zero to disable further setting of the option
 346  */
 347
 348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 349 #define SO_IDLE_BK_IDLE_TIME            600
 350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 351
 352 struct soextbkidlestat soextbkidlestat;
 353
 354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 355         CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 356         "Maximum of extended background idle sockets per process");
 357
 358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 359         &soextbkidlestat.so_xbkidle_time, 0,
 360         "Time in seconds to keep extended background idle sockets");
 361
 362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 363         &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 364         "High water mark for extended background idle sockets");
 365
 366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 367         &soextbkidlestat, soextbkidlestat, "");
 368
 369 int so_set_extended_bk_idle(struct socket *, int);
 370
 371 /*
 372  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 373  * setting the DSCP code on the packet based on the service class; see
 374  * <rdar://problem/11277343> for details.
 375  */
 376 __private_extern__ u_int32_t sotcdb = 0;
 377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 378         &sotcdb, 0, "");
 379
 380 void
 381 socketinit(void)
 382 {
 383         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 384         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 385
 386 #ifdef __LP64__
 387         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 388         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 389         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 393 #else
 394         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 395         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 396         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 400 #endif
 401
 402         if (socketinit_done) {
 403                 printf("socketinit: already called...\n");
 404                 return;
 405         }
 406         socketinit_done = 1;
 407
 408         PE_parse_boot_argn("socket_debug", &socket_debug,
 409             sizeof (socket_debug));
 410
 411         /*
 412          * allocate lock group attribute and group for socket cache mutex
 413          */
 414         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 415         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 416             so_cache_mtx_grp_attr);
 417
 418         /*
 419          * allocate the lock attribute for socket cache mutex
 420          */
 421         so_cache_mtx_attr = lck_attr_alloc_init();
 422
 423         /* cached sockets mutex */
 424         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 425         if (so_cache_mtx == NULL) {
 426                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 427                 /* NOTREACHED */
 428         }
 429         STAILQ_INIT(&so_cache_head);
 430
 431         so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
 432             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 433
 434         so_cache_zone = zinit(so_cache_zone_element_size,
 435             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 436         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 437         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 438
 439         sl_zone_size = sizeof (struct sockaddr_list);
 440         if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
 441             "sockaddr_list")) == NULL) {
 442                 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
 443                 /* NOTREACHED */
 444         }
 445         zone_change(sl_zone, Z_CALLERACCT, FALSE);
 446         zone_change(sl_zone, Z_EXPAND, TRUE);
 447
 448         se_zone_size = sizeof (struct sockaddr_entry);
 449         if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
 450             "sockaddr_entry")) == NULL) {
 451                 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
 452                 /* NOTREACHED */
 453         }
 454         zone_change(se_zone, Z_CALLERACCT, FALSE);
 455         zone_change(se_zone, Z_EXPAND, TRUE);
 456
 457         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 458         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 459         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 460         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 461
 462         in_pcbinit();
 463         sflt_init();
 464         socket_tclass_init();
 465 #if MULTIPATH
 466         mp_pcbinit();
 467 #endif /* MULTIPATH */
 468 }
 469
 470 static void
 471 cached_sock_alloc(struct socket **so, int waitok)
 472 {
 473         caddr_t temp;
 474         uintptr_t offset;
 475
 476         lck_mtx_lock(so_cache_mtx);
 477
 478         if (!STAILQ_EMPTY(&so_cache_head)) {
 479                 VERIFY(cached_sock_count > 0);
 480
 481                 *so = STAILQ_FIRST(&so_cache_head);
 482                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 483                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 484
 485                 cached_sock_count--;
 486                 lck_mtx_unlock(so_cache_mtx);
 487
 488                 temp = (*so)->so_saved_pcb;
 489                 bzero((caddr_t)*so, sizeof (struct socket));
 490
 491                 (*so)->so_saved_pcb = temp;
 492         } else {
 493
 494                 lck_mtx_unlock(so_cache_mtx);
 495
 496                 if (waitok)
 497                         *so = (struct socket *)zalloc(so_cache_zone);
 498                 else
 499                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 500
 501                 if (*so == NULL)
 502                         return;
 503
 504                 bzero((caddr_t)*so, sizeof (struct socket));
 505
 506                 /*
 507                  * Define offsets for extra structures into our
 508                  * single block of memory. Align extra structures
 509                  * on longword boundaries.
 510                  */
 511
 512                 offset = (uintptr_t)*so;
 513                 offset += sizeof (struct socket);
 514
 515                 offset = ALIGN(offset);
 516
 517                 (*so)->so_saved_pcb = (caddr_t)offset;
 518                 offset += get_inpcb_str_size();
 519
 520                 offset = ALIGN(offset);
 521
 522                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 523                     (caddr_t)offset;
 524         }
 525
 526         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 527 }
 528
 529 static void
 530 cached_sock_free(struct socket *so)
 531 {
 532
 533         lck_mtx_lock(so_cache_mtx);
 534
 535         so_cache_time = net_uptime();
 536         if (++cached_sock_count > max_cached_sock_count) {
 537                 --cached_sock_count;
 538                 lck_mtx_unlock(so_cache_mtx);
 539                 zfree(so_cache_zone, so);
 540         } else {
 541                 if (so_cache_hw < cached_sock_count)
 542                         so_cache_hw = cached_sock_count;
 543
 544                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 545
 546                 so->cache_timestamp = so_cache_time;
 547                 lck_mtx_unlock(so_cache_mtx);
 548         }
 549 }
 550
 551 void
 552 so_update_last_owner_locked(struct socket *so, proc_t self)
 553 {
 554         if (so->last_pid != 0) {
 555                 /*
 556                  * last_pid and last_upid should remain zero for sockets
 557                  * created using sock_socket. The check above achieves that
 558                  */
 559                 if (self == PROC_NULL)
 560                         self = current_proc();
 561
 562                 if (so->last_upid != proc_uniqueid(self) ||
 563                     so->last_pid != proc_pid(self)) {
 564                         so->last_upid = proc_uniqueid(self);
 565                         so->last_pid = proc_pid(self);
 566                         proc_getexecutableuuid(self, so->last_uuid,
 567                             sizeof (so->last_uuid));
 568                 }
 569                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 570         }
 571 }
 572
 573 void
 574 so_update_policy(struct socket *so)
 575 {
 576         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 577                 (void) inp_update_policy(sotoinpcb(so));
 578 }
 579
 580 #if NECP
 581 static void
 582 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 583     struct sockaddr *override_remote_addr)
 584 {
 585         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 586                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 587                     override_remote_addr, 0);
 588 }
 589 #endif /* NECP */
 590
 591 boolean_t
 592 so_cache_timer(void)
 593 {
 594         struct socket   *p;
 595         int             n_freed = 0;
 596         boolean_t rc = FALSE;
 597
 598         lck_mtx_lock(so_cache_mtx);
 599         so_cache_timeouts++;
 600         so_cache_time = net_uptime();
 601
 602         while (!STAILQ_EMPTY(&so_cache_head)) {
 603                 VERIFY(cached_sock_count > 0);
 604                 p = STAILQ_FIRST(&so_cache_head);
 605                 if ((so_cache_time - p->cache_timestamp) <
 606                         SO_CACHE_TIME_LIMIT)
 607                         break;
 608
 609                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 610                 --cached_sock_count;
 611
 612                 zfree(so_cache_zone, p);
 613
 614                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 615                         so_cache_max_freed++;
 616                         break;
 617                 }
 618         }
 619
 620         /* Schedule again if there is more to cleanup */
 621         if (!STAILQ_EMPTY(&so_cache_head))
 622                 rc = TRUE;
 623
 624         lck_mtx_unlock(so_cache_mtx);
 625         return (rc);
 626 }
 627
 628 /*
 629  * Get a socket structure from our zone, and initialize it.
 630  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 631  * Note that it would probably be better to allocate socket
 632  * and PCB at the same time, but I'm not convinced that all
 633  * the protocols can be easily modified to do this.
 634  */
 635 struct socket *
 636 soalloc(int waitok, int dom, int type)
 637 {
 638         struct socket *so;
 639
 640         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 641                 cached_sock_alloc(&so, waitok);
 642         } else {
 643                 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
 644                     M_WAITOK);
 645                 if (so != NULL)
 646                         bzero(so, sizeof (*so));
 647         }
 648         if (so != NULL) {
 649                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 650                 so->so_zone = socket_zone;
 651 #if CONFIG_MACF_SOCKET
 652                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 653                 if (mac_socket_label_init(so, !waitok) != 0) {
 654                         sodealloc(so);
 655                         return (NULL);
 656                 }
 657 #endif /* MAC_SOCKET */
 658         }
 659
 660         return (so);
 661 }
 662
 663 int
 664 socreate_internal(int dom, struct socket **aso, int type, int proto,
 665     struct proc *p, uint32_t flags, struct proc *ep)
 666 {
 667         struct protosw *prp;
 668         struct socket *so;
 669         int error = 0;
 670
 671 #if TCPDEBUG
 672         extern int tcpconsdebug;
 673 #endif
 674
 675         VERIFY(aso != NULL);
 676         *aso = NULL;
 677
 678         if (proto != 0)
 679                 prp = pffindproto(dom, proto, type);
 680         else
 681                 prp = pffindtype(dom, type);
 682
 683         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 684                 if (pffinddomain(dom) == NULL)
 685                         return (EAFNOSUPPORT);
 686                 if (proto != 0) {
 687                         if (pffindprotonotype(dom, proto) != NULL)
 688                                 return (EPROTOTYPE);
 689                 }
 690                 return (EPROTONOSUPPORT);
 691         }
 692         if (prp->pr_type != type)
 693                 return (EPROTOTYPE);
 694         so = soalloc(1, dom, type);
 695         if (so == NULL)
 696                 return (ENOBUFS);
 697
 698         if (flags & SOCF_ASYNC)
 699                 so->so_state |= SS_NBIO;
 700 #if MULTIPATH
 701         if (flags & SOCF_MP_SUBFLOW) {
 702                 /*
 703                  * A multipath subflow socket is used internally in the kernel,
 704                  * therefore it does not have a file desciptor associated by
 705                  * default.
 706                  */
 707                 so->so_state |= SS_NOFDREF;
 708                 so->so_flags |= SOF_MP_SUBFLOW;
 709         }
 710 #endif /* MULTIPATH */
 711
 712         TAILQ_INIT(&so->so_incomp);
 713         TAILQ_INIT(&so->so_comp);
 714         so->so_type = type;
 715         so->last_upid = proc_uniqueid(p);
 716         so->last_pid = proc_pid(p);
 717         proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
 718         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 719
 720         if (ep != PROC_NULL && ep != p) {
 721                 so->e_upid = proc_uniqueid(ep);
 722                 so->e_pid = proc_pid(ep);
 723                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
 724                 so->so_flags |= SOF_DELEGATED;
 725         }
 726
 727         so->so_cred = kauth_cred_proc_ref(p);
 728         if (!suser(kauth_cred_get(), NULL))
 729                 so->so_state |= SS_PRIV;
 730
 731         so->so_proto = prp;
 732         so->so_rcv.sb_flags |= SB_RECV;
 733         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 734         so->next_lock_lr = 0;
 735         so->next_unlock_lr = 0;
 736
 737 #if CONFIG_MACF_SOCKET
 738         mac_socket_label_associate(kauth_cred_get(), so);
 739 #endif /* MAC_SOCKET */
 740
 741         /*
 742          * Attachment will create the per pcb lock if necessary and
 743          * increase refcount for creation, make sure it's done before
 744          * socket is inserted in lists.
 745          */
 746         so->so_usecount++;
 747
 748         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 749         if (error != 0) {
 750                 /*
 751                  * Warning:
 752                  * If so_pcb is not zero, the socket will be leaked,
 753                  * so protocol attachment handler must be coded carefuly
 754                  */
 755                 so->so_state |= SS_NOFDREF;
 756                 so->so_usecount--;
 757                 sofreelastref(so, 1);   /* will deallocate the socket */
 758                 return (error);
 759         }
 760
 761         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 762         TAILQ_INIT(&so->so_evlist);
 763
 764         /* Attach socket filters for this protocol */
 765         sflt_initsock(so);
 766 #if TCPDEBUG
 767         if (tcpconsdebug == 2)
 768                 so->so_options |= SO_DEBUG;
 769 #endif
 770         so_set_default_traffic_class(so);
 771
 772         /*
 773          * If this thread or task is marked to create backgrounded sockets,
 774          * mark the socket as background.
 775          */
 776         if (proc_get_effective_thread_policy(current_thread(),
 777             TASK_POLICY_NEW_SOCKETS_BG)) {
 778                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 779                 so->so_background_thread = current_thread();
 780         }
 781
 782         switch (dom) {
 783         /*
 784          * Don't mark Unix domain, system or multipath sockets as
 785          * eligible for defunct by default.
 786          */
 787         case PF_LOCAL:
 788         case PF_SYSTEM:
 789         case PF_MULTIPATH:
 790                 so->so_flags |= SOF_NODEFUNCT;
 791                 break;
 792         default:
 793                 break;
 794         }
 795
 796         /*
 797          * Entitlements can't be checked at socket creation time except if the
 798          * application requested a feature guarded by a privilege (c.f., socket
 799          * delegation).
 800          * The priv(9) and the Sandboxing APIs are designed with the idea that
 801          * a privilege check should only be triggered by a userland request.
 802          * A privilege check at socket creation time is time consuming and
 803          * could trigger many authorisation error messages from the security
 804          * APIs.
 805          */
 806
 807         *aso = so;
 808
 809         return (0);
 810 }
 811
 812 /*
      * Create a socket on behalf of the calling process.
      *
      * Thin wrapper around socreate_internal(): the creating process is
      * current_proc(), the sixth argument is 0 (presumably "no flags" --
      * confirm against socreate_internal()), and the final argument, which
      * socreate_delegate() uses to pass the delegate process, is PROC_NULL.
      *
      * dom, type and proto are forwarded unchanged; aso is the out-parameter
      * through which socreate_internal() hands back the new socket.
      *
 813  * Returns:     0                       Success
 814  *              EAFNOSUPPORT
 815  *              EPROTOTYPE
 816  *              EPROTONOSUPPORT
 817  *              ENOBUFS
 818  *      <pru_attach>:ENOBUFS[AF_UNIX]
 819  *      <pru_attach>:ENOBUFS[TCP]
 820  *      <pru_attach>:ENOMEM[TCP]
 821  *      <pru_attach>:???                [other protocol families, IPSEC]
 822  */
 823 int
 824 socreate(int dom, struct socket **aso, int type, int proto)
 825 {
 826         return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
 827             PROC_NULL));
 828 }
 829
     /*
      * Create a socket on behalf of the process identified by epid.
      *
      * When epid is the caller's own pid no lookup is done and PROC_NULL is
      * passed to socreate_internal() as the delegate.  Otherwise the delegate
      * is looked up with proc_find(); the reference it returns is held across
      * the socreate_internal() call and dropped with proc_rele() on the way
      * out.
      *
      * Returns:     0                       Success
      *              ESRCH                   No process found for epid
      *      <socreate_internal>:???         Any error from socreate_internal()
      */
 830 int
 831 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 832 {
 833         int error = 0;
 834         struct proc *ep = PROC_NULL;
 835
             /* proc_find() is only attempted when epid is not our own pid */
 836         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 837                 error = ESRCH;
 838                 goto done;
 839         }
 840
 841         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 842
 843         /*
 844          * It might not be wise to hold the proc reference when calling
 845          * socreate_internal since it calls soalloc with M_WAITOK
 846          */
 847 done:
             /* ep stays PROC_NULL on the self-pid path, so nothing to release */
 848         if (ep != PROC_NULL)
 849                 proc_rele(ep);
 850
 851         return (error);
 852 }
 853
 854 /*
      * Bind a socket to the address nam.
      *
      * The socket lock is taken and released here only when dolock is
      * non-zero (otherwise the caller presumably already holds it -- confirm
      * at call sites).  so_usecount must be greater than 1; this is enforced
      * with VERIFY.  Defunct sockets are rejected with EINVAL.  The socket
      * filters (sflt_bind) run first and the protocol's pru_bind is invoked
      * only if they return 0.  An EJUSTRETURN result is reported to the
      * caller as success.
      *
 855  * Returns:     0                       Success
 856  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 857  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 858  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 859  *      <pru_bind>:EINVAL               Invalid argument
 860  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 861  *      <pru_bind>:EACCES               Permission denied
 862  *      <pru_bind>:EADDRINUSE           Address in use
 863  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 864  *      <pru_bind>:EPERM                Operation not permitted
 865  *      <pru_bind>:???
 866  *      <sf_bind>:???
 867  *
 868  * Notes:       It's not possible to fully enumerate the return codes above,
 869  *              since socket filter authors and protocol family authors may
 870  *              not choose to limit their error returns to those listed, even
 871  *              though this may result in some software operating incorrectly.
 872  *
 873  *              The error codes which are enumerated above are those known to
 874  *              be returned by the tcp_usr_bind function supplied.
 875  */
 876 int
 877 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 878 {
 879         struct proc *p = current_proc();
 880         int error = 0;
 881
 882         if (dolock)
 883                 socket_lock(so, 1);
 884         VERIFY(so->so_usecount > 1);
 885
 886         so_update_last_owner_locked(so, p);
 887         so_update_policy(so);
 888
 889 #if NECP
 890         so_update_necp_policy(so, nam, NULL);
 891 #endif /* NECP */
 892
 893         /*
 894          * If this is a bind request on a socket that has been marked
 895          * as inactive, reject it now before we go any further.
 896          */
 897         if (so->so_flags & SOF_DEFUNCT) {
 898                 error = EINVAL;
 899                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 900                     __func__, proc_pid(p), proc_best_name(p),
 901                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 902                     SOCK_DOM(so), SOCK_TYPE(so), error);
 903                 goto out;
 904         }
 905
 906         /* Socket filter */
 907         error = sflt_bind(so, nam);
 908
             /* The protocol is only consulted when the filters return 0 */
 909         if (error == 0)
 910                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 911 out:
 912         if (dolock)
 913                 socket_unlock(so, 1);
 914
             /* EJUSTRETURN is not an error as far as the caller is concerned */
 915         if (error == EJUSTRETURN)
 916                 error = 0;
 917
 918         return (error);
 919 }
 920
     /*
      * Release the resources attached to a socket and free the socket itself.
      *
      * In order: drops the credential reference, terminates the socket
      * filters, detaches the content filter (CONTENT_FILTER builds), frees
      * so_msg_state when SOF_ENABLE_MSGS is set, stamps the socket with a
      * value derived from the global so_gencnt, destroys the MAC label
      * (CONFIG_MACF_SOCKET builds) and finally returns the memory either
      * through cached_sock_free() or to the zone recorded in so_zone.
      *
      * The socket must not be referenced after this call.
      */
 921 void
 922 sodealloc(struct socket *so)
 923 {
 924         kauth_cred_unref(&so->so_cred);
 925
 926         /* Remove any filters */
 927         sflt_termsock(so);
 928
 929 #if CONTENT_FILTER
 930         cfil_sock_detach(so);
 931 #endif /* CONTENT_FILTER */
 932
 933         /* Delete the state allocated for msg queues on a socket */
 934         if (so->so_flags & SOF_ENABLE_MSGS) {
 935                 FREE(so->so_msg_state, M_TEMP);
 936                 so->so_msg_state = NULL;
 937         }
             /* Without SOF_ENABLE_MSGS no message state may be attached */
 938         VERIFY(so->so_msg_state == NULL);
 939
 940         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 941
 942 #if CONFIG_MACF_SOCKET
 943         mac_socket_label_destroy(so);
 944 #endif /* CONFIG_MACF_SOCKET */
 945
             /* The flag selects which of the two free routines is used */
 946         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 947                 cached_sock_free(so);
 948         } else {
 949                 FREE_ZONE(so, sizeof (*so), so->so_zone);
 950         }
 951 }
 952
 953 /*
      * Put a socket into the listening state and set its listen queue limit.
      *
      * Takes the socket lock for the duration of the call.  The request is
      * rejected when the socket has no protocol, the protocol is not
      * connection-oriented (PR_CONNREQUIRED), the socket is connected,
      * connecting, disconnecting or defunct, or inbound traffic is denied by
      * so_restrictions.  The socket filters (sflt_listen) run before the
      * protocol's pru_listen.
      *
      * If either of them returns EJUSTRETURN the function returns 0 without
      * touching SO_ACCEPTCONN or so_qlimit.
      *
      * A backlog that is <= 0 or greater than somaxconn is replaced by
      * somaxconn.
      *
 954  * Returns:     0                       Success
 955  *              EINVAL
 956  *              EOPNOTSUPP
      *              EPERM                   SO_RESTRICT_DENY_IN is set
 957  *      <pru_listen>:EINVAL[AF_UNIX]
 958  *      <pru_listen>:EINVAL[TCP]
 959  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 960  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 961  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 962  *      <pru_listen>:EACCES[TCP]        Permission denied
 963  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 964  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 965  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 966  *      <sf_listen>:???
 967  *
 968  * Notes:       Other <pru_listen> returns depend on the protocol family; all
 969  *              <sf_listen> returns depend on what the filter author causes
 970  *              their filter to return.
 971  */
 972 int
 973 solisten(struct socket *so, int backlog)
 974 {
 975         struct proc *p = current_proc();
 976         int error = 0;
 977
 978         socket_lock(so, 1);
 979
 980         so_update_last_owner_locked(so, p);
 981         so_update_policy(so);
 982
 983 #if NECP
 984         so_update_necp_policy(so, NULL, NULL);
 985 #endif /* NECP */
 986
 987         if (so->so_proto == NULL) {
 988                 error = EINVAL;
 989                 goto out;
 990         }
             /* Only connection-oriented protocols can listen */
 991         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 992                 error = EOPNOTSUPP;
 993                 goto out;
 994         }
 995
 996         /*
 997          * If the listen request is made on a socket that is not fully
 998          * disconnected, or on a socket that has been marked as inactive,
 999          * reject the request now.
1000          */
1001         if ((so->so_state &
1002             (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1003             (so->so_flags & SOF_DEFUNCT)) {
1004                 error = EINVAL;
1005                 if (so->so_flags & SOF_DEFUNCT) {
1006                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1007                             "(%d)\n", __func__, proc_pid(p),
1008                             proc_best_name(p),
1009                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1010                             SOCK_DOM(so), SOCK_TYPE(so), error);
1011                 }
1012                 goto out;
1013         }
1014
1015         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1016                 error = EPERM;
1017                 goto out;
1018         }
1019
             /* Filters first; the protocol is called only if they return 0 */
1020         error = sflt_listen(so);
1021         if (error == 0)
1022                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1023
             /*
              * Any non-zero result, EJUSTRETURN included, skips the
              * SO_ACCEPTCONN/so_qlimit update below; EJUSTRETURN is merely
              * reported to the caller as success.
              */
1024         if (error) {
1025                 if (error == EJUSTRETURN)
1026                         error = 0;
1027                 goto out;
1028         }
1029
             /* SO_ACCEPTCONN is only set while the completed queue is empty */
1030         if (TAILQ_EMPTY(&so->so_comp))
1031                 so->so_options |= SO_ACCEPTCONN;
1032         /*
1033          * POSIX: The implementation may have an upper limit on the length of
1034          * the listen queue-either global or per accepting socket. If backlog
1035          * exceeds this limit, the length of the listen queue is set to the
1036          * limit.
1037          *
1038          * If listen() is called with a backlog argument value that is less
1039          * than 0, the function behaves as if it had been called with a backlog
1040          * argument value of 0.
1041          *
1042          * A backlog argument of 0 may allow the socket to accept connections,
1043          * in which case the length of the listen queue may be set to an
1044          * implementation-defined minimum value.
              *
              * NOTE(review): in this implementation a backlog of 0 or less
              * selects somaxconn, i.e. the same value used for an oversized
              * backlog, rather than a separate minimum.
1045          */
1046         if (backlog <= 0 || backlog > somaxconn)
1047                 backlog = somaxconn;
1048
1049         so->so_qlimit = backlog;
1050 out:
1051         socket_unlock(so, 1);
1052         return (error);
1053 }
1054
     /*
      * Tear down a socket once its last reference is gone.
      *
      * The socket is only torn down when both SOF_PCBCLEARING and SS_NOFDREF
      * are set.  Otherwise just the select and upcall state is cleared and
      * the function returns.
      *
      * A socket that still sits on a listener's queues (so_head != NULL) is
      * unlinked from the incomplete queue.  If it is on the completed queue
      * it is left alone so that accept(2) can still pick it up.
      *
      * After that the send and receive buffers are flushed, upcalls are
      * disabled and, when dealloc is non-zero, the socket is handed to
      * sodealloc().
      *
      * The caller is expected to hold the socket lock.
      */
1055 void
1056 sofreelastref(struct socket *so, int dealloc)
1057 {
1058         struct socket *head = so->so_head;
1059
1060         /* Assume socket is locked */
1061
             /* Not ready to be freed yet: quiesce select/upcall state only */
1062         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1063                 selthreadclear(&so->so_snd.sb_sel);
1064                 selthreadclear(&so->so_rcv.sb_sel);
1065                 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1066                 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1067                 so->so_event = sonullevent;
1068                 return;
1069         }
             /* Still queued on a listening socket: unlink under the head's lock */
1070         if (head != NULL) {
1071                 socket_lock(head, 1);
1072                 if (so->so_state & SS_INCOMP) {
1073                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1074                         head->so_incqlen--;
1075                 } else if (so->so_state & SS_COMP) {
1076                         /*
1077                          * We must not decommission a socket that's
1078                          * on the accept(2) queue.  If we do, then
1079                          * accept(2) may hang after select(2) indicated
1080                          * that the listening socket was ready.
1081                          */
1082                         selthreadclear(&so->so_snd.sb_sel);
1083                         selthreadclear(&so->so_rcv.sb_sel);
1084                         so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1085                         so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1086                         so->so_event = sonullevent;
1087                         socket_unlock(head, 1);
1088                         return;
1089                 } else {
1090                         panic("sofree: not queued");
1091                 }
                     /* Only reached for the SS_INCOMP case above */
1092                 head->so_qlen--;
1093                 so->so_state &= ~SS_INCOMP;
1094                 so->so_head = NULL;
1095                 socket_unlock(head, 1);
1096         }
1097         sowflush(so);
1098         sorflush(so);
1099
1100 #if FLOW_DIVERT
1101         if (so->so_flags & SOF_FLOW_DIVERT) {
1102                 flow_divert_detach(so);
1103         }
1104 #endif  /* FLOW_DIVERT */
1105
1106         /* 3932268: disable upcall */
1107         so->so_rcv.sb_flags &= ~SB_UPCALL;
1108         so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1109         so->so_event = sonullevent;
1110
1111         if (dealloc)
1112                 sodealloc(so);
1113 }
1114
     /*
      * Block a closing socket until it is woken on so_upcallusecount.
      *
      * Asserts that the socket's mutex (the per-protocol one from pr_getlock
      * when available, the domain mutex otherwise) is owned on entry and
      * again after the sleep.  Nothing happens unless so_upcallusecount is
      * non-zero and SOF_UPCALLCLOSEWAIT is set.
      *
      * In that case further upcalls are disabled on both buffers,
      * SOF_CLOSEWAIT is set for the duration of the sleep, and the thread
      * sleeps without a timeout on &so->so_upcallusecount.
      *
      * NOTE(review): the wakeup side is not part of this function;
      * presumably it is issued when the last upcall finishes -- confirm.
      */
1115 void
1116 soclose_wait_locked(struct socket *so)
1117 {
1118         lck_mtx_t *mutex_held;
1119
1120         if (so->so_proto->pr_getlock != NULL)
1121                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1122         else
1123                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1124         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1125
1126         /*
1127          * Double check here and return if there's no outstanding upcall;
1128          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1129          */
1130         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1131                 return;
1132         so->so_rcv.sb_flags &= ~SB_UPCALL;
1133         so->so_snd.sb_flags &= ~SB_UPCALL;
1134         so->so_flags |= SOF_CLOSEWAIT;
             /* No timeout (NULL); the msleep() result is deliberately ignored */
1135         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1136             "soclose_wait_locked", NULL);
1137         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1138         so->so_flags &= ~SOF_CLOSEWAIT;
1139 }
1140
1141 /*
1142  * Close a socket on last file table reference removal.
1143  * Initiate disconnect if connected.
1144  * Free socket when disconnect complete.
      *
      * Steps, in order:
      *  - notify the socket filters, then wait for upcalls and content
      *    filters where applicable;
      *  - for a listening socket, clear SO_ACCEPTCONN and abort every
      *    connection still sitting on the incomplete and completed queues;
      *  - if a pcb is attached and the socket is connected, start the
      *    disconnect and, with SO_LINGER, sleep until it completes, the
      *    linger time expires or the sleep is interrupted;
      *  - detach the protocol, mark the socket SS_NOFDREF, drop the domain
      *    reference and this caller's use count, and call sofree().
      *
      * The socket is expected to be locked by the caller (see soclose()).
      * Panics if so_usecount is 0 on entry.
      *
      * Returns:     0                       Success
      *      <sodisconnectlocked>:???
      *      <msleep>:???                    Linger sleep ended early
      *                                      (a timeout is not an error)
      *      <pru_detach>:???                Only when no earlier error
1145  */
1146 int
1147 soclose_locked(struct socket *so)
1148 {
1149         int error = 0;
1150         lck_mtx_t *mutex_held;
1151         struct timespec ts;
1152
1153         if (so->so_usecount == 0) {
1154                 panic("soclose: so=%p refcount=0\n", so);
1155                 /* NOTREACHED */
1156         }
1157
1158         sflt_notify(so, sock_evt_closing, NULL);
1159
1160         if (so->so_upcallusecount)
1161                 soclose_wait_locked(so);
1162
1163 #if CONTENT_FILTER
1164         /*
1165          * We have to wait until the content filters are done
1166          */
1167         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1168                 cfil_sock_close_wait(so);
1169                 cfil_sock_is_closed(so);
1170                 cfil_sock_detach(so);
1171         }
1172 #endif /* CONTENT_FILTER */
1173
1174         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1175                 soresume(current_proc(), so, 1);
1176                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1177         }
1178
1179         if ((so->so_options & SO_ACCEPTCONN)) {
1180                 struct socket *sp, *sonext;
1181                 int socklock = 0;
1182
1183                 /*
1184                  * We do not want new connection to be added
1185                  * to the connection queues
1186                  */
1187                 so->so_options &= ~SO_ACCEPTCONN;
1188
                     /*
                      * Drain the incomplete queue.  The successor is read
                      * before sp is unlinked.
                      */
1189                 for (sp = TAILQ_FIRST(&so->so_incomp);
1190                     sp != NULL; sp = sonext) {
1191                         sonext = TAILQ_NEXT(sp, so_list);
1192
1193                         /*
1194                          * Radar 5350314
1195                          * skip sockets thrown away by tcpdropdropblreq
1196                          * they will get cleaned up by the garbage collection.
1197                          * otherwise, remove the incomp socket from the queue
1198                          * and let soabort trigger the appropriate cleanup.
1199                          */
1200                         if (sp->so_flags & SOF_OVERFLOW)
1201                                 continue;
1202
1203                         if (so->so_proto->pr_getlock != NULL) {
1204                                 /*
1205                                  * Lock ordering for consistency with the
1206                                  * rest of the stack, we lock the socket
1207                                  * first and then grab the head.
1208                                  */
1209                                 socket_unlock(so, 0);
1210                                 socket_lock(sp, 1);
1211                                 socket_lock(so, 0);
1212                                 socklock = 1;
1213                         }
1214
1215                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1216                         so->so_incqlen--;
1217
                             /*
                              * Re-check the state: the head lock was dropped
                              * above when per-socket locks are in use.
                              */
1218                         if (sp->so_state & SS_INCOMP) {
1219                                 sp->so_state &= ~SS_INCOMP;
1220                                 sp->so_head = NULL;
1221
1222                                 (void) soabort(sp);
1223                         }
1224
1225                         if (socklock)
1226                                 socket_unlock(sp, 1);
1227                 }
1228
                     /*
                      * Drain the completed queue.  Unlike the loop above, the
                      * head's lock is not held while sp is locked and aborted;
                      * it is retaken once sp has been unlocked.
                      */
1229                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1230                         /* Dequeue from so_comp since sofree() won't do it */
1231                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
1232                         so->so_qlen--;
1233
1234                         if (so->so_proto->pr_getlock != NULL) {
1235                                 socket_unlock(so, 0);
1236                                 socket_lock(sp, 1);
1237                         }
1238
1239                         if (sp->so_state & SS_COMP) {
1240                                 sp->so_state &= ~SS_COMP;
1241                                 sp->so_head = NULL;
1242
1243                                 (void) soabort(sp);
1244                         }
1245
1246                         if (so->so_proto->pr_getlock != NULL) {
1247                                 socket_unlock(sp, 1);
1248                                 socket_lock(so, 0);
1249                         }
1250                 }
1251         }
1252         if (so->so_pcb == NULL) {
1253                 /* 3915887: mark the socket as ready for dealloc */
1254                 so->so_flags |= SOF_PCBCLEARING;
1255                 goto discard;
1256         }
1257         if (so->so_state & SS_ISCONNECTED) {
1258                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1259                         error = sodisconnectlocked(so);
1260                         if (error)
1261                                 goto drop;
1262                 }
1263                 if (so->so_options & SO_LINGER) {
                             /* Non-blocking sockets never wait for the linger time */
1264                         if ((so->so_state & SS_ISDISCONNECTING) &&
1265                             (so->so_state & SS_NBIO))
1266                                 goto drop;
1267                         if (so->so_proto->pr_getlock != NULL)
1268                                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1269                         else
1270                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1271                         while (so->so_state & SS_ISCONNECTED) {
                                     /*
                                      * so_linger is split into whole seconds
                                      * (/100) and a remainder scaled by
                                      * NSEC_PER_USEC * 1000 * 10, i.e. it is
                                      * treated as hundredths of a second
                                      * (assuming NSEC_PER_USEC is 1000).
                                      */
1272                                 ts.tv_sec = (so->so_linger/100);
1273                                 ts.tv_nsec = (so->so_linger % 100) *
1274                                     NSEC_PER_USEC * 1000 * 10;
1275                                 error = msleep((caddr_t)&so->so_timeo,
1276                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1277                                 if (error) {
1278                                         /*
1279                                          * It's OK when the time fires,
1280                                          * don't report an error
1281                                          */
1282                                         if (error == EWOULDBLOCK)
1283                                                 error = 0;
1284                                         break;
1285                                 }
1286                         }
1287                 }
1288         }
1289 drop:
1290         if (so->so_usecount == 0) {
1291                 panic("soclose: usecount is zero so=%p\n", so);
1292                 /* NOTREACHED */
1293         }
1294         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1295                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
                     /* An earlier error takes precedence over the detach result */
1296                 if (error == 0)
1297                         error = error2;
1298         }
1299         if (so->so_usecount <= 0) {
1300                 panic("soclose: usecount is zero so=%p\n", so);
1301                 /* NOTREACHED */
1302         }
1303 discard:
             /*
              * With a pcb still attached, SS_NOFDREF already being set is
              * fatal, except for MPTCP subflows (SOF_MP_SUBFLOW).
              */
1304         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1305             (so->so_state & SS_NOFDREF)) {
1306                 panic("soclose: NOFDREF");
1307                 /* NOTREACHED */
1308         }
1309         so->so_state |= SS_NOFDREF;
1310
1311         if (so->so_flags & SOF_MP_SUBFLOW)
1312                 so->so_flags &= ~SOF_MP_SUBFLOW;
1313
1314         if ((so->so_flags & SOF_KNOTE) != 0)
1315                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1316
1317         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1318         evsofree(so);
1319
             /* Drop this caller's use count before handing the socket to sofree() */
1320         so->so_usecount--;
1321         sofree(so);
1322         return (error);
1323 }
1324
     /*
      * Close a socket, taking and releasing the socket lock around the work.
      *
      * When nothing in the kernel retains the socket (so_retaincnt == 0) the
      * real close is done by soclose_locked() and its result is returned.
      * Otherwise only one use count is dropped and 0 is returned; in that
      * case fewer than two remaining use counts is treated as fatal.
      */
1325 int
1326 soclose(struct socket *so)
1327 {
1328         int error = 0;
1329         socket_lock(so, 1);
1330
1331         if (so->so_retaincnt == 0) {
1332                 error = soclose_locked(so);
1333         } else {
1334                 /*
1335                  * if the FD is going away, but socket is
1336                  * retained in kernel remove its reference
1337                  */
1338                 so->so_usecount--;
1339                 if (so->so_usecount < 2)
1340                         panic("soclose: retaincnt non null and so=%p "
1341                             "usecount=%d\n", so, so->so_usecount);
1342         }
1343         socket_unlock(so, 1);
1344         return (error);
1345 }
1346
1347 /*
      * Abort a socket through its protocol's pru_abort handler.
      *
      * The abort is issued at most once per socket: SOF_ABORTED is set
      * before the handler is called and later calls return 0 without doing
      * anything.  If the handler fails, sofree() is called on the socket and
      * the handler's error is returned.
      *
1348  * Must be called at splnet...
      * NOTE(review): no spl call is visible in this function; the remark
      * above may be historical.
1349  */
1350 /* Should already be locked */
1351 int
1352 soabort(struct socket *so)
1353 {
1354         int error;
1355
1356 #ifdef MORE_LOCKING_DEBUG
1357         lck_mtx_t *mutex_held;
1358
1359         if (so->so_proto->pr_getlock != NULL)
1360                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1361         else
1362                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1363         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1364 #endif
1365
1366         if ((so->so_flags & SOF_ABORTED) == 0) {
1367                 so->so_flags |= SOF_ABORTED;
1368                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1369                 if (error) {
1370                         sofree(so);
1371                         return (error);
1372                 }
1373         }
1374         return (0);
1375 }
1376
     /*
      * Complete the accept of a socket.
      *
      * The socket must still be marked SS_NOFDREF; anything else is fatal.
      * The flag is cleared and the protocol's pru_accept is asked to fill in
      * *nam.  The socket lock is taken and released here only when dolock is
      * non-zero.
      *
      * Returns whatever pru_accept returns.
      */
1377 int
1378 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1379 {
1380         int error;
1381
1382         if (dolock)
1383                 socket_lock(so, 1);
1384
1385         so_update_last_owner_locked(so, PROC_NULL);
1386         so_update_policy(so);
1387 #if NECP
1388         so_update_necp_policy(so, NULL, NULL);
1389 #endif /* NECP */
1390
1391         if ((so->so_state & SS_NOFDREF) == 0)
1392                 panic("soaccept: !NOFDREF");
1393         so->so_state &= ~SS_NOFDREF;
1394         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1395
1396         if (dolock)
1397                 socket_unlock(so, 1);
1398         return (error);
1399 }
1400
     /*
      * Convenience wrapper: soacceptlock() with dolock set, so the socket
      * lock is taken and released inside the call.
      */
1401 int
1402 soaccept(struct socket *so, struct sockaddr **nam)
1403 {
1404         return (soacceptlock(so, nam, 1));
1405 }
1406
     /*
      * Run the accept socket filters on a newly accepted socket.
      *
      * The local and remote addresses are fetched under the socket lock and
      * passed, together with the listening socket (so_head, sampled before
      * the lock is taken), to sflt_accept().
      *
      * Returns:     0               Success, including the case where a
      *                              filter returned EJUSTRETURN and the
      *                              socket was marked defunct
      *              ECONNABORTED    An address could not be obtained; the
      *                              socket has been closed
      *      <sflt_accept>:???       Filter rejected the socket; the socket
      *                              has been closed
      *
      * On every error return the socket has been passed to soclose() and
      * must not be used by the caller.
      */
1407 int
1408 soacceptfilter(struct socket *so)
1409 {
1410         struct sockaddr *local = NULL, *remote = NULL;
1411         int error = 0;
1412         struct socket *head = so->so_head;
1413
1414         /*
1415          * Hold the lock even if this socket has not been made visible
1416          * to the filter(s).  For sockets with global locks, this protects
1417          * against the head or peer going away
1418          */
1419         socket_lock(so, 1);
1420         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1421             sogetaddr_locked(so, &local, 0) != 0) {
                     /*
                      * Clear SS_NOFDREF and the queue linkage, then drop the
                      * lock; soclose() takes the socket lock itself.
                      */
1422                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1423                 so->so_head = NULL;
1424                 socket_unlock(so, 1);
1425                 soclose(so);
1426                 /* Out of resources; try it again next time */
1427                 error = ECONNABORTED;
1428                 goto done;
1429         }
1430
1431         error = sflt_accept(head, so, local, remote);
1432
1433         /*
1434          * If we get EJUSTRETURN from one of the filters, mark this socket
1435          * as inactive and return it anyway.  This newly accepted socket
1436          * will be disconnected later before we hand it off to the caller.
1437          */
1438         if (error == EJUSTRETURN) {
1439                 error = 0;
1440                 (void) sosetdefunct(current_proc(), so,
1441                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1442         }
1443
1444         if (error != 0) {
1445                 /*
1446                  * This may seem like a duplication to the above error
1447                  * handling part when we return ECONNABORTED, except
1448                  * the following is done while holding the lock since
1449                  * the socket has been exposed to the filter(s) earlier.
1450                  */
1451                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1452                 so->so_head = NULL;
1453                 socket_unlock(so, 1);
1454                 soclose(so);
1455                 /* Propagate socket filter's error code to the caller */
1456         } else {
1457                 socket_unlock(so, 1);
1458         }
1459 done:
1460         /* Callee checks for NULL pointer */
1461         sock_freeaddr(remote);
1462         sock_freeaddr(local);
1463         return (error);
1464 }
1465
1466 /*
      * Connect a socket to the address nam.
      *
      * The socket lock is taken and released here only when dolock is
      * non-zero.  Listening and defunct sockets are rejected with
      * EOPNOTSUPP, and sockets whose outbound traffic is denied by
      * so_restrictions with EPERM.
      *
      * A socket that is already connected or connecting gets EISCONN when
      * the protocol is connection-oriented; for other protocols it is
      * disconnected first, and a failure to disconnect is also reported as
      * EISCONN.
      *
      * The connect-out socket filters run before the protocol's pru_connect.
      * A filter returning EJUSTRETURN makes the call succeed without
      * pru_connect being invoked.
      *
1467  * Returns:     0                       Success
1468  *              EOPNOTSUPP              Operation not supported on socket
1469  *              EISCONN                 Socket is connected
1470  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1471  *      <pru_connect>:EINVAL            Invalid argument
1472  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1473  *      <pru_connect>:EACCES            Permission denied
1474  *      <pru_connect>:EADDRINUSE        Address in use
1475  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1476  *      <pru_connect>:EPERM             Operation not permitted
1477  *      <sf_connect_out>:???            [anything a filter writer might set]
1478  */
1479 int
1480 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1481 {
1482         int error;
1483         struct proc *p = current_proc();
1484
1485         if (dolock)
1486                 socket_lock(so, 1);
1487
1488         so_update_last_owner_locked(so, p);
1489         so_update_policy(so);
1490
1491 #if NECP
1492         so_update_necp_policy(so, NULL, nam);
1493 #endif /* NECP */
1494
1495         /*
1496          * If this is a listening socket or if this is a previously-accepted
1497          * socket that has been marked as inactive, reject the connect request.
1498          */
1499         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1500                 error = EOPNOTSUPP;
1501                 if (so->so_flags & SOF_DEFUNCT) {
1502                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1503                             "(%d)\n", __func__, proc_pid(p),
1504                             proc_best_name(p),
1505                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1506                             SOCK_DOM(so), SOCK_TYPE(so), error);
1507                 }
1508                 if (dolock)
1509                         socket_unlock(so, 1);
1510                 return (error);
1511         }
1512
1513         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1514                 if (dolock)
1515                         socket_unlock(so, 1);
1516                 return (EPERM);
1517         }
1518
1519         /*
1520          * If protocol is connection-based, can only connect once.
1521          * Otherwise, if connected, try to disconnect first.
1522          * This allows user to disconnect by connecting to, e.g.,
1523          * a null address.
              *
              * Whatever sodisconnectlocked() returns on failure is replaced
              * by EISCONN below.
1524          */
1525         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1526             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1527             (error = sodisconnectlocked(so)))) {
1528                 error = EISCONN;
1529         } else {
1530                 /*
1531                  * Run connect filter before calling protocol:
1532                  *  - non-blocking connect returns before completion;
1533                  */
1534                 error = sflt_connectout(so, nam);
1535                 if (error != 0) {
1536                         if (error == EJUSTRETURN)
1537                                 error = 0;
1538                 } else {
1539                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1540                             (so, nam, p);
1541                 }
1542         }
1543         if (dolock)
1544                 socket_unlock(so, 1);
1545         return (error);
1546 }
1547
     /*
      * Convenience wrapper: soconnectlock() with dolock set, so the socket
      * lock is taken and released inside the call.
      */
1548 int
1549 soconnect(struct socket *so, struct sockaddr *nam)
1550 {
1551         return (soconnectlock(so, nam, 1));
1552 }
1553
1554 /*
      * Connect two sockets to each other through so1's pru_connect2 handler.
      *
      * so1 is always locked.  so2 is locked, after so1, only when its
      * protocol provides pr_lock.  The locks are released in the same order:
      * so1 first, then so2.
      *
      * NOTE(review): presumably sockets without pr_lock share the lock that
      * was already taken through so1 -- confirm.
      *
1555  * Returns:     0                       Success
1556  *      <pru_connect2>:EINVAL[AF_UNIX]
1557  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1558  *      <pru_connect2>:???              [other protocol families]
1559  *
1560  * Notes:       <pru_connect2> is not supported by [TCP].
1561  */
1562 int
1563 soconnect2(struct socket *so1, struct socket *so2)
1564 {
1565         int error;
1566
1567         socket_lock(so1, 1);
1568         if (so2->so_proto->pr_lock)
1569                 socket_lock(so2, 1);
1570
1571         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1572
1573         socket_unlock(so1, 1);
1574         if (so2->so_proto->pr_lock)
1575                 socket_unlock(so2, 1);
1576         return (error);
1577 }
1578
     /*
      * connectx(2)-style connect on a socket.
      *
      * No socket lock is taken or released here; as the name suggests, the
      * caller is expected to hold it.  Unlike soconnectlock(), no NECP
      * policy update is done in this function.
      *
      * Listening and defunct sockets are rejected with EOPNOTSUPP, and
      * sockets whose outbound traffic is denied by so_restrictions with
      * EPERM.  An already connected or connecting socket gets EISCONN unless
      * the protocol sets PR_MULTICONN or, for protocols that are not
      * connection-oriented, the existing association can be disconnected
      * first.
      *
      * The connectx-out socket filters run before the protocol.  Any
      * non-zero filter result clears SOF1_PRECONNECT_DATA and skips the
      * protocol's pru_connectx; EJUSTRETURN is then reported as success.
      * Otherwise all remaining arguments are forwarded unchanged to
      * pru_connectx.
      *
      * Returns:     0                       Success
      *              EOPNOTSUPP
      *              EPERM
      *              EISCONN
      *      <sflt_connectxout>:???
      *      <pru_connectx>:???
      */
1579 int
1580 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1581     struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1582     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1583     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1584 {
1585         int error;
1586
1587         so_update_last_owner_locked(so, p);
1588         so_update_policy(so);
1589
1590         /*
1591          * If this is a listening socket or if this is a previously-accepted
1592          * socket that has been marked as inactive, reject the connect request.
1593          */
1594         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1595                 error = EOPNOTSUPP;
1596                 if (so->so_flags & SOF_DEFUNCT) {
1597                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1598                             "(%d)\n", __func__, proc_pid(p),
1599                             proc_best_name(p),
1600                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1601                             SOCK_DOM(so), SOCK_TYPE(so), error);
1602                 }
1603                 return (error);
1604         }
1605
1606         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1607                 return (EPERM);
1608
1609         /*
1610          * If protocol is connection-based, can only connect once
1611          * unless PR_MULTICONN is set.  Otherwise, if connected,
1612          * try to disconnect first.  This allows user to disconnect
1613          * by connecting to, e.g., a null address.
              *
              * Whatever sodisconnectlocked() returns on failure is replaced
              * by EISCONN below.
1614          */
1615         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1616             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1617             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1618             (error = sodisconnectlocked(so)) != 0)) {
1619                 error = EISCONN;
1620         } else {
1621                 /*
1622                  * Run connect filter before calling protocol:
1623                  *  - non-blocking connect returns before completion;
1624                  */
1625                 error = sflt_connectxout(so, dst_sl);
1626                 if (error != 0) {
1627                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1628                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1629                         if (error == EJUSTRETURN)
1630                                 error = 0;
1631                 } else {
1632                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1633                             (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1634                             flags, arg, arglen, auio, bytes_written);
1635                 }
1636         }
1637
1638         return (error);
1639 }
1640
1641 int
1642 sodisconnectlocked(struct socket *so)
1643 {
1644         int error;
1645
1646         if ((so->so_state & SS_ISCONNECTED) == 0) {
1647                 error = ENOTCONN;
1648                 goto bad;
1649         }
1650         if (so->so_state & SS_ISDISCONNECTING) {
1651                 error = EALREADY;
1652                 goto bad;
1653         }
1654
1655         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1656         if (error == 0)
1657                 sflt_notify(so, sock_evt_disconnected, NULL);
1658
1659 bad:
1660         return (error);
1661 }
1662
/*
 * Locking version of sodisconnectlocked(): takes the socket lock, performs
 * the disconnect, releases the lock and returns the disconnect status.
 */
int
sodisconnect(struct socket *so)
{
        int result;

        socket_lock(so, 1);
        result = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return (result);
}
1674
1675 int
1676 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1677 {
1678         int error;
1679
1680         /*
1681          * Call the protocol disconnectx handler; let it handle all
1682          * matters related to the connection state of this session.
1683          */
1684         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1685         if (error == 0) {
1686                 /*
1687                  * The event applies only for the session, not for
1688                  * the disconnection of individual subflows.
1689                  */
1690                 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1691                         sflt_notify(so, sock_evt_disconnected, NULL);
1692         }
1693         return (error);
1694 }
1695
1696 int
1697 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1698 {
1699         int error;
1700
1701         socket_lock(so, 1);
1702         error = sodisconnectxlocked(so, aid, cid);
1703         socket_unlock(so, 1);
1704         return (error);
1705 }
1706
1707 int
1708 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1709 {
1710         return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1711 }
1712
/*
 * Map MSG_* flags to the wait argument given to sblock():
 * 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)
1714
1715 /*
1716  * sosendcheck will lock the socket buffer if it isn't locked and
1717  * verify that there is space for the data being inserted.
1718  *
1719  * Returns:     0                       Success
1720  *              EPIPE
1721  *      sblock:EWOULDBLOCK
1722  *      sblock:EINTR
1723  *      sbwait:EBADF
1724  *      sbwait:EINTR
1725  *      [so_error]:???
1726  */
1727 int
1728 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1729     int32_t clen, int32_t atomic, int flags, int *sblocked,
1730     struct mbuf *control)
1731 {
1732         int     error = 0;
1733         int32_t space;
1734         int     assumelock = 0;
1735
1736 restart:
1737         if (*sblocked == 0) {
1738                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1739                     so->so_send_filt_thread != 0 &&
1740                     so->so_send_filt_thread == current_thread()) {
1741                         /*
1742                          * We're being called recursively from a filter,
1743                          * allow this to continue. Radar 4150520.
1744                          * Don't set sblocked because we don't want
1745                          * to perform an unlock later.
1746                          */
1747                         assumelock = 1;
1748                 } else {
1749                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1750                         if (error) {
1751                                 if (so->so_flags & SOF_DEFUNCT)
1752                                         goto defunct;
1753                                 return (error);
1754                         }
1755                         *sblocked = 1;
1756                 }
1757         }
1758
1759         /*
1760          * If a send attempt is made on a socket that has been marked
1761          * as inactive (disconnected), reject the request.
1762          */
1763         if (so->so_flags & SOF_DEFUNCT) {
1764 defunct:
1765                 error = EPIPE;
1766                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1767                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1768                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1769                     SOCK_DOM(so), SOCK_TYPE(so), error);
1770                 return (error);
1771         }
1772
1773         if (so->so_state & SS_CANTSENDMORE) {
1774 #if CONTENT_FILTER
1775                 /*
1776                  * Can re-inject data of half closed connections
1777                  */
1778                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1779                         so->so_snd.sb_cfil_thread == current_thread() &&
1780                         cfil_sock_data_pending(&so->so_snd) != 0)
1781                         CFIL_LOG(LOG_INFO,
1782                                 "so %llx ignore SS_CANTSENDMORE",
1783                                 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1784                 else
1785 #endif /* CONTENT_FILTER */
1786                         return (EPIPE);
1787         }
1788         if (so->so_error) {
1789                 error = so->so_error;
1790                 so->so_error = 0;
1791                 return (error);
1792         }
1793
1794         if ((so->so_state & SS_ISCONNECTED) == 0) {
1795                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1796                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1797                             (resid != 0 || clen == 0) &&
1798                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1799 #if MPTCP
1800                                 /*
1801                                  * MPTCP Fast Join sends data before the
1802                                  * socket is truly connected.
1803                                  */
1804                                 if ((so->so_flags & (SOF_MP_SUBFLOW |
1805                                         SOF_MPTCP_FASTJOIN)) !=
1806                                     (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1807 #endif /* MPTCP */
1808                                 return (ENOTCONN);
1809                         }
1810                 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1811                         return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1812                             ENOTCONN : EDESTADDRREQ);
1813                 }
1814         }
1815
1816         if (so->so_flags & SOF_ENABLE_MSGS)
1817                 space = msgq_sbspace(so, control);
1818         else
1819                 space = sbspace(&so->so_snd);
1820
1821         if (flags & MSG_OOB)
1822                 space += 1024;
1823         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1824             clen > so->so_snd.sb_hiwat)
1825                 return (EMSGSIZE);
1826
1827         if ((space < resid + clen &&
1828             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1829             space < clen)) ||
1830             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1831                 /*
1832                  * don't block the connectx call when there's more data
1833                  * than can be copied.
1834                  */
1835                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1836                         if (space == 0) {
1837                                 return (EWOULDBLOCK);
1838                         }
1839                         if (space < (int32_t)so->so_snd.sb_lowat) {
1840                                 return (0);
1841                         }
1842                 }
1843                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1844                     assumelock) {
1845                         return (EWOULDBLOCK);
1846                 }
1847                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
1848                 *sblocked = 0;
1849                 error = sbwait(&so->so_snd);
1850                 if (error) {
1851                         if (so->so_flags & SOF_DEFUNCT)
1852                                 goto defunct;
1853                         return (error);
1854                 }
1855                 goto restart;
1856         }
1857         return (0);
1858 }
1859
1860 /*
1861  * Send on a socket.
1862  * If send must go all at once and message is larger than
1863  * send buffering, then hard error.
1864  * Lock against other senders.
1865  * If must go all at once and not enough room now, then
1866  * inform user that this would block and do nothing.
1867  * Otherwise, if nonblocking, send as much as possible.
1868  * The data to be sent is described by "uio" if nonzero,
1869  * otherwise by the mbuf chain "top" (which must be null
1870  * if uio is not).  Data provided in mbuf chain must be small
1871  * enough to send all at once.
1872  *
1873  * Returns nonzero on error, timeout or signal; callers
1874  * must check for short counts if EINTR/ERESTART are returned.
1875  * Data and control buffers are freed on return.
1876  * Experiment:
1877  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1878  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1879  *  point at the mbuf chain being constructed and go from there.
1880  *
1881  * Returns:     0                       Success
1882  *              EOPNOTSUPP
1883  *              EINVAL
1884  *              ENOBUFS
1885  *      uiomove:EFAULT
1886  *      sosendcheck:EPIPE
1887  *      sosendcheck:EWOULDBLOCK
1888  *      sosendcheck:EINTR
1889  *      sosendcheck:EBADF
1890  *      sosendcheck:EINTR
1891  *      sosendcheck:???                 [value from so_error]
1892  *      <pru_send>:ECONNRESET[TCP]
1893  *      <pru_send>:EINVAL[TCP]
1894  *      <pru_send>:ENOBUFS[TCP]
1895  *      <pru_send>:EADDRINUSE[TCP]
1896  *      <pru_send>:EADDRNOTAVAIL[TCP]
1897  *      <pru_send>:EAFNOSUPPORT[TCP]
1898  *      <pru_send>:EACCES[TCP]
1899  *      <pru_send>:EAGAIN[TCP]
1900  *      <pru_send>:EPERM[TCP]
1901  *      <pru_send>:EMSGSIZE[TCP]
1902  *      <pru_send>:EHOSTUNREACH[TCP]
1903  *      <pru_send>:ENETUNREACH[TCP]
1904  *      <pru_send>:ENETDOWN[TCP]
1905  *      <pru_send>:ENOMEM[TCP]
1906  *      <pru_send>:ENOBUFS[TCP]
1907  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
1908  *      <pru_send>:EINVAL[AF_UNIX]
1909  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
1910  *      <pru_send>:EPIPE[AF_UNIX]
1911  *      <pru_send>:ENOTCONN[AF_UNIX]
1912  *      <pru_send>:EISCONN[AF_UNIX]
1913  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
1914  *      <sf_data_out>:???               [whatever a filter author chooses]
1915  *
1916  * Notes:       Other <pru_send> returns depend on the protocol family; all
1917  *              <sf_data_out> returns depend on what the filter author causes
1918  *              their filter to return.
1919  */
1920 int
1921 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1922     struct mbuf *top, struct mbuf *control, int flags)
1923 {
1924         struct mbuf **mp;
1925         struct mbuf *m, *freelist = NULL;
1926         user_ssize_t space, len, resid, orig_resid;
1927         int clen = 0, error, dontroute, mlen, sendflags;
1928         int atomic = sosendallatonce(so) || top;
1929         int sblocked = 0;
1930         struct proc *p = current_proc();
1931         struct mbuf *control_copy = NULL;
1932         uint16_t headroom = 0;
1933         boolean_t en_tracing = FALSE;
1934
1935         if (uio != NULL)
1936                 resid = uio_resid(uio);
1937         else
1938                 resid = top->m_pkthdr.len;
1939
1940         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1941             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1942
1943         socket_lock(so, 1);
1944
1945         /*
1946          * Trace only when tracing is enabled, for network (vs. unix)
1947          * sockets whose last output interface is not loopback.
1948          */
1949         if (ENTR_SHOULDTRACE &&
1950             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1951                 struct inpcb *inp = sotoinpcb(so);
1952                 if (inp->inp_last_outifp != NULL &&
1953                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1954                         en_tracing = TRUE;
1955                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1956                             VM_KERNEL_ADDRPERM(so),
1957                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1958                             (int64_t)resid);
1959                         orig_resid = resid;
1960                 }
1961         }
1962
1963         /*
1964          * Re-injection should not affect process accounting
1965          */
1966         if ((flags & MSG_SKIPCFIL) == 0) {
1967                 so_update_last_owner_locked(so, p);
1968                 so_update_policy(so);
1969
1970 #if NECP
1971                 so_update_necp_policy(so, NULL, addr);
1972 #endif /* NECP */
1973         }
1974
1975         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1976                 error = EOPNOTSUPP;
1977                 socket_unlock(so, 1);
1978                 goto out;
1979         }
1980
1981         /*
1982          * In theory resid should be unsigned.
1983          * However, space must be signed, as it might be less than 0
1984          * if we over-committed, and we must use a signed comparison
1985          * of space and resid.  On the other hand, a negative resid
1986          * causes us to loop sending 0-length segments to the protocol.
1987          *
1988          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1989          * But it will be used by sockets doing message delivery.
1990          *
1991          * Note: We limit resid to be a positive int value as we use
1992          * imin() to set bytes_to_copy -- radr://14558484
1993          */
1994         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1995             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1996                 error = EINVAL;
1997                 socket_unlock(so, 1);
1998                 goto out;
1999         }
2000
2001         dontroute = (flags & MSG_DONTROUTE) &&
2002             (so->so_options & SO_DONTROUTE) == 0 &&
2003             (so->so_proto->pr_flags & PR_ATOMIC);
2004         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2005
2006         if (control != NULL)
2007                 clen = control->m_len;
2008
2009         if (soreserveheadroom != 0)
2010                 headroom = so->so_pktheadroom;
2011
2012         do {
2013                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2014                     &sblocked, control);
2015                 if (error)
2016                         goto release;
2017
2018                 mp = &top;
2019                 if (so->so_flags & SOF_ENABLE_MSGS)
2020                         space = msgq_sbspace(so, control);
2021                 else
2022                         space = sbspace(&so->so_snd) - clen;
2023                 space += ((flags & MSG_OOB) ? 1024 : 0);
2024
2025                 do {
2026                         if (uio == NULL) {
2027                                 /*
2028                                  * Data is prepackaged in "top".
2029                                  */
2030                                 resid = 0;
2031                                 if (flags & MSG_EOR)
2032                                         top->m_flags |= M_EOR;
2033                         } else {
2034                                 int chainlength;
2035                                 int bytes_to_copy;
2036                                 boolean_t jumbocl;
2037                                 boolean_t bigcl;
2038                                 int bytes_to_alloc;
2039
2040                                 bytes_to_copy = imin(resid, space);
2041
2042                                 bytes_to_alloc = bytes_to_copy;
2043                                 if (top == NULL)
2044                                         bytes_to_alloc += headroom;
2045
2046                                 if (sosendminchain > 0)
2047                                         chainlength = 0;
2048                                 else
2049                                         chainlength = sosendmaxchain;
2050
2051                                 /*
2052                                  * Use big 4 KB cluster when the outgoing interface
2053                                  * does not prefer 2 KB clusters
2054                                  */
2055                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2056                                     sosendbigcl_ignore_capab;
2057
2058                                 /*
2059                                  * Attempt to use larger than system page-size
2060                                  * clusters for large writes only if there is
2061                                  * a jumbo cluster pool and if the socket is
2062                                  * marked accordingly.
2063                                  */
2064                                 jumbocl = sosendjcl && njcl > 0 &&
2065                                     ((so->so_flags & SOF_MULTIPAGES) ||
2066                                     sosendjcl_ignore_capab) &&
2067                                     bigcl;
2068
2069                                 socket_unlock(so, 0);
2070
2071                                 do {
2072                                         int num_needed;
2073                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2074
2075                                         /*
2076                                          * Try to maintain a local cache of mbuf
2077                                          * clusters needed to complete this
2078                                          * write.  The list is further limited
2079                                          * to the number currently needed to
2080                                          * fill the socket.  This mechanism
2081                                          * allows a large number of mbufs/
2082                                          * clusters to be grabbed under a single
2083                                          * mbuf lock.  If we can't get any
2084                                          * clusters, then fall back to trying
2085                                          * for mbufs.  If we fail early (or
2086                                          * miscalculate the number needed), make
2087                                          * sure to release any clusters we
2088                                          * haven't yet consumed.
2089                                          */
2090                                         if (freelist == NULL &&
2091                                             bytes_to_alloc > MBIGCLBYTES &&
2092                                             jumbocl) {
2093                                                 num_needed =
2094                                                     bytes_to_alloc / M16KCLBYTES;
2095
2096                                                 if ((bytes_to_alloc -
2097                                                     (num_needed * M16KCLBYTES))
2098                                                     >= MINCLSIZE)
2099                                                         num_needed++;
2100
2101                                                 freelist =
2102                                                     m_getpackets_internal(
2103                                                     (unsigned int *)&num_needed,
2104                                                     hdrs_needed, M_WAIT, 0,
2105                                                     M16KCLBYTES);
2106                                                 /*
2107                                                  * Fall back to 4K cluster size
2108                                                  * if allocation failed
2109                                                  */
2110                                         }
2111
2112                                         if (freelist == NULL &&
2113                                             bytes_to_alloc > MCLBYTES &&
2114                                             bigcl) {
2115                                                 num_needed =
2116                                                     bytes_to_alloc / MBIGCLBYTES;
2117
2118                                                 if ((bytes_to_alloc -
2119                                                     (num_needed * MBIGCLBYTES)) >=
2120                                                     MINCLSIZE)
2121                                                         num_needed++;
2122
2123                                                 freelist =
2124                                                     m_getpackets_internal(
2125                                                     (unsigned int *)&num_needed,
2126                                                     hdrs_needed, M_WAIT, 0,
2127                                                     MBIGCLBYTES);
2128                                                 /*
2129                                                  * Fall back to cluster size
2130                                                  * if allocation failed
2131                                                  */
2132                                         }
2133
2134                                         /*
2135                                          * Allocate a cluster, as we want to
2136                                          * avoid splitting the data into more
2137                                          * than one segment, and using MINCLSIZE
2138                                          * would lead us to allocate two mbufs.
2139                                          */
2140                                         if (soreserveheadroom != 0 &&
2141                                             freelist == NULL &&
2142                                             ((top == NULL &&
2143                                             bytes_to_alloc > _MHLEN) ||
2144                                             bytes_to_alloc > _MLEN)) {
2145                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2146                                                     MCLBYTES;
2147                                                 freelist =
2148                                                     m_getpackets_internal(
2149                                                     (unsigned int *)&num_needed,
2150                                                     hdrs_needed, M_WAIT, 0,
2151                                                     MCLBYTES);
2152                                                 /*
2153                                                  * Fall back to a single mbuf
2154                                                  * if allocation failed
2155                                                  */
2156                                         } else if (freelist == NULL &&
2157                                             bytes_to_alloc > MINCLSIZE) {
2158                                                 num_needed =
2159                                                     bytes_to_alloc / MCLBYTES;
2160
2161                                                 if ((bytes_to_alloc -
2162                                                     (num_needed * MCLBYTES)) >=
2163                                                     MINCLSIZE)
2164                                                         num_needed++;
2165
2166                                                 freelist =
2167                                                     m_getpackets_internal(
2168                                                     (unsigned int *)&num_needed,
2169                                                     hdrs_needed, M_WAIT, 0,
2170                                                     MCLBYTES);
2171                                                 /*
2172                                                  * Fall back to a single mbuf
2173                                                  * if allocation failed
2174                                                  */
2175                                         }
2176                                         /*
2177                                          * For datagram protocols, leave
2178                                          * headroom for protocol headers
2179                                          * in the first cluster of the chain
2180                                          */
2181                                         if (freelist != NULL && atomic &&
2182                                             top == NULL && headroom > 0) {
2183                                                 freelist->m_data += headroom;
2184                                         }
2185
2186                                         /*
2187                                          * Fall back to regular mbufs without
2188                                          * reserving the socket headroom
2189                                          */
2190                                         if (freelist == NULL) {
2191                                                 if (top == NULL)
2192                                                         MGETHDR(freelist,
2193                                                             M_WAIT, MT_DATA);
2194                                                 else
2195                                                         MGET(freelist,
2196                                                             M_WAIT, MT_DATA);
2197
2198                                                 if (freelist == NULL) {
2199                                                         error = ENOBUFS;
2200                                                         socket_lock(so, 0);
2201                                                         goto release;
2202                                                 }
2203                                                 /*
2204                                                  * For datagram protocols,
2205                                                  * leave room for protocol
2206                                                  * headers in first mbuf.
2207                                                  */
2208                                                 if (atomic && top == NULL &&
2209                                                     bytes_to_copy < MHLEN) {
2210                                                         MH_ALIGN(freelist,
2211                                                             bytes_to_copy);
2212                                                 }
2213                                         }
2214                                         m = freelist;
2215                                         freelist = m->m_next;
2216                                         m->m_next = NULL;
2217
2218                                         if ((m->m_flags & M_EXT))
2219                                                 mlen = m->m_ext.ext_size -
2220                                                     m_leadingspace(m);
2221                                         else if ((m->m_flags & M_PKTHDR))
2222                                                 mlen =
2223                                                     MHLEN - m_leadingspace(m);
2224                                         else
2225                                                 mlen = MLEN - m_leadingspace(m);
2226                                         len = imin(mlen, bytes_to_copy);
2227
2228                                         chainlength += len;
2229
2230                                         space -= len;
2231
2232                                         error = uiomove(mtod(m, caddr_t),
2233                                             len, uio);
2234
2235                                         resid = uio_resid(uio);
2236
2237                                         m->m_len = len;
2238                                         *mp = m;
2239                                         top->m_pkthdr.len += len;
2240                                         if (error)
2241                                                 break;
2242                                         mp = &m->m_next;
2243                                         if (resid <= 0) {
2244                                                 if (flags & MSG_EOR)
2245                                                         top->m_flags |= M_EOR;
2246                                                 break;
2247                                         }
2248                                         bytes_to_copy = min(resid, space);
2249
2250                                 } while (space > 0 &&
2251                                     (chainlength < sosendmaxchain || atomic ||
2252                                     resid < MINCLSIZE));
2253
2254                                 socket_lock(so, 0);
2255
2256                                 if (error)
2257                                         goto release;
2258                         }
2259
2260                         if (flags & (MSG_HOLD|MSG_SEND)) {
2261                                 /* Enqueue for later, go away if HOLD */
2262                                 struct mbuf *mb1;
2263                                 if (so->so_temp && (flags & MSG_FLUSH)) {
2264                                         m_freem(so->so_temp);
2265                                         so->so_temp = NULL;
2266                                 }
2267                                 if (so->so_temp)
2268                                         so->so_tail->m_next = top;
2269                                 else
2270                                         so->so_temp = top;
2271                                 mb1 = top;
2272                                 while (mb1->m_next)
2273                                         mb1 = mb1->m_next;
2274                                 so->so_tail = mb1;
2275                                 if (flags & MSG_HOLD) {
2276                                         top = NULL;
2277                                         goto release;
2278                                 }
2279                                 top = so->so_temp;
2280                         }
2281                         if (dontroute)
2282                                 so->so_options |= SO_DONTROUTE;
2283
2284                         /*
2285                          * Compute flags here, for pru_send and NKEs
2286                          *
2287                          * If the user set MSG_EOF, the protocol
2288                          * understands this flag and nothing left to
2289                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2290                          */
2291                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2292                             ((flags & MSG_EOF) &&
2293                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2294                             (resid <= 0)) ? PRUS_EOF :
2295                             /* If there is more to send set PRUS_MORETOCOME */
2296                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2297
2298                         if ((flags & MSG_SKIPCFIL) == 0) {
2299                                 /*
2300                                  * Socket filter processing
2301                                  */
2302                                 error = sflt_data_out(so, addr, &top,
2303                                     &control, (sendflags & MSG_OOB) ?
2304                                     sock_data_filt_flag_oob : 0);
2305                                 if (error) {
2306                                         if (error == EJUSTRETURN) {
2307                                                 error = 0;
2308                                                 clen = 0;
2309                                                 control = NULL;
2310                                                 top = NULL;
2311                                         }
2312                                         goto release;
2313                                 }
2314 #if CONTENT_FILTER
2315                                 /*
2316                                  * Content filter processing
2317                                  */
2318                                 error = cfil_sock_data_out(so, addr, top,
2319                                     control, (sendflags & MSG_OOB) ?
2320                                     sock_data_filt_flag_oob : 0);
2321                                 if (error) {
2322                                         if (error == EJUSTRETURN) {
2323                                                 error = 0;
2324                                                 clen = 0;
2325                                                 control = NULL;
2326                                                 top = NULL;
2327                                                 }
2328                                         goto release;
2329                                 }
2330 #endif /* CONTENT_FILTER */
2331                         }
2332                         if (so->so_flags & SOF_ENABLE_MSGS) {
2333                                 /*
2334                                  * Make a copy of control mbuf,
2335                                  * so that msg priority can be
2336                                  * passed to subsequent mbufs.
2337                                  */
2338                                 control_copy = m_dup(control, M_NOWAIT);
2339                         }
2340                         error = (*so->so_proto->pr_usrreqs->pru_send)
2341                             (so, sendflags, top, addr, control, p);
2342
2343                         if (flags & MSG_SEND)
2344                                 so->so_temp = NULL;
2345
2346                         if (dontroute)
2347                                 so->so_options &= ~SO_DONTROUTE;
2348
2349                         clen = 0;
2350                         control = control_copy;
2351                         control_copy = NULL;
2352                         top = NULL;
2353                         mp = &top;
2354                         if (error)
2355                                 goto release;
2356                 } while (resid && space > 0);
2357         } while (resid);
2358
2359 release:
2360         if (sblocked)
2361                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2362         else
2363                 socket_unlock(so, 1);
2364 out:
2365         if (top != NULL)
2366                 m_freem(top);
2367         if (control != NULL)
2368                 m_freem(control);
2369         if (freelist != NULL)
2370                 m_freem_list(freelist);
2371         if (control_copy != NULL)
2372                 m_freem(control_copy);
2373
2374         /*
2375          * One write has been done. This was enough. Get back to "normal"
2376          * behavior.
2377          */
2378         if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2379                 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2380
2381         if (en_tracing) {
2382                 /* resid passed here is the bytes left in uio */
2383                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2384                     VM_KERNEL_ADDRPERM(so),
2385                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2386                     (int64_t)(orig_resid - resid));
2387         }
2388         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2389             so->so_snd.sb_cc, space, error);
2390
2391         return (error);
2392 }
2393
2394 /*
2395  * Supported only connected sockets (no address) without ancillary data
2396  * (control mbuf) for atomic protocols
2397  */
2398 int
2399 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2400 {
2401         struct mbuf *m, *freelist = NULL;
2402         user_ssize_t len, resid;
2403         int error, dontroute, mlen;
2404         int atomic = sosendallatonce(so);
2405         int sblocked = 0;
2406         struct proc *p = current_proc();
2407         u_int uiofirst = 0;
2408         u_int uiolast = 0;
2409         struct mbuf *top = NULL;
2410         uint16_t headroom = 0;
2411         boolean_t bigcl;
2412
2413         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2414             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2415
2416         if (so->so_type != SOCK_DGRAM) {
2417                 error = EINVAL;
2418                 goto out;
2419         }
2420         if (atomic == 0) {
2421                 error = EINVAL;
2422                 goto out;
2423         }
2424         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2425                 error = EPROTONOSUPPORT;
2426                 goto out;
2427         }
2428         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2429                 error = EINVAL;
2430                 goto out;
2431         }
2432         resid = uio_array_resid(uioarray, uiocnt);
2433
2434         /*
2435          * In theory resid should be unsigned.
2436          * However, space must be signed, as it might be less than 0
2437          * if we over-committed, and we must use a signed comparison
2438          * of space and resid.  On the other hand, a negative resid
2439          * causes us to loop sending 0-length segments to the protocol.
2440          *
2441          * Note: We limit resid to be a positive int value as we use
2442          * imin() to set bytes_to_copy -- radr://14558484
2443          */
2444         if (resid < 0 || resid > INT_MAX) {
2445                 error = EINVAL;
2446                 goto out;
2447         }
2448
2449         socket_lock(so, 1);
2450         so_update_last_owner_locked(so, p);
2451         so_update_policy(so);
2452
2453 #if NECP
2454         so_update_necp_policy(so, NULL, NULL);
2455 #endif /* NECP */
2456
2457         dontroute = (flags & MSG_DONTROUTE) &&
2458             (so->so_options & SO_DONTROUTE) == 0 &&
2459             (so->so_proto->pr_flags & PR_ATOMIC);
2460         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2461
2462         error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2463             &sblocked, NULL);
2464         if (error)
2465                 goto release;
2466
2467         /*
2468          * Use big 4 KB clusters when the outgoing interface does not prefer
2469          * 2 KB clusters
2470          */
2471         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2472
2473         if (soreserveheadroom != 0)
2474                 headroom = so->so_pktheadroom;
2475
2476         do {
2477                 int i;
2478                 int num_needed = 0;
2479                 int chainlength;
2480                 size_t maxpktlen = 0;
2481                 int bytes_to_alloc;
2482
2483                 if (sosendminchain > 0)
2484                         chainlength = 0;
2485                 else
2486                         chainlength = sosendmaxchain;
2487
2488                 socket_unlock(so, 0);
2489
2490                 /*
2491                  * Find a set of uio that fit in a reasonable number
2492                  * of mbuf packets
2493                  */
2494                 for (i = uiofirst; i < uiocnt; i++) {
2495                         struct uio *auio = uioarray[i];
2496
2497                         len = uio_resid(auio);
2498
2499                         /* Do nothing for empty messages */
2500                         if (len == 0)
2501                                 continue;
2502
2503                         num_needed += 1;
2504                         uiolast += 1;
2505
2506                         if (len > maxpktlen)
2507                                 maxpktlen = len;
2508
2509                         chainlength += len;
2510                         if (chainlength > sosendmaxchain)
2511                                 break;
2512                 }
2513                 /*
2514                  * Nothing left to send
2515                  */
2516                 if (num_needed == 0) {
2517                         socket_lock(so, 0);
2518                         break;
2519                 }
2520                 /*
2521                  * Allocate buffer large enough to include headroom space for
2522                  * network and link header
2523                  *
2524                  */
2525                 bytes_to_alloc = maxpktlen + headroom;
2526
2527                 /*
2528                  * Allocate a single contiguous buffer of the smallest available
2529                  * size when possible
2530                  */
2531                 if (bytes_to_alloc > MCLBYTES &&
2532                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2533                         freelist = m_getpackets_internal(
2534                             (unsigned int *)&num_needed,
2535                             num_needed, M_WAIT, 1,
2536                             MBIGCLBYTES);
2537                 } else if (bytes_to_alloc > _MHLEN &&
2538                     bytes_to_alloc <= MCLBYTES) {
2539                         freelist = m_getpackets_internal(
2540                             (unsigned int *)&num_needed,
2541                             num_needed, M_WAIT, 1,
2542                             MCLBYTES);
2543                 } else {
2544                         freelist = m_allocpacket_internal(
2545                             (unsigned int *)&num_needed,
2546                             bytes_to_alloc, NULL, M_WAIT, 1, 0);
2547                 }
2548
2549                 if (freelist == NULL) {
2550                         socket_lock(so, 0);
2551                         error = ENOMEM;
2552                         goto release;
2553                 }
2554                 /*
2555                  * Copy each uio of the set into its own mbuf packet
2556                  */
2557                 for (i = uiofirst, m = freelist;
2558                     i < uiolast && m != NULL;
2559                     i++) {
2560                         int bytes_to_copy;
2561                         struct mbuf *n;
2562                         struct uio *auio = uioarray[i];
2563
2564                         bytes_to_copy = uio_resid(auio);
2565
2566                         /* Do nothing for empty messages */
2567                         if (bytes_to_copy == 0)
2568                                 continue;
2569                         /*
2570                          * Leave headroom for protocol headers
2571                          * in the first mbuf of the chain
2572                          */
2573                         m->m_data += headroom;
2574
2575                         for (n = m; n != NULL; n = n->m_next) {
2576                                 if ((m->m_flags & M_EXT))
2577                                         mlen = m->m_ext.ext_size -
2578                                             m_leadingspace(m);
2579                                 else if ((m->m_flags & M_PKTHDR))
2580                                         mlen =
2581                                             MHLEN - m_leadingspace(m);
2582                                 else
2583                                         mlen = MLEN - m_leadingspace(m);
2584                                 len = imin(mlen, bytes_to_copy);
2585
2586                                 /*
2587                                  * Note: uiomove() decrements the iovec
2588                                  * length
2589                                  */
2590                                 error = uiomove(mtod(n, caddr_t),
2591                                     len, auio);
2592                                 if (error != 0)
2593                                         break;
2594                                 n->m_len = len;
2595                                 m->m_pkthdr.len += len;
2596
2597                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2598
2599                                 bytes_to_copy -= len;
2600                                 resid -= len;
2601                         }
2602                         if (m->m_pkthdr.len == 0) {
2603                                 printf(
2604                                     "%s:%d so %llx pkt %llx type %u len null\n",
2605                                     __func__, __LINE__,
2606                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2607                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2608                                     m->m_type);
2609                         }
2610                         if (error != 0)
2611                                 break;
2612                         m = m->m_nextpkt;
2613                 }
2614
2615                 socket_lock(so, 0);
2616
2617                 if (error)
2618                         goto release;
2619                 top = freelist;
2620                 freelist = NULL;
2621
2622                 if (dontroute)
2623                         so->so_options |= SO_DONTROUTE;
2624
2625                 if ((flags & MSG_SKIPCFIL) == 0) {
2626                         struct mbuf **prevnextp = NULL;
2627
2628                         for (i = uiofirst, m = top;
2629                             i < uiolast && m != NULL;
2630                             i++) {
2631                                 struct mbuf *nextpkt = m->m_nextpkt;
2632
2633                                 /*
2634                                  * Socket filter processing
2635                                  */
2636                                 error = sflt_data_out(so, NULL, &m,
2637                                     NULL, 0);
2638                                 if (error != 0 && error != EJUSTRETURN)
2639                                         goto release;
2640
2641 #if CONTENT_FILTER
2642                                 if (error == 0) {
2643                                         /*
2644                                          * Content filter processing
2645                                          */
2646                                         error = cfil_sock_data_out(so, NULL, m,
2647                                             NULL, 0);
2648                                         if (error != 0 && error != EJUSTRETURN)
2649                                                 goto release;
2650                                 }
2651 #endif /* CONTENT_FILTER */
2652                                 /*
2653                                  * Remove packet from the list when
2654                                  * swallowed by a filter
2655                                  */
2656                                 if (error == EJUSTRETURN) {
2657                                         error = 0;
2658                                         if (prevnextp != NULL)
2659                                                 *prevnextp = nextpkt;
2660                                         else
2661                                                 top = nextpkt;
2662                                 }
2663
2664                                 m = nextpkt;
2665                                 if (m != NULL)
2666                                         prevnextp = &m->m_nextpkt;
2667                         }
2668                 }
2669                 if (top != NULL)
2670                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2671                             (so, 0, top, NULL, NULL, p);
2672
2673                 if (dontroute)
2674                         so->so_options &= ~SO_DONTROUTE;
2675
2676                 top = NULL;
2677                 uiofirst = uiolast;
2678         } while (resid > 0 && error == 0);
2679 release:
2680         if (sblocked)
2681                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2682         else
2683                 socket_unlock(so, 1);
2684 out:
2685         if (top != NULL)
2686                 m_freem(top);
2687         if (freelist != NULL)
2688                 m_freem_list(freelist);
2689
2690         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2691             so->so_snd.sb_cc, 0, error);
2692
2693         return (error);
2694 }
2695
2696 /*
      * Handle the address (MT_SONAME) mbuf that starts the record at *mp.
      *
      * When CONFIG_MACF_SOCKET_SUBSET is built in, p is not kernproc and the
      * socket is not connected, the whole record is first taken off so_rcv,
      * the socket lock is dropped, and mac_socket_check_received() is called
      * with the address; if the check passes the record is put back at the
      * head of so_rcv.
      *
      * If psa is not NULL, *psa receives the result of dup_sockaddr() on the
      * address; canwait is passed through to dup_sockaddr().  With MSG_PEEK
      * the address mbuf is only stepped over, otherwise it is sbfree()'d and
      * freed with MFREE.
      *
      * On every path *mp and *nextrecordp are overwritten with the local m
      * and nextrecord.  On the ERESTART and ENOTCONN paths the record has
      * already been released with m_freem(), so the pointer stored in *mp
      * is stale there.
      *
      * Returns:     0               Success
      *              ERESTART        mac_socket_check_received() failed; SB_LOCK
      *                              was cleared, the socket is still locked
      *              ENOTCONN        socket had SOF_DEFUNCT set after relocking
      *              EWOULDBLOCK     dup_sockaddr() returned NULL and MSG_NEEDSA
      *                              is set
      *
2697  * May return ERESTART when packet is dropped by MAC policy check
2698  */
2699 static int
2700 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2701     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2702 {
2703         int error = 0;
2704         struct mbuf *m = *mp;
2705         struct mbuf *nextrecord = *nextrecordp;
2706
             /* The record is expected to begin with an address mbuf */
2707         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2708 #if CONFIG_MACF_SOCKET_SUBSET
2709         /*
2710          * Call the MAC framework for policy checking if we're in
2711          * the user process context and the socket isn't connected.
2712          */
2713         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2714                 struct mbuf *m0 = m;
2715                 /*
2716                  * Dequeue this record (temporarily) from the receive
2717                  * list since we're about to drop the socket's lock
2718                  * where a new record may arrive and be appended to
2719                  * the list.  Upon MAC policy failure, the record
2720                  * will be freed.  Otherwise, we'll add it back to
2721                  * the head of the list.  We cannot rely on SB_LOCK
2722                  * because append operation uses the socket's lock.
2723                  */
2724                 do {
2725                         m->m_nextpkt = NULL;
2726                         sbfree(&so->so_rcv, m);
2727                         m = m->m_next;
2728                 } while (m != NULL);
2729                 m = m0;
2730                 so->so_rcv.sb_mb = nextrecord;
2731                 SB_EMPTY_FIXUP(&so->so_rcv);
2732                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2733                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2734                 socket_unlock(so, 0);
2735
2736                 if (mac_socket_check_received(proc_ucred(p), so,
2737                     mtod(m, struct sockaddr *)) != 0) {
2738                         /*
2739                          * MAC policy failure; free this record and
2740                          * process the next record (or block until
2741                          * one is available).  We have adjusted sb_cc
2742                          * and sb_mbcnt above so there is no need to
2743                          * call sbfree() again.
2744                          */
2745                         m_freem(m);
2746                         /*
2747                          * Clear SB_LOCK but don't unlock the socket.
2748                          * Process the next record or wait for one.
2749                          */
2750                         socket_lock(so, 0);
2751                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2752                         error = ERESTART;
                             /* NOTE: m was freed above; done: still stores it in *mp */
2753                         goto done;
2754                 }
2755                 socket_lock(so, 0);
2756                 /*
2757                  * If the socket has been defunct'd, drop it.
2758                  */
2759                 if (so->so_flags & SOF_DEFUNCT) {
2760                         m_freem(m);
2761                         error = ENOTCONN;
                             /* NOTE: m was freed above; done: still stores it in *mp */
2762                         goto done;
2763                 }
2764                 /*
2765                  * Re-adjust the socket receive list and re-enqueue
2766                  * the record in front of any packets which may have
2767                  * been appended while we dropped the lock.
2768                  */
                     /* Re-charge every mbuf of the record; the loop leaves m at the last one */
2769                 for (m = m0; m->m_next != NULL; m = m->m_next)
2770                         sballoc(&so->so_rcv, m);
2771                 sballoc(&so->so_rcv, m);
                     /* Nothing else queued: this record is also the last record/mbuf */
2772                 if (so->so_rcv.sb_mb == NULL) {
2773                         so->so_rcv.sb_lastrecord = m0;
2774                         so->so_rcv.sb_mbtail = m;
2775                 }
2776                 m = m0;
2777                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2778                 so->so_rcv.sb_mb = m;
2779                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2780                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2781         }
2782 #endif /* CONFIG_MACF_SOCKET_SUBSET */
             /*
              * Store the dup_sockaddr() result for the caller; a NULL
              * result is only treated as an error when MSG_NEEDSA is set.
              */
2783         if (psa != NULL) {
2784                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2785                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2786                         error = EWOULDBLOCK;
2787                         goto done;
2788                 }
2789         }
2790         if (flags & MSG_PEEK) {
                     /* Peeking: leave the address mbuf queued and step past it */
2791                 m = m->m_next;
2792         } else {
2793                 sbfree(&so->so_rcv, m);
2794                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2795                         panic("%s: about to create invalid socketbuf",
2796                             __func__);
2797                         /* NOTREACHED */
2798                 }
                     /*
                      * NOTE(review): MFREE appears to leave m's successor in
                      * so_rcv.sb_mb (it is read back right below) -- confirm
                      * against the macro definition.
                      */
2799                 MFREE(m, so->so_rcv.sb_mb);
2800                 m = so->so_rcv.sb_mb;
2801                 if (m != NULL) {
2802                         m->m_nextpkt = nextrecord;
2803                 } else {
2804                         so->so_rcv.sb_mb = nextrecord;
2805                         SB_EMPTY_FIXUP(&so->so_rcv);
2806                 }
2807         }
2808 done:
             /* Report the (possibly advanced) position back to the caller */
2809         *mp = m;
2810         *nextrecordp = nextrecord;
2811
2812         return (error);
2813 }
2814
2815 /*
2816  * Process one or more MT_CONTROL mbufs present before any data mbufs
2817  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2818  * just copy the data; if !MSG_PEEK, we call into the protocol to
2819  * perform externalization.
2820  */
2821 static int
2822 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2823     struct mbuf **mp, struct mbuf **nextrecordp)
2824 {
2825         int error = 0;
2826         struct mbuf *cm = NULL, *cmn;
2827         struct mbuf **cme = &cm;
2828         struct sockbuf *sb_rcv = &so->so_rcv;
2829         struct mbuf **msgpcm = NULL;
2830         struct mbuf *m = *mp;
2831         struct mbuf *nextrecord = *nextrecordp;
2832         struct protosw *pr = so->so_proto;
2833
2834         /*
2835          * Externalizing the control messages would require us to
2836          * drop the socket's lock below.  Once we re-acquire the
2837          * lock, the mbuf chain might change.  In order to preserve
2838          * consistency, we unlink all control messages from the
2839          * first mbuf chain in one shot and link them separately
2840          * onto a different chain.
2841          */
2842         do {
2843                 if (flags & MSG_PEEK) {
2844                         if (controlp != NULL) {
2845                                 if (*controlp == NULL) {
2846                                         msgpcm = controlp;
2847                                 }
2848                                 *controlp = m_copy(m, 0, m->m_len);
2849
2850                                 /*
2851                                  * If we failed to allocate an mbuf,
2852                                  * release any previously allocated
2853                                  * mbufs for control data. Return
2854                                  * an error. Keep the mbufs in the
2855                                  * socket as this is using
2856                                  * MSG_PEEK flag.
2857                                  */
2858                                 if (*controlp == NULL) {
2859                                         m_freem(*msgpcm);
2860                                         error = ENOBUFS;
2861                                         goto done;
2862                                 }
2863                                 controlp = &(*controlp)->m_next;
2864                         }
2865                         m = m->m_next;
2866                 } else {
2867                         m->m_nextpkt = NULL;
2868                         sbfree(sb_rcv, m);
2869                         sb_rcv->sb_mb = m->m_next;
2870                         m->m_next = NULL;
2871                         *cme = m;
2872                         cme = &(*cme)->m_next;
2873                         m = sb_rcv->sb_mb;
2874                 }
2875         } while (m != NULL && m->m_type == MT_CONTROL);
2876
2877         if (!(flags & MSG_PEEK)) {
2878                 if (sb_rcv->sb_mb != NULL) {
2879                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
2880                 } else {
2881                         sb_rcv->sb_mb = nextrecord;
2882                         SB_EMPTY_FIXUP(sb_rcv);
2883                 }
2884                 if (nextrecord == NULL)
2885                         sb_rcv->sb_lastrecord = m;
2886         }
2887
2888         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2889         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2890
2891         while (cm != NULL) {
2892                 int cmsg_type;
2893
2894                 cmn = cm->m_next;
2895                 cm->m_next = NULL;
2896                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2897
2898                 /*
2899                  * Call the protocol to externalize SCM_RIGHTS message
2900                  * and return the modified message to the caller upon
2901                  * success.  Otherwise, all other control messages are
2902                  * returned unmodified to the caller.  Note that we
2903                  * only get into this loop if MSG_PEEK is not set.
2904                  */
2905                 if (pr->pr_domain->dom_externalize != NULL &&
2906                     cmsg_type == SCM_RIGHTS) {
2907                         /*
2908                          * Release socket lock: see 3903171.  This
2909                          * would also allow more records to be appended
2910                          * to the socket buffer.  We still have SB_LOCK
2911                          * set on it, so we can be sure that the head
2912                          * of the mbuf chain won't change.
2913                          */
2914                         socket_unlock(so, 0);
2915                         error = (*pr->pr_domain->dom_externalize)(cm);
2916                         socket_lock(so, 0);
2917                 } else {
2918                         error = 0;
2919                 }
2920
2921                 if (controlp != NULL && error == 0) {
2922                         *controlp = cm;
2923                         controlp = &(*controlp)->m_next;
2924                 } else {
2925                         (void) m_free(cm);
2926                 }
2927                 cm = cmn;
2928         }
2929         /*
2930          * Update the value of nextrecord in case we received new
2931          * records when the socket was unlocked above for
2932          * externalizing SCM_RIGHTS.
2933          */
2934         if (m != NULL)
2935                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2936         else
2937                 nextrecord = sb_rcv->sb_mb;
2938
2939 done:
2940         *mp = m;
2941         *nextrecordp = nextrecord;
2942
2943         return (error);
2944 }
2945
2946 /*
2947  * Implement receive operations on a socket.
2948  * We depend on the way that records are added to the sockbuf
2949  * by sbappend*.  In particular, each record (mbufs linked through m_next)
2950  * must begin with an address if the protocol so specifies,
2951  * followed by an optional mbuf or mbufs containing ancillary data,
2952  * and then zero or more mbufs of data.
2953  * In order to avoid blocking network interrupts for the entire time here,
2954  * we splx() while doing the actual copy to user space.
2955  * Although the sockbuf is locked, new data may still be appended,
2956  * and thus we must maintain consistency of the sockbuf during that time.
2957  *
2958  * The caller may receive the data as a single mbuf chain by supplying
2959  * an mbuf **mp0 for use in returning the chain.  The uio is then used
2960  * only for the count in uio_resid.
2961  *
2962  * Returns:     0                       Success
2963  *              ENOBUFS
2964  *              ENOTCONN
2965  *              EWOULDBLOCK
2966  *      uiomove:EFAULT
2967  *      sblock:EWOULDBLOCK
2968  *      sblock:EINTR
2969  *      sbwait:EBADF
2970  *      sbwait:EINTR
2971  *      sodelayed_copy:EFAULT
2972  *      <pru_rcvoob>:EINVAL[TCP]
2973  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
2974  *      <pru_rcvoob>:???
2975  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2976  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2977  *      <pr_domain->dom_externalize>:???
2978  *
2979  * Notes:       Additional return values from calls through <pru_rcvoob> and
2980  *              <pr_domain->dom_externalize> depend on protocols other than
2981  *              TCP or AF_UNIX, which are documented above.
2982  */
2983 int
2984 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2985     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2986 {
2987         struct mbuf *m, **mp, *ml = NULL;
2988         struct mbuf *nextrecord, *free_list;
2989         int flags, error, offset;
2990         user_ssize_t len;
2991         struct protosw *pr = so->so_proto;
2992         int moff, type = 0;
2993         user_ssize_t orig_resid = uio_resid(uio);
2994         user_ssize_t delayed_copy_len;
2995         int can_delay;
2996         int need_event;
2997         struct proc *p = current_proc();
2998         boolean_t en_tracing = FALSE;
2999
3000         /*
3001          * Sanity check on the length passed by caller as we are making 'int'
3002          * comparisons
3003          */
3004         if (orig_resid < 0 || orig_resid > INT_MAX)
3005                 return (EINVAL);
3006
3007         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3008             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3009             so->so_rcv.sb_hiwat);
3010
3011         socket_lock(so, 1);
3012         so_update_last_owner_locked(so, p);
3013         so_update_policy(so);
3014
3015 #ifdef MORE_LOCKING_DEBUG
3016         if (so->so_usecount == 1) {
3017                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3018                 /* NOTREACHED */
3019         }
3020 #endif
3021         mp = mp0;
3022         if (psa != NULL)
3023                 *psa = NULL;
3024         if (controlp != NULL)
3025                 *controlp = NULL;
3026         if (flagsp != NULL)
3027                 flags = *flagsp &~ MSG_EOR;
3028         else
3029                 flags = 0;
3030
3031         /*
3032          * If a recv attempt is made on a previously-accepted socket
3033          * that has been marked as inactive (disconnected), reject
3034          * the request.
3035          */
3036         if (so->so_flags & SOF_DEFUNCT) {
3037                 struct sockbuf *sb = &so->so_rcv;
3038
3039                 error = ENOTCONN;
3040                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3041                     __func__, proc_pid(p), proc_best_name(p),
3042                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3043                     SOCK_DOM(so), SOCK_TYPE(so), error);
3044                 /*
3045                  * This socket should have been disconnected and flushed
3046                  * prior to being returned from sodefunct(); there should
3047                  * be no data on its receive list, so panic otherwise.
3048                  */
3049                 if (so->so_state & SS_DEFUNCT)
3050                         sb_empty_assert(sb, __func__);
3051                 socket_unlock(so, 1);
3052                 return (error);
3053         }
3054
3055         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3056             pr->pr_usrreqs->pru_preconnect) {
3057                 /*
3058                  * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3059                  * call write() right after this. *If* the app calls a read
3060                  * we do not want to block this read indefinitely. Thus,
3061                  * we trigger a connect so that the session gets initiated.
3062                  */
3063                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3064
3065                 if (error) {
3066                         socket_unlock(so, 1);
3067                         return (error);
3068                 }
3069         }
3070
3071         if (ENTR_SHOULDTRACE &&
3072             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3073                 /*
3074                  * enable energy tracing for inet sockets that go over
3075                  * non-loopback interfaces only.
3076                  */
3077                 struct inpcb *inp = sotoinpcb(so);
3078                 if (inp->inp_last_outifp != NULL &&
3079                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3080                         en_tracing = TRUE;
3081                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3082                             VM_KERNEL_ADDRPERM(so),
3083                             ((so->so_state & SS_NBIO) ?
3084                             kEnTrFlagNonBlocking : 0),
3085                             (int64_t)orig_resid);
3086                 }
3087         }
3088
3089         /*
3090          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3091          * regardless of the flags argument. Here is the case where
3092          * out-of-band data is not inline.
3093          */
3094         if ((flags & MSG_OOB) ||
3095             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3096             (so->so_options & SO_OOBINLINE) == 0 &&
3097             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3098                 m = m_get(M_WAIT, MT_DATA);
3099                 if (m == NULL) {
3100                         socket_unlock(so, 1);
3101                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3102                             ENOBUFS, 0, 0, 0, 0);
3103                         return (ENOBUFS);
3104                 }
3105                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3106                 if (error)
3107                         goto bad;
3108                 socket_unlock(so, 0);
3109                 do {
3110                         error = uiomove(mtod(m, caddr_t),
3111                             imin(uio_resid(uio), m->m_len), uio);
3112                         m = m_free(m);
3113                 } while (uio_resid(uio) && error == 0 && m != NULL);
3114                 socket_lock(so, 0);
3115 bad:
3116                 if (m != NULL)
3117                         m_freem(m);
3118
3119                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3120                         if (error == EWOULDBLOCK || error == EINVAL) {
3121                                 /*
3122                                  * Let's try to get normal data:
3123                                  * EWOULDBLOCK: out-of-band data not
3124                                  * received yet. EINVAL: out-of-band data
3125                                  * already read.
3126                                  */
3127                                 error = 0;
3128                                 goto nooob;
3129                         } else if (error == 0 && flagsp != NULL) {
3130                                 *flagsp |= MSG_OOB;
3131                         }
3132                 }
3133                 socket_unlock(so, 1);
3134                 if (en_tracing) {
3135                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3136                             VM_KERNEL_ADDRPERM(so), 0,
3137                             (int64_t)(orig_resid - uio_resid(uio)));
3138                 }
3139                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3140                     0, 0, 0, 0);
3141
3142                 return (error);
3143         }
3144 nooob:
3145         if (mp != NULL)
3146                 *mp = NULL;
3147
3148         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3149                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3150         }
3151
3152         free_list = NULL;
3153         delayed_copy_len = 0;
3154 restart:
3155 #ifdef MORE_LOCKING_DEBUG
3156         if (so->so_usecount <= 1)
3157                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3158                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3159 #endif
3160         /*
3161          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3162          * and if so just return to the caller.  This could happen when
3163          * soreceive() is called by a socket upcall function during the
3164          * time the socket is freed.  The socket buffer would have been
3165          * locked across the upcall, therefore we cannot put this thread
3166          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3167          * we may livelock), because the lock on the socket buffer will
3168          * only be released when the upcall routine returns to its caller.
3169          * Because the socket has been officially closed, there can be
3170          * no further read on it.
3171          *
3172          * A multipath subflow socket would have its SS_NOFDREF set by
3173          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3174          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3175          */
3176         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3177             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3178                 socket_unlock(so, 1);
3179                 return (0);
3180         }
3181
3182         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3183         if (error) {
3184                 socket_unlock(so, 1);
3185                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3186                     0, 0, 0, 0);
3187                 if (en_tracing) {
3188                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3189                             VM_KERNEL_ADDRPERM(so), 0,
3190                             (int64_t)(orig_resid - uio_resid(uio)));
3191                 }
3192                 return (error);
3193         }
3194
3195         m = so->so_rcv.sb_mb;
3196         /*
3197          * If we have less data than requested, block awaiting more
3198          * (subject to any timeout) if:
3199          *   1. the current count is less than the low water mark, or
3200          *   2. MSG_WAITALL is set, and it is possible to do the entire
3201          *      receive operation at once if we block (resid <= hiwat).
3202          *   3. MSG_DONTWAIT is not set
3203          * If MSG_WAITALL is set but resid is larger than the receive buffer,
3204          * we have to do the receive in sections, and thus risk returning
3205          * a short count if a timeout or signal occurs after we start.
3206          */
3207         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3208             so->so_rcv.sb_cc < uio_resid(uio)) &&
3209             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3210             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3211             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3212                 /*
3213                  * Panic if we notice inconsistencies in the socket's
3214                  * receive list; both sb_mb and sb_cc should correctly
3215                  * reflect the contents of the list, otherwise we may
3216                  * end up with false positives during select() or poll()
3217                  * which could put the application in a bad state.
3218                  */
3219                 SB_MB_CHECK(&so->so_rcv);
3220
3221                 if (so->so_error) {
3222                         if (m != NULL)
3223                                 goto dontblock;
3224                         error = so->so_error;
3225                         if ((flags & MSG_PEEK) == 0)
3226                                 so->so_error = 0;
3227                         goto release;
3228                 }
3229                 if (so->so_state & SS_CANTRCVMORE) {
3230 #if CONTENT_FILTER
3231                         /*
3232                          * Deal with half closed connections
3233                          */
3234                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3235                                 cfil_sock_data_pending(&so->so_rcv) != 0)
3236                                 CFIL_LOG(LOG_INFO,
3237                                         "so %llx ignore SS_CANTRCVMORE",
3238                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3239                         else
3240 #endif /* CONTENT_FILTER */
3241                         if (m != NULL)
3242                                 goto dontblock;
3243                         else
3244                                 goto release;
3245                 }
3246                 for (; m != NULL; m = m->m_next)
3247                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3248                                 m = so->so_rcv.sb_mb;
3249                                 goto dontblock;
3250                         }
3251                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3252                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3253                         error = ENOTCONN;
3254                         goto release;
3255                 }
3256                 if (uio_resid(uio) == 0)
3257                         goto release;
3258
3259                 if ((so->so_state & SS_NBIO) ||
3260                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3261                         error = EWOULDBLOCK;
3262                         goto release;
3263                 }
3264                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3265                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3266                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3267 #if EVEN_MORE_LOCKING_DEBUG
3268                 if (socket_debug)
3269                         printf("Waiting for socket data\n");
3270 #endif
3271
3272                 error = sbwait(&so->so_rcv);
3273 #if EVEN_MORE_LOCKING_DEBUG
3274                 if (socket_debug)
3275                         printf("SORECEIVE - sbwait returned %d\n", error);
3276 #endif
3277                 if (so->so_usecount < 1) {
3278                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3279                             __func__, so, so->so_usecount);
3280                         /* NOTREACHED */
3281                 }
3282                 if (error) {
3283                         socket_unlock(so, 1);
3284                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3285                             0, 0, 0, 0);
3286                         if (en_tracing) {
3287                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3288                                     VM_KERNEL_ADDRPERM(so), 0,
3289                                     (int64_t)(orig_resid - uio_resid(uio)));
3290                         }
3291                         return (error);
3292                 }
3293                 goto restart;
3294         }
3295 dontblock:
3296         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3297         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3298         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3299         nextrecord = m->m_nextpkt;
3300
3301         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3302                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3303                     mp0 == NULL);
3304                 if (error == ERESTART)
3305                         goto restart;
3306                 else if (error != 0)
3307                         goto release;
3308                 orig_resid = 0;
3309         }
3310
3311         /*
3312          * Process one or more MT_CONTROL mbufs present before any data mbufs
3313          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3314          * just copy the data; if !MSG_PEEK, we call into the protocol to
3315          * perform externalization.
3316          */
3317         if (m != NULL && m->m_type == MT_CONTROL) {
3318                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3319                 if (error != 0)
3320                         goto release;
3321                 orig_resid = 0;
3322         }
3323
3324         /*
3325          * If the socket is a TCP socket with message delivery
3326          * enabled, then create a control msg to deliver the
3327          * relative TCP sequence number for this data. Waiting
3328          * until this point will protect against failures to
3329          * allocate an mbuf for control msgs.
3330          */
3331         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3332             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3333                 struct mbuf *seq_cm;
3334
3335                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3336                     sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3337                 if (seq_cm == NULL) {
3338                         /* unable to allocate a control mbuf */
3339                         error = ENOBUFS;
3340                         goto release;
3341                 }
3342                 *controlp = seq_cm;
3343                 controlp = &seq_cm->m_next;
3344         }
3345
3346         if (m != NULL) {
3347                 if (!(flags & MSG_PEEK)) {
3348                         /*
3349                          * We get here because m points to an mbuf following
3350                          * any MT_SONAME or MT_CONTROL mbufs which have been
3351                          * processed above.  In any case, m should be pointing
3352                          * to the head of the mbuf chain, and the nextrecord
3353                          * should be either NULL or equal to m->m_nextpkt.
3354                          * See comments above about SB_LOCK.
3355                          */
3356                         if (m != so->so_rcv.sb_mb ||
3357                             m->m_nextpkt != nextrecord) {
3358                                 panic("%s: post-control !sync so=%p m=%p "
3359                                     "nextrecord=%p\n", __func__, so, m,
3360                                     nextrecord);
3361                                 /* NOTREACHED */
3362                         }
3363                         if (nextrecord == NULL)
3364                                 so->so_rcv.sb_lastrecord = m;
3365                 }
3366                 type = m->m_type;
3367                 if (type == MT_OOBDATA)
3368                         flags |= MSG_OOB;
3369         } else {
3370                 if (!(flags & MSG_PEEK)) {
3371                         SB_EMPTY_FIXUP(&so->so_rcv);
3372                 }
3373         }
3374         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3375         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3376
3377         moff = 0;
3378         offset = 0;
3379
3380         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3381                 can_delay = 1;
3382         else
3383                 can_delay = 0;
3384
3385         need_event = 0;
3386
3387         while (m != NULL &&
3388             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3389                 if (m->m_type == MT_OOBDATA) {
3390                         if (type != MT_OOBDATA)
3391                                 break;
3392                 } else if (type == MT_OOBDATA) {
3393                         break;
3394                 }
3395                 /*
3396                  * Make sure to always set MSG_OOB event when getting
3397                  * out of band data inline.
3398                  */
3399                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3400                     (so->so_options & SO_OOBINLINE) != 0 &&
3401                     (so->so_state & SS_RCVATMARK) != 0) {
3402                         flags |= MSG_OOB;
3403                 }
3404                 so->so_state &= ~SS_RCVATMARK;
3405                 len = uio_resid(uio) - delayed_copy_len;
3406                 if (so->so_oobmark && len > so->so_oobmark - offset)
3407                         len = so->so_oobmark - offset;
3408                 if (len > m->m_len - moff)
3409                         len = m->m_len - moff;
3410                 /*
3411                  * If mp is set, just pass back the mbufs.
3412                  * Otherwise copy them out via the uio, then free.
3413                  * Sockbuf must be consistent here (points to current mbuf,
3414                  * it points to next record) when we drop priority;
3415                  * we must note any additions to the sockbuf when we
3416                  * block interrupts again.
3417                  */
3418                 if (mp == NULL) {
3419                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3420                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3421                         if (can_delay && len == m->m_len) {
3422                                 /*
3423                                  * only delay the copy if we're consuming the
3424                                  * mbuf and we're NOT in MSG_PEEK mode
3425                                  * and we have enough data to make it
3426                                  * worthwhile to drop and retake the lock...
3427                                  * can_delay reflects the state of the 2 latter
3428                                  * constraints; moff should always be zero
3429                                  * in these cases
3430                                  */
3431                                 delayed_copy_len += len;
3432                         } else {
3433                                 if (delayed_copy_len) {
3434                                         error = sodelayed_copy(so, uio,
3435                                             &free_list, &delayed_copy_len);
3436
3437                                         if (error) {
3438                                                 goto release;
3439                                         }
3440                                         /*
3441                                          * can only get here if MSG_PEEK is not
3442                                          * set therefore, m should point at the
3443                                          * head of the rcv queue; if it doesn't,
3444                                          * it means something drastically
3445                                          * changed while we were out from behind
3446                                          * the lock in sodelayed_copy. perhaps
3447                                          * a RST on the stream. in any event,
3448                                          * the stream has been interrupted. it's
3449                                          * probably best just to return whatever
3450                                          * data we've moved and let the caller
3451                                          * sort it out...
3452                                          */
3453                                         if (m != so->so_rcv.sb_mb) {
3454                                                 break;
3455                                         }
3456                                 }
3457                                 socket_unlock(so, 0);
3458                                 error = uiomove(mtod(m, caddr_t) + moff,
3459                                     (int)len, uio);
3460                                 socket_lock(so, 0);
3461
3462                                 if (error)
3463                                         goto release;
3464                         }
3465                 } else {
3466                         uio_setresid(uio, (uio_resid(uio) - len));
3467                 }
3468                 if (len == m->m_len - moff) {
3469                         if (m->m_flags & M_EOR)
3470                                 flags |= MSG_EOR;
3471                         if (flags & MSG_PEEK) {
3472                                 m = m->m_next;
3473                                 moff = 0;
3474                         } else {
3475                                 nextrecord = m->m_nextpkt;
3476                                 sbfree(&so->so_rcv, m);
3477                                 m->m_nextpkt = NULL;
3478
3479                                 /*
3480                                  * If this packet is an unordered packet
3481                                  * (indicated by M_UNORDERED_DATA flag), remove
3482                                  * the additional bytes added to the
3483                                  * receive socket buffer size.
3484                                  */
3485                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3486                                     m->m_len &&
3487                                     (m->m_flags & M_UNORDERED_DATA) &&
3488                                     sbreserve(&so->so_rcv,
3489                                     so->so_rcv.sb_hiwat - m->m_len)) {
3490                                         if (so->so_msg_state->msg_uno_bytes >
3491                                             m->m_len) {
3492                                                 so->so_msg_state->
3493                                                     msg_uno_bytes -= m->m_len;
3494                                         } else {
3495                                                 so->so_msg_state->
3496                                                     msg_uno_bytes = 0;
3497                                         }
3498                                         m->m_flags &= ~M_UNORDERED_DATA;
3499                                 }
3500
3501                                 if (mp != NULL) {
3502                                         *mp = m;
3503                                         mp = &m->m_next;
3504                                         so->so_rcv.sb_mb = m = m->m_next;
3505                                         *mp = NULL;
3506                                 } else {
3507                                         if (free_list == NULL)
3508                                                 free_list = m;
3509                                         else
3510                                                 ml->m_next = m;
3511                                         ml = m;
3512                                         so->so_rcv.sb_mb = m = m->m_next;
3513                                         ml->m_next = NULL;
3514                                 }
3515                                 if (m != NULL) {
3516                                         m->m_nextpkt = nextrecord;
3517                                         if (nextrecord == NULL)
3518                                                 so->so_rcv.sb_lastrecord = m;
3519                                 } else {
3520                                         so->so_rcv.sb_mb = nextrecord;
3521                                         SB_EMPTY_FIXUP(&so->so_rcv);
3522                                 }
3523                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3524                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3525                         }
3526                 } else {
3527                         if (flags & MSG_PEEK) {
3528                                 moff += len;
3529                         } else {
3530                                 if (mp != NULL) {
3531                                         int copy_flag;
3532
3533                                         if (flags & MSG_DONTWAIT)
3534                                                 copy_flag = M_DONTWAIT;
3535                                         else
3536                                                 copy_flag = M_WAIT;
3537                                         *mp = m_copym(m, 0, len, copy_flag);
3538                                         /*
3539                                          * Failed to allocate an mbuf?
3540                                          * Adjust uio_resid back, it was
3541                                          * adjusted down by len bytes which
3542                                          * we didn't copy over.
3543                                          */
3544                                         if (*mp == NULL) {
3545                                                 uio_setresid(uio,
3546                                                     (uio_resid(uio) + len));
3547                                                 break;
3548                                         }
3549                                 }
3550                                 m->m_data += len;
3551                                 m->m_len -= len;
3552                                 so->so_rcv.sb_cc -= len;
3553                         }
3554                 }
3555                 if (so->so_oobmark) {
3556                         if ((flags & MSG_PEEK) == 0) {
3557                                 so->so_oobmark -= len;
3558                                 if (so->so_oobmark == 0) {
3559                                         so->so_state |= SS_RCVATMARK;
3560                                         /*
3561                                          * delay posting the actual event until
3562                                          * after any delayed copy processing
3563                                          * has finished
3564                                          */
3565                                         need_event = 1;
3566                                         break;
3567                                 }
3568                         } else {
3569                                 offset += len;
3570                                 if (offset == so->so_oobmark)
3571                                         break;
3572                         }
3573                 }
3574                 if (flags & MSG_EOR)
3575                         break;
3576                 /*
3577                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3578                  * (for non-atomic socket), we must not quit until
3579                  * "uio->uio_resid == 0" or an error termination.
3580                  * If a signal/timeout occurs, return with a short
3581                  * count but without error.  Keep sockbuf locked
3582                  * against other readers.
3583                  */
3584                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3585                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3586                     !sosendallatonce(so) && !nextrecord) {
3587                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3588 #if CONTENT_FILTER
3589                             && cfil_sock_data_pending(&so->so_rcv) == 0
3590 #endif /* CONTENT_FILTER */
3591                             ))
3592                                 goto release;
3593
3594                         /*
3595                          * Depending on the protocol (e.g. TCP), the following
3596                          * might cause the socket lock to be dropped and later
3597                          * be reacquired, and more data could have arrived and
3598                          * have been appended to the receive socket buffer by
3599                          * the time it returns.  Therefore, we only sleep in
3600                          * sbwait() below if and only if the socket buffer is
3601                          * empty, in order to avoid a false sleep.
3602                          */
3603                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3604                             (((struct inpcb *)so->so_pcb)->inp_state !=
3605                             INPCB_STATE_DEAD))
3606                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3607
3608                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3609                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3610
3611                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3612                                 error = 0;
3613                                 goto release;
3614                         }
3615                         /*
3616                          * have to wait until after we get back from the sbwait
3617                          * to do the copy because we will drop the lock if we
3618                          * have enough data that has been delayed... by dropping
3619                          * the lock we open up a window allowing the netisr
3620                          * thread to process the incoming packets and to change
3621                          * the state of this socket... we're issuing the sbwait
3622                          * because the socket is empty and we're expecting the
3623                          * netisr thread to wake us up when more packets arrive;
3624                          * if we allow that processing to happen and then sbwait
3625                          * we could stall forever with packets sitting in the
3626                          * socket if no further packets arrive from the remote
3627                          * side.
3628                          *
3629                          * we want to copy before we've collected all the data
3630                          * to satisfy this request to allow the copy to overlap
3631                          * the incoming packet processing on an MP system
3632                          */
3633                         if (delayed_copy_len > sorecvmincopy &&
3634                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3635                                 error = sodelayed_copy(so, uio,
3636                                     &free_list, &delayed_copy_len);
3637
3638                                 if (error)
3639                                         goto release;
3640                         }
3641                         m = so->so_rcv.sb_mb;
3642                         if (m != NULL) {
3643                                 nextrecord = m->m_nextpkt;
3644                         }
3645                         SB_MB_CHECK(&so->so_rcv);
3646                 }
3647         }
3648 #ifdef MORE_LOCKING_DEBUG
3649         if (so->so_usecount <= 1) {
3650                 panic("%s: after big while so=%p ref=%d on socket\n",
3651                     __func__, so, so->so_usecount);
3652                 /* NOTREACHED */
3653         }
3654 #endif
3655
3656         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3657                 if (so->so_options & SO_DONTTRUNC) {
3658                         flags |= MSG_RCVMORE;
3659                 } else {
3660                         flags |= MSG_TRUNC;
3661                         if ((flags & MSG_PEEK) == 0)
3662                                 (void) sbdroprecord(&so->so_rcv);
3663                 }
3664         }
3665
3666         /*
3667          * pru_rcvd below (for TCP) may cause more data to be received
3668          * if the socket lock is dropped prior to sending the ACK; some
3669          * legacy OpenTransport applications don't handle this well
3670          * (if it receives less data than requested while MSG_HAVEMORE
3671          * is set), and so we set the flag now based on what we know
3672          * prior to calling pru_rcvd.
3673          */
3674         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3675                 flags |= MSG_HAVEMORE;
3676
3677         if ((flags & MSG_PEEK) == 0) {
3678                 if (m == NULL) {
3679                         so->so_rcv.sb_mb = nextrecord;
3680                         /*
3681                          * First part is an inline SB_EMPTY_FIXUP().  Second
3682                          * part makes sure sb_lastrecord is up-to-date if
3683                          * there is still data in the socket buffer.
3684                          */
3685                         if (so->so_rcv.sb_mb == NULL) {
3686                                 so->so_rcv.sb_mbtail = NULL;
3687                                 so->so_rcv.sb_lastrecord = NULL;
3688                         } else if (nextrecord->m_nextpkt == NULL) {
3689                                 so->so_rcv.sb_lastrecord = nextrecord;
3690                         }
3691                         SB_MB_CHECK(&so->so_rcv);
3692                 }
3693                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3694                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3695                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3696                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3697         }
3698
3699         if (delayed_copy_len) {
3700                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3701                 if (error)
3702                         goto release;
3703         }
3704         if (free_list != NULL) {
3705                 m_freem_list(free_list);
3706                 free_list = NULL;
3707         }
3708         if (need_event)
3709                 postevent(so, 0, EV_OOB);
3710
3711         if (orig_resid == uio_resid(uio) && orig_resid &&
3712             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3713                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3714                 goto restart;
3715         }
3716
3717         if (flagsp != NULL)
3718                 *flagsp |= flags;
3719 release:
3720 #ifdef MORE_LOCKING_DEBUG
3721         if (so->so_usecount <= 1) {
3722                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3723                     so, so->so_usecount);
3724                 /* NOTREACHED */
3725         }
3726 #endif
3727         if (delayed_copy_len)
3728                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3729
3730         if (free_list != NULL)
3731                 m_freem_list(free_list);
3732
3733         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
3734
3735         if (en_tracing) {
3736                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3737                     VM_KERNEL_ADDRPERM(so),
3738                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3739                     (int64_t)(orig_resid - uio_resid(uio)));
3740         }
3741         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3742             so->so_rcv.sb_cc, 0, error);
3743
3744         return (error);
3745 }
3746
3747 /*
3748  * Returns:     0                       Success
3749  *      uiomove:EFAULT
3750  */
3751 static int
3752 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3753     user_ssize_t *resid)
3754 {
3755         int error = 0;
3756         struct mbuf *m;
3757
3758         m = *free_list;
3759
3760         socket_unlock(so, 0);
3761
3762         while (m != NULL && error == 0) {
3763                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3764                 m = m->m_next;
3765         }
3766         m_freem_list(*free_list);
3767
3768         *free_list = NULL;
3769         *resid = 0;
3770
3771         socket_lock(so, 0);
3772
3773         return (error);
3774 }
3775
3776 static int
3777 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3778     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3779 {
3780 #pragma unused(so)
3781         int error = 0;
3782         struct mbuf *ml, *m;
3783         int i = 0;
3784         struct uio *auio;
3785
3786         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3787             ml = ml->m_nextpkt, i++) {
3788                 auio = msgarray[i].uio;
3789                 for (m = ml; m != NULL; m = m->m_next) {
3790                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3791                         if (error != 0)
3792                                 goto out;
3793                 }
3794         }
3795 out:
3796         m_freem_list(*free_list);
3797
3798         *free_list = NULL;
3799         *resid = 0;
3800
3801         return (error);
3802 }
3803
3804 int
3805 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3806     int *flagsp)
3807 {
3808         struct mbuf *m;
3809         struct mbuf *nextrecord;
3810         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3811         int error;
3812         user_ssize_t len, pktlen, delayed_copy_len = 0;
3813         struct protosw *pr = so->so_proto;
3814         user_ssize_t resid;
3815         struct proc *p = current_proc();
3816         struct uio *auio = NULL;
3817         int npkts = 0;
3818         int sblocked = 0;
3819         struct sockaddr **psa = NULL;
3820         struct mbuf **controlp = NULL;
3821         int can_delay;
3822         int flags;
3823         struct mbuf *free_others = NULL;
3824
3825         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3826             so, uiocnt,
3827             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3828
3829         /*
3830          * Sanity checks:
3831          * - Only supports don't wait flags
3832          * - Only support datagram sockets (could be extended to raw)
3833          * - Must be atomic
3834          * - Protocol must support packet chains
3835          * - The uio array is NULL (should we panic?)
3836          */
3837         if (flagsp != NULL)
3838                 flags = *flagsp;
3839         else
3840                 flags = 0;
3841         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3842             MSG_NBIO)) {
3843                 printf("%s invalid flags 0x%x\n", __func__, flags);
3844                 error = EINVAL;
3845                 goto out;
3846         }
3847         if (so->so_type != SOCK_DGRAM) {
3848                 error = EINVAL;
3849                 goto out;
3850         }
3851         if (sosendallatonce(so) == 0) {
3852                 error = EINVAL;
3853                 goto out;
3854         }
3855         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3856                 error = EPROTONOSUPPORT;
3857                 goto out;
3858         }
3859         if (msgarray == NULL) {
3860                 printf("%s uioarray is NULL\n", __func__);
3861                 error = EINVAL;
3862                 goto out;
3863         }
3864         if (uiocnt == 0) {
3865                 printf("%s uiocnt is 0\n", __func__);
3866                 error = EINVAL;
3867                 goto out;
3868         }
3869         /*
3870          * Sanity check on the length passed by caller as we are making 'int'
3871          * comparisons
3872          */
3873         resid = recv_msg_array_resid(msgarray, uiocnt);
3874         if (resid < 0 || resid > INT_MAX) {
3875                 error = EINVAL;
3876                 goto out;
3877         }
3878
3879         if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3880                 can_delay = 1;
3881         else
3882                 can_delay = 0;
3883
3884         socket_lock(so, 1);
3885         so_update_last_owner_locked(so, p);
3886         so_update_policy(so);
3887
3888 #if NECP
3889         so_update_necp_policy(so, NULL, NULL);
3890 #endif /* NECP */
3891
3892         /*
3893          * If a recv attempt is made on a previously-accepted socket
3894          * that has been marked as inactive (disconnected), reject
3895          * the request.
3896          */
3897         if (so->so_flags & SOF_DEFUNCT) {
3898                 struct sockbuf *sb = &so->so_rcv;
3899
3900                 error = ENOTCONN;
3901                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3902                     __func__, proc_pid(p), proc_best_name(p),
3903                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3904                     SOCK_DOM(so), SOCK_TYPE(so), error);
3905                 /*
3906                  * This socket should have been disconnected and flushed
3907                  * prior to being returned from sodefunct(); there should
3908                  * be no data on its receive list, so panic otherwise.
3909                  */
3910                 if (so->so_state & SS_DEFUNCT)
3911                         sb_empty_assert(sb, __func__);
3912                 goto release;
3913         }
3914
3915 next:
3916         /*
3917          * The uio may be empty
3918          */
3919         if (npkts >= uiocnt) {
3920                 error = 0;
3921                 goto release;
3922         }
3923 restart:
3924         /*
3925          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3926          * and if so just return to the caller.  This could happen when
3927          * soreceive() is called by a socket upcall function during the
3928          * time the socket is freed.  The socket buffer would have been
3929          * locked across the upcall, therefore we cannot put this thread
3930          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3931          * we may livelock), because the lock on the socket buffer will
3932          * only be released when the upcall routine returns to its caller.
3933          * Because the socket has been officially closed, there can be
3934          * no further read on it.
3935          */
3936         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3937             (SS_NOFDREF | SS_CANTRCVMORE)) {
3938                 error = 0;
3939                 goto release;
3940         }
3941
3942         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3943         if (error) {
3944                 goto release;
3945         }
3946         sblocked = 1;
3947
3948         m = so->so_rcv.sb_mb;
3949         /*
3950          * Block awaiting more datagram if needed
3951          */
3952         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3953             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3954             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3955                 /*
3956                  * Panic if we notice inconsistencies in the socket's
3957                  * receive list; both sb_mb and sb_cc should correctly
3958                  * reflect the contents of the list, otherwise we may
3959                  * end up with false positives during select() or poll()
3960                  * which could put the application in a bad state.
3961                  */
3962                 SB_MB_CHECK(&so->so_rcv);
3963
3964                 if (so->so_error) {
3965                         error = so->so_error;
3966                         if ((flags & MSG_PEEK) == 0)
3967                                 so->so_error = 0;
3968                         goto release;
3969                 }
3970                 if (so->so_state & SS_CANTRCVMORE) {
3971                         goto release;
3972                 }
3973                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3974                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3975                         error = ENOTCONN;
3976                         goto release;
3977                 }
3978                 if ((so->so_state & SS_NBIO) ||
3979                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3980                         error = EWOULDBLOCK;
3981                         goto release;
3982                 }
3983                 /*
3984                  * Do not block if we got some data
3985                  */
3986                 if (free_list != NULL) {
3987                         error = 0;
3988                         goto release;
3989                 }
3990
3991                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3992                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3993
3994                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3995                 sblocked = 0;
3996
3997                 error = sbwait(&so->so_rcv);
3998                 if (error) {
3999                         goto release;
4000                 }
4001                 goto restart;
4002         }
4003
4004         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4005         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4006         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4007
4008         /*
4009          * Consume the current uio index as we have a datagram
4010          */
4011         auio = msgarray[npkts].uio;
4012         resid = uio_resid(auio);
4013         msgarray[npkts].which |= SOCK_MSG_DATA;
4014         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4015             &msgarray[npkts].psa : NULL;
4016         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4017             &msgarray[npkts].controlp : NULL;
4018         npkts += 1;
4019         nextrecord = m->m_nextpkt;
4020
4021         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4022                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4023                 if (error == ERESTART)
4024                         goto restart;
4025                 else if (error != 0)
4026                         goto release;
4027         }
4028
4029         if (m != NULL && m->m_type == MT_CONTROL) {
4030                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4031                 if (error != 0)
4032                         goto release;
4033         }
4034
4035         if (m->m_pkthdr.len == 0) {
4036                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4037                     __func__, __LINE__,
4038                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4039                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4040                     m->m_type);
4041         }
4042
4043         /*
4044          * Loop to copy the mbufs of the current record
4045          * Support zero length packets
4046          */
4047         ml = NULL;
4048         pktlen = 0;
4049         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4050                 if (m->m_len == 0)
4051                         panic("%p m_len zero", m);
4052                 if (m->m_type == 0)
4053                         panic("%p m_type zero", m);
4054                 /*
4055                  * Clip to the residual length
4056                  */
4057                 if (len > m->m_len)
4058                         len = m->m_len;
4059                 pktlen += len;
4060                 /*
4061                  * Copy the mbufs via the uio or delay the copy
4062                  * Sockbuf must be consistent here (points to current mbuf,
4063                  * it points to next record) when we drop priority;
4064                  * we must note any additions to the sockbuf when we
4065                  * block interrupts again.
4066                  */
4067                 if (len > 0 && can_delay == 0) {
4068                         socket_unlock(so, 0);
4069                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4070                         socket_lock(so, 0);
4071                         if (error)
4072                                 goto release;
4073                 } else {
4074                         delayed_copy_len += len;
4075                 }
4076
4077                 if (len == m->m_len) {
4078                         /*
4079                          * m was entirely copied
4080                          */
4081                         sbfree(&so->so_rcv, m);
4082                         nextrecord = m->m_nextpkt;
4083                         m->m_nextpkt = NULL;
4084
4085                         /*
4086                          * Set the first packet to the head of the free list
4087                          */
4088                         if (free_list == NULL)
4089                                 free_list = m;
4090                         /*
4091                          * Link current packet to tail of free list
4092                          */
4093                         if (ml == NULL) {
4094                                 if (free_tail != NULL)
4095                                         free_tail->m_nextpkt = m;
4096                                 free_tail = m;
4097                         }
4098                         /*
4099                          * Link current mbuf to last mbuf of current packet
4100                          */
4101                         if (ml != NULL)
4102                                 ml->m_next = m;
4103                         ml = m;
4104
4105                         /*
4106                          * Move next buf to head of socket buffer
4107                          */
4108                         so->so_rcv.sb_mb = m = ml->m_next;
4109                         ml->m_next = NULL;
4110
4111                         if (m != NULL) {
4112                                 m->m_nextpkt = nextrecord;
4113                                 if (nextrecord == NULL)
4114                                         so->so_rcv.sb_lastrecord = m;
4115                         } else {
4116                                 so->so_rcv.sb_mb = nextrecord;
4117                                 SB_EMPTY_FIXUP(&so->so_rcv);
4118                         }
4119                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4120                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4121                 } else {
4122                         /*
4123                          * Stop the loop on partial copy
4124                          */
4125                         break;
4126                 }
4127         }
4128 #ifdef MORE_LOCKING_DEBUG
4129         if (so->so_usecount <= 1) {
4130                 panic("%s: after big while so=%llx ref=%d on socket\n",
4131                     __func__,
4132                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4133                 /* NOTREACHED */
4134         }
4135 #endif
4136         /*
4137          * Tell the caller we made a partial copy
4138          */
4139         if (m != NULL) {
4140                 if (so->so_options & SO_DONTTRUNC) {
4141                         /*
4142                          * Copyout first the freelist then the partial mbuf
4143                          */
4144                         socket_unlock(so, 0);
4145                         if (delayed_copy_len)
4146                                 error = sodelayed_copy_list(so, msgarray,
4147                                     uiocnt, &free_list, &delayed_copy_len);
4148
4149                         if (error == 0) {
4150                                 error = uiomove(mtod(m, caddr_t), (int)len,
4151                                     auio);
4152                         }
4153                         socket_lock(so, 0);
4154                         if (error)
4155                                 goto release;
4156
4157                         m->m_data += len;
4158                         m->m_len -= len;
4159                         so->so_rcv.sb_cc -= len;
4160                         flags |= MSG_RCVMORE;
4161                 } else {
4162                         (void) sbdroprecord(&so->so_rcv);
4163                         nextrecord = so->so_rcv.sb_mb;
4164                         m = NULL;
4165                         flags |= MSG_TRUNC;
4166                 }
4167         }
4168
4169         if (m == NULL) {
4170                 so->so_rcv.sb_mb = nextrecord;
4171                 /*
4172                  * First part is an inline SB_EMPTY_FIXUP().  Second
4173                  * part makes sure sb_lastrecord is up-to-date if
4174                  * there is still data in the socket buffer.
4175                  */
4176                 if (so->so_rcv.sb_mb == NULL) {
4177                         so->so_rcv.sb_mbtail = NULL;
4178                         so->so_rcv.sb_lastrecord = NULL;
4179                 } else if (nextrecord->m_nextpkt == NULL) {
4180                         so->so_rcv.sb_lastrecord = nextrecord;
4181                 }
4182                 SB_MB_CHECK(&so->so_rcv);
4183         }
4184         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4185         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4186
4187         /*
4188          * We can continue to the next packet as long as:
4189          * - We haven't exhausted the uio array
4190          * - There was no error
4191          * - A packet was not truncated
4192          * - We can still receive more data
4193          */
4194         if (npkts < uiocnt && error == 0 &&
4195             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4196             (so->so_state & SS_CANTRCVMORE) == 0) {
4197                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4198                 sblocked = 0;
4199
4200                 goto next;
4201         }
4202         if (flagsp != NULL)
4203                 *flagsp |= flags;
4204
4205 release:
4206         /*
4207          * pru_rcvd may cause more data to be received if the socket lock
4208          * is dropped so we set MSG_HAVEMORE now based on what we know.
4209          * That way the caller won't be surprised if it receives less data
4210          * than requested.
4211          */
4212         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4213                 flags |= MSG_HAVEMORE;
4214
4215         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4216                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4217
4218         if (sblocked)
4219                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4220         else
4221                 socket_unlock(so, 1);
4222
4223         if (delayed_copy_len)
4224                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4225                     &free_list, &delayed_copy_len);
4226 out:
4227         /*
4228          * Amortize the cost of freeing the mbufs
4229          */
4230         if (free_list != NULL)
4231                 m_freem_list(free_list);
4232         if (free_others != NULL)
4233                 m_freem_list(free_others);
4234
4235         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4236             0, 0, 0, 0);
4237         return (error);
4238 }
4239
4240 /*
4241  * Returns:     0                       Success
4242  *              EINVAL
4243  *              ENOTCONN
4244  *      <pru_shutdown>:EINVAL
4245  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4246  *      <pru_shutdown>:ENOBUFS[TCP]
4247  *      <pru_shutdown>:EMSGSIZE[TCP]
4248  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4249  *      <pru_shutdown>:ENETUNREACH[TCP]
4250  *      <pru_shutdown>:ENETDOWN[TCP]
4251  *      <pru_shutdown>:ENOMEM[TCP]
4252  *      <pru_shutdown>:EACCES[TCP]
4253  *      <pru_shutdown>:EMSGSIZE[TCP]
4254  *      <pru_shutdown>:ENOBUFS[TCP]
4255  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4256  *      <pru_shutdown>:???              [other protocol families]
4257  */
4258 int
4259 soshutdown(struct socket *so, int how)
4260 {
4261         int error;
4262
4263         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4264
4265         switch (how) {
4266         case SHUT_RD:
4267         case SHUT_WR:
4268         case SHUT_RDWR:
4269                 socket_lock(so, 1);
4270                 if ((so->so_state &
4271                     (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4272                         error = ENOTCONN;
4273                 } else {
4274                         error = soshutdownlock(so, how);
4275                 }
4276                 socket_unlock(so, 1);
4277                 break;
4278         default:
4279                 error = EINVAL;
4280                 break;
4281         }
4282
4283         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4284
4285         return (error);
4286 }
4287
4288 int
4289 soshutdownlock_final(struct socket *so, int how)
4290 {
4291         struct protosw *pr = so->so_proto;
4292         int error = 0;
4293
4294         sflt_notify(so, sock_evt_shutdown, &how);
4295
4296         if (how != SHUT_WR) {
4297                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4298                         /* read already shut down */
4299                         error = ENOTCONN;
4300                         goto done;
4301                 }
4302                 sorflush(so);
4303                 postevent(so, 0, EV_RCLOSED);
4304         }
4305         if (how != SHUT_RD) {
4306                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4307                         /* write already shut down */
4308                         error = ENOTCONN;
4309                         goto done;
4310                 }
4311                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4312                 postevent(so, 0, EV_WCLOSED);
4313         }
4314 done:
4315         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4316         return (error);
4317 }
4318
/*
 * Shutdown entry point used by soshutdown(): gives an attached content
 * filter the first say, then falls through to soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * A content filter may delay the actual shutdown until it
         * has processed the pending data
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_error = cfil_sock_shutdown(so, &how);

                /* EJUSTRETURN is reported to the caller as success */
                if (cfil_error == EJUSTRETURN)
                        return (0);
                if (cfil_error != 0)
                        return (cfil_error);
        }
#endif /* CONTENT_FILTER */

        return (soshutdownlock_final(so, how));
}
4345
4346 void
4347 sowflush(struct socket *so)
4348 {
4349         struct sockbuf *sb = &so->so_snd;
4350
4351         /*
4352          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4353          * to prevent the socket buffer from being unexpectedly altered
4354          * while it is used by another thread in socket send/receive.
4355          *
4356          * sblock() must not fail here, hence the assertion.
4357          */
4358         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4359         VERIFY(sb->sb_flags & SB_LOCK);
4360
4361         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4362         sb->sb_flags            |= SB_DROP;
4363         sb->sb_upcall           = NULL;
4364         sb->sb_upcallarg        = NULL;
4365
4366         sbunlock(sb, TRUE);     /* keep socket locked */
4367
4368         selthreadclear(&sb->sb_sel);
4369         sbrelease(sb);
4370 }
4371
4372 void
4373 sorflush(struct socket *so)
4374 {
4375         struct sockbuf *sb = &so->so_rcv;
4376         struct protosw *pr = so->so_proto;
4377         struct sockbuf asb;
4378 #ifdef notyet
4379         lck_mtx_t *mutex_held;
4380         /*
4381          * XXX: This code is currently commented out, because we may get here
4382          * as part of sofreelastref(), and at that time, pr_getlock() may no
4383          * longer be able to return us the lock; this will be fixed in future.
4384          */
4385         if (so->so_proto->pr_getlock != NULL)
4386                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4387         else
4388                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4389
4390         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4391 #endif /* notyet */
4392
4393         sflt_notify(so, sock_evt_flush_read, NULL);
4394
4395         socantrcvmore(so);
4396
4397         /*
4398          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4399          * to prevent the socket buffer from being unexpectedly altered
4400          * while it is used by another thread in socket send/receive.
4401          *
4402          * sblock() must not fail here, hence the assertion.
4403          */
4404         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4405         VERIFY(sb->sb_flags & SB_LOCK);
4406
4407         /*
4408          * Copy only the relevant fields from "sb" to "asb" which we
4409          * need for sbrelease() to function.  In particular, skip
4410          * sb_sel as it contains the wait queue linkage, which would
4411          * wreak havoc if we were to issue selthreadclear() on "asb".
4412          * Make sure to not carry over SB_LOCK in "asb", as we need
4413          * to acquire it later as part of sbrelease().
4414          */
4415         bzero(&asb, sizeof (asb));
4416         asb.sb_cc               = sb->sb_cc;
4417         asb.sb_hiwat            = sb->sb_hiwat;
4418         asb.sb_mbcnt            = sb->sb_mbcnt;
4419         asb.sb_mbmax            = sb->sb_mbmax;
4420         asb.sb_ctl              = sb->sb_ctl;
4421         asb.sb_lowat            = sb->sb_lowat;
4422         asb.sb_mb               = sb->sb_mb;
4423         asb.sb_mbtail           = sb->sb_mbtail;
4424         asb.sb_lastrecord       = sb->sb_lastrecord;
4425         asb.sb_so               = sb->sb_so;
4426         asb.sb_flags            = sb->sb_flags;
4427         asb.sb_flags            &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4428         asb.sb_flags            |= SB_DROP;
4429
4430         /*
4431          * Ideally we'd bzero() these and preserve the ones we need;
4432          * but to do that we'd need to shuffle things around in the
4433          * sockbuf, and we can't do it now because there are KEXTS
4434          * that are directly referring to the socket structure.
4435          *
4436          * Setting SB_DROP acts as a barrier to prevent further appends.
4437          * Clearing SB_SEL is done for selthreadclear() below.
4438          */
4439         sb->sb_cc               = 0;
4440         sb->sb_hiwat            = 0;
4441         sb->sb_mbcnt            = 0;
4442         sb->sb_mbmax            = 0;
4443         sb->sb_ctl              = 0;
4444         sb->sb_lowat            = 0;
4445         sb->sb_mb               = NULL;
4446         sb->sb_mbtail           = NULL;
4447         sb->sb_lastrecord       = NULL;
4448         sb->sb_timeo.tv_sec     = 0;
4449         sb->sb_timeo.tv_usec    = 0;
4450         sb->sb_upcall           = NULL;
4451         sb->sb_upcallarg        = NULL;
4452         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4453         sb->sb_flags            |= SB_DROP;
4454
4455         sbunlock(sb, TRUE);     /* keep socket locked */
4456
4457         /*
4458          * Note that selthreadclear() is called on the original "sb" and
4459          * not the local "asb" because of the way wait queue linkage is
4460          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4461          * should no longer be set (cleared above.)
4462          */
4463         selthreadclear(&sb->sb_sel);
4464
4465         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4466                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4467
4468         sbrelease(&asb);
4469 }
4470
4471 /*
4472  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4473  * an additional variant to handle the case where the option value needs
4474  * to be some kind of integer, but not a specific size.
4475  * In addition to their use here, these functions are also called by the
4476  * protocol-level pr_ctloutput() routines.
4477  *
4478  * Returns:     0                       Success
4479  *              EINVAL
4480  *      copyin:EFAULT
4481  */
4482 int
4483 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4484 {
4485         size_t  valsize;
4486
4487         /*
4488          * If the user gives us more than we wanted, we ignore it,
4489          * but if we don't get the minimum length the caller
4490          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4491          * is set to however much we actually retrieved.
4492          */
4493         if ((valsize = sopt->sopt_valsize) < minlen)
4494                 return (EINVAL);
4495         if (valsize > len)
4496                 sopt->sopt_valsize = valsize = len;
4497
4498         if (sopt->sopt_p != kernproc)
4499                 return (copyin(sopt->sopt_val, buf, valsize));
4500
4501         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4502         return (0);
4503 }
4504
4505 /*
4506  * sooptcopyin_timeval
4507  *   Copy in a timeval value into tv_p, and take into account whether the
4508  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4509  *   code here so that we can verify the 64-bit tv_sec value before we lose
4510  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4511  */
4512 static int
4513 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4514 {
4515         int                     error;
4516
4517         if (proc_is64bit(sopt->sopt_p)) {
4518                 struct user64_timeval   tv64;
4519
4520                 if (sopt->sopt_valsize < sizeof (tv64))
4521                         return (EINVAL);
4522
4523                 sopt->sopt_valsize = sizeof (tv64);
4524                 if (sopt->sopt_p != kernproc) {
4525                         error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4526                         if (error != 0)
4527                                 return (error);
4528                 } else {
4529                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4530                             sizeof (tv64));
4531                 }
4532                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4533                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4534                         return (EDOM);
4535
4536                 tv_p->tv_sec = tv64.tv_sec;
4537                 tv_p->tv_usec = tv64.tv_usec;
4538         } else {
4539                 struct user32_timeval   tv32;
4540
4541                 if (sopt->sopt_valsize < sizeof (tv32))
4542                         return (EINVAL);
4543
4544                 sopt->sopt_valsize = sizeof (tv32);
4545                 if (sopt->sopt_p != kernproc) {
4546                         error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4547                         if (error != 0) {
4548                                 return (error);
4549                         }
4550                 } else {
4551                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4552                             sizeof (tv32));
4553                 }
4554 #ifndef __LP64__
4555                 /*
4556                  * K64todo "comparison is always false due to
4557                  * limited range of data type"
4558                  */
4559                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4560                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4561                         return (EDOM);
4562 #endif
4563                 tv_p->tv_sec = tv32.tv_sec;
4564                 tv_p->tv_usec = tv32.tv_usec;
4565         }
4566         return (0);
4567 }
4568
4569 static int
4570 soopt_cred_check(struct socket *so, int priv)
4571 {
4572         kauth_cred_t cred =  NULL;
4573         proc_t ep = PROC_NULL;
4574         int error;
4575
4576         if (so->so_flags & SOF_DELEGATED) {
4577                 ep = proc_find(so->e_pid);
4578                 if (ep)
4579                         cred = kauth_cred_proc_ref(ep);
4580         }
4581         error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4582         if (cred)
4583                 kauth_cred_unref(&cred);
4584         if (ep != PROC_NULL)
4585                 proc_rele(ep);
4586
4587         return (error);
4588 }
4589
4590 /*
4591  * Returns:     0                       Success
4592  *              EINVAL
4593  *              ENOPROTOOPT
4594  *              ENOBUFS
4595  *              EDOM
4596  *      sooptcopyin:EINVAL
4597  *      sooptcopyin:EFAULT
4598  *      sooptcopyin_timeval:EINVAL
4599  *      sooptcopyin_timeval:EFAULT
4600  *      sooptcopyin_timeval:EDOM
4601  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4602  *      <pr_ctloutput>:???
4603  *      sflt_attach_private:???         [whatever a filter author chooses]
4604  *      <sf_setoption>:???              [whatever a filter author chooses]
4605  *
4606  * Notes:       Other <pr_ctloutput> returns depend on the protocol family; all
4607  *              <sf_setoption> returns depend on what the filter author causes
4608  *              their filter to return.
4609  */
4610 int
4611 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4612 {
4613         int     error, optval;
4614         struct  linger l;
4615         struct  timeval tv;
4616 #if CONFIG_MACF_SOCKET
4617         struct mac extmac;
4618 #endif /* MAC_SOCKET */
4619
4620         if (sopt->sopt_dir != SOPT_SET)
4621                 sopt->sopt_dir = SOPT_SET;
4622
4623         if (dolock)
4624                 socket_lock(so, 1);
4625
4626         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4627             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4628             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4629                 /* the socket has been shutdown, no more sockopt's */
4630                 error = EINVAL;
4631                 goto out;
4632         }
4633
4634         error = sflt_setsockopt(so, sopt);
4635         if (error != 0) {
4636                 if (error == EJUSTRETURN)
4637                         error = 0;
4638                 goto out;
4639         }
4640
4641         if (sopt->sopt_level != SOL_SOCKET) {
4642                 if (so->so_proto != NULL &&
4643                     so->so_proto->pr_ctloutput != NULL) {
4644                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4645                         goto out;
4646                 }
4647                 error = ENOPROTOOPT;
4648         } else {
4649                 /*
4650                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4651                  * the protocol layer, if needed.  A zero value returned from
4652                  * the handler means use default socket-level processing as
4653                  * done by the rest of this routine.  Otherwise, any other
4654                  * return value indicates that the option is unsupported.
4655                  */
4656                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4657                     pru_socheckopt(so, sopt)) != 0)
4658                         goto out;
4659
4660                 error = 0;
4661                 switch (sopt->sopt_name) {
4662                 case SO_LINGER:
4663                 case SO_LINGER_SEC:
4664                         error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4665                         if (error != 0)
4666                                 goto out;
4667
4668                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4669                             l.l_linger : l.l_linger * hz;
4670                         if (l.l_onoff != 0)
4671                                 so->so_options |= SO_LINGER;
4672                         else
4673                                 so->so_options &= ~SO_LINGER;
4674                         break;
4675
4676                 case SO_DEBUG:
4677                 case SO_KEEPALIVE:
4678                 case SO_DONTROUTE:
4679                 case SO_USELOOPBACK:
4680                 case SO_BROADCAST:
4681                 case SO_REUSEADDR:
4682                 case SO_REUSEPORT:
4683                 case SO_OOBINLINE:
4684                 case SO_TIMESTAMP:
4685                 case SO_TIMESTAMP_MONOTONIC:
4686                 case SO_DONTTRUNC:
4687                 case SO_WANTMORE:
4688                 case SO_WANTOOBFLAG:
4689                 case SO_NOWAKEFROMSLEEP:
4690                 case SO_NOAPNFALLBK:
4691                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4692                             sizeof (optval));
4693                         if (error != 0)
4694                                 goto out;
4695                         if (optval)
4696                                 so->so_options |= sopt->sopt_name;
4697                         else
4698                                 so->so_options &= ~sopt->sopt_name;
4699                         break;
4700
4701                 case SO_SNDBUF:
4702                 case SO_RCVBUF:
4703                 case SO_SNDLOWAT:
4704                 case SO_RCVLOWAT:
4705                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4706                             sizeof (optval));
4707                         if (error != 0)
4708                                 goto out;
4709
4710                         /*
4711                          * Values < 1 make no sense for any of these
4712                          * options, so disallow them.
4713                          */
4714                         if (optval < 1) {
4715                                 error = EINVAL;
4716                                 goto out;
4717                         }
4718
4719                         switch (sopt->sopt_name) {
4720                         case SO_SNDBUF:
4721                         case SO_RCVBUF: {
4722                                 struct sockbuf *sb =
4723                                     (sopt->sopt_name == SO_SNDBUF) ?
4724                                     &so->so_snd : &so->so_rcv;
4725                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4726                                         error = ENOBUFS;
4727                                         goto out;
4728                                 }
4729                                 sb->sb_flags |= SB_USRSIZE;
4730                                 sb->sb_flags &= ~SB_AUTOSIZE;
4731                                 sb->sb_idealsize = (u_int32_t)optval;
4732                                 break;
4733                         }
4734                         /*
4735                          * Make sure the low-water is never greater than
4736                          * the high-water.
4737                          */
4738                         case SO_SNDLOWAT: {
4739                                 int space = sbspace(&so->so_snd);
4740                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
4741
4742                                 if (so->so_snd.sb_flags & SB_UNIX) {
4743                                         struct unpcb *unp =
4744                                             (struct unpcb *)(so->so_pcb);
4745                                         if (unp != NULL &&
4746                                             unp->unp_conn != NULL) {
4747                                                 hiwat += unp->unp_conn->unp_cc;
4748                                         }
4749                                 }
4750
4751                                 so->so_snd.sb_lowat =
4752                                     (optval > hiwat) ?
4753                                     hiwat : optval;
4754
4755                                 if (space >= so->so_snd.sb_lowat) {
4756                                         sowwakeup(so);
4757                                 }
4758                                 break;
4759                         }
4760                         case SO_RCVLOWAT: {
4761                                 int64_t data_len;
4762                                 so->so_rcv.sb_lowat =
4763                                     (optval > so->so_rcv.sb_hiwat) ?
4764                                     so->so_rcv.sb_hiwat : optval;
4765                                 data_len = so->so_rcv.sb_cc
4766                                     - so->so_rcv.sb_ctl;
4767                                 if (data_len >= so->so_rcv.sb_lowat)
4768                                     sorwakeup(so);
4769                                 break;
4770                         }
4771                         }
4772                         break;
4773
4774                 case SO_SNDTIMEO:
4775                 case SO_RCVTIMEO:
4776                         error = sooptcopyin_timeval(sopt, &tv);
4777                         if (error != 0)
4778                                 goto out;
4779
4780                         switch (sopt->sopt_name) {
4781                         case SO_SNDTIMEO:
4782                                 so->so_snd.sb_timeo = tv;
4783                                 break;
4784                         case SO_RCVTIMEO:
4785                                 so->so_rcv.sb_timeo = tv;
4786                                 break;
4787                         }
4788                         break;
4789
4790                 case SO_NKE: {
4791                         struct so_nke nke;
4792
4793                         error = sooptcopyin(sopt, &nke, sizeof (nke),
4794                             sizeof (nke));
4795                         if (error != 0)
4796                                 goto out;
4797
4798                         error = sflt_attach_internal(so, nke.nke_handle);
4799                         break;
4800                 }
4801
4802                 case SO_NOSIGPIPE:
4803                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4804                             sizeof (optval));
4805                         if (error != 0)
4806                                 goto out;
4807                         if (optval != 0)
4808                                 so->so_flags |= SOF_NOSIGPIPE;
4809                         else
4810                                 so->so_flags &= ~SOF_NOSIGPIPE;
4811                         break;
4812
4813                 case SO_NOADDRERR:
4814                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4815                             sizeof (optval));
4816                         if (error != 0)
4817                                 goto out;
4818                         if (optval != 0)
4819                                 so->so_flags |= SOF_NOADDRAVAIL;
4820                         else
4821                                 so->so_flags &= ~SOF_NOADDRAVAIL;
4822                         break;
4823
4824                 case SO_REUSESHAREUID:
4825                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4826                             sizeof (optval));
4827                         if (error != 0)
4828                                 goto out;
4829                         if (optval != 0)
4830                                 so->so_flags |= SOF_REUSESHAREUID;
4831                         else
4832                                 so->so_flags &= ~SOF_REUSESHAREUID;
4833                         break;
4834
4835                 case SO_NOTIFYCONFLICT:
4836                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4837                                 error = EPERM;
4838                                 goto out;
4839                         }
4840                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4841                             sizeof (optval));
4842                         if (error != 0)
4843                                 goto out;
4844                         if (optval != 0)
4845                                 so->so_flags |= SOF_NOTIFYCONFLICT;
4846                         else
4847                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4848                         break;
4849
4850                 case SO_RESTRICTIONS:
4851                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4852                             sizeof (optval));
4853                         if (error != 0)
4854                                 goto out;
4855
4856                         error = so_set_restrictions(so, optval);
4857                         break;
4858
4859                 case SO_AWDL_UNRESTRICTED:
4860                         if (SOCK_DOM(so) != PF_INET &&
4861                             SOCK_DOM(so) != PF_INET6) {
4862                                 error = EOPNOTSUPP;
4863                                 goto out;
4864                         }
4865                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4866                             sizeof(optval));
4867                         if (error != 0)
4868                                 goto out;
4869                         if (optval != 0) {
4870                                 error = soopt_cred_check(so,
4871                                     PRIV_NET_RESTRICTED_AWDL);
4872                                 if (error == 0)
4873                                         inp_set_awdl_unrestricted(
4874                                             sotoinpcb(so));
4875                         } else
4876                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
4877                         break;
4878                 case SO_INTCOPROC_ALLOW:
4879                         if (SOCK_DOM(so) != PF_INET6) {
4880                                 error = EOPNOTSUPP;
4881                                 goto out;
4882                         }
4883                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4884                             sizeof(optval));
4885                         if (error != 0)
4886                                 goto out;
4887                         if (optval != 0 &&
4888                                         inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
4889                                 error = soopt_cred_check(so,
4890                                     PRIV_NET_RESTRICTED_INTCOPROC);
4891                                 if (error == 0)
4892                                         inp_set_intcoproc_allowed(
4893                                             sotoinpcb(so));
4894                         } else if (optval == 0)
4895                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
4896                         break;
4897
4898                 case SO_LABEL:
4899 #if CONFIG_MACF_SOCKET
4900                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4901                             sizeof (extmac))) != 0)
4902                                 goto out;
4903
4904                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4905                             so, &extmac);
4906 #else
4907                         error = EOPNOTSUPP;
4908 #endif /* MAC_SOCKET */
4909                         break;
4910
4911                 case SO_UPCALLCLOSEWAIT:
4912                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4913                             sizeof (optval));
4914                         if (error != 0)
4915                                 goto out;
4916                         if (optval != 0)
4917                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4918                         else
4919                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4920                         break;
4921
4922                 case SO_RANDOMPORT:
4923                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4924                             sizeof (optval));
4925                         if (error != 0)
4926                                 goto out;
4927                         if (optval != 0)
4928                                 so->so_flags |= SOF_BINDRANDOMPORT;
4929                         else
4930                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
4931                         break;
4932
4933                 case SO_NP_EXTENSIONS: {
4934                         struct so_np_extensions sonpx;
4935
4936                         error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4937                             sizeof (sonpx));
4938                         if (error != 0)
4939                                 goto out;
4940                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4941                                 error = EINVAL;
4942                                 goto out;
4943                         }
4944                         /*
4945                          * Only one bit defined for now
4946                          */
4947                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4948                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4949                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
4950                                 else
4951                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4952                         }
4953                         break;
4954                 }
4955
4956                 case SO_TRAFFIC_CLASS: {
4957                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4958                             sizeof (optval));
4959                         if (error != 0)
4960                                 goto out;
4961                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
4962                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
4963                                 error = so_set_net_service_type(so, netsvc);
4964                                 goto out;
4965                         }
4966                         error = so_set_traffic_class(so, optval);
4967                         if (error != 0)
4968                                 goto out;
4969                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
4970                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
4971                         break;
4972                 }
4973
4974                 case SO_RECV_TRAFFIC_CLASS: {
4975                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4976                             sizeof (optval));
4977                         if (error != 0)
4978                                 goto out;
4979                         if (optval == 0)
4980                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4981                         else
4982                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4983                         break;
4984                 }
4985
4986 #if (DEVELOPMENT || DEBUG)
4987                 case SO_TRAFFIC_CLASS_DBG: {
4988                         struct so_tcdbg so_tcdbg;
4989
4990                         error = sooptcopyin(sopt, &so_tcdbg,
4991                             sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4992                         if (error != 0)
4993                                 goto out;
4994                         error = so_set_tcdbg(so, &so_tcdbg);
4995                         if (error != 0)
4996                                 goto out;
4997                         break;
4998                 }
4999 #endif /* (DEVELOPMENT || DEBUG) */
5000
5001                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5002                         error = priv_check_cred(kauth_cred_get(),
5003                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5004                         if (error != 0)
5005                                 goto out;
5006                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5007                             sizeof (optval));
5008                         if (error != 0)
5009                                 goto out;
5010                         if (optval == 0)
5011                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5012                         else
5013                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5014                         break;
5015
5016                 case SO_DEFUNCTOK:
5017                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5018                             sizeof (optval));
5019                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5020                                 if (error == 0)
5021                                         error = EBADF;
5022                                 goto out;
5023                         }
5024                         /*
5025                          * Any process can set SO_DEFUNCTOK (clear
5026                          * SOF_NODEFUNCT), but only root can clear
5027                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5028                          */
5029                         if (optval == 0 &&
5030                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5031                                 error = EPERM;
5032                                 goto out;
5033                         }
5034                         if (optval)
5035                                 so->so_flags &= ~SOF_NODEFUNCT;
5036                         else
5037                                 so->so_flags |= SOF_NODEFUNCT;
5038
5039                         if (SOCK_DOM(so) == PF_INET ||
5040                             SOCK_DOM(so) == PF_INET6) {
5041                                 char s[MAX_IPv6_STR_LEN];
5042                                 char d[MAX_IPv6_STR_LEN];
5043                                 struct inpcb *inp = sotoinpcb(so);
5044
5045                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5046                                     "[%s %s:%d -> %s:%d] is now marked "
5047                                     "as %seligible for "
5048                                     "defunct\n", __func__, proc_selfpid(),
5049                                     proc_best_name(current_proc()),
5050                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5051                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5052                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5053                                     ((SOCK_DOM(so) == PF_INET) ?
5054                                     (void *)&inp->inp_laddr.s_addr :
5055                                     (void *)&inp->in6p_laddr), s, sizeof (s)),
5056                                     ntohs(inp->in6p_lport),
5057                                     inet_ntop(SOCK_DOM(so),
5058                                     (SOCK_DOM(so) == PF_INET) ?
5059                                     (void *)&inp->inp_faddr.s_addr :
5060                                     (void *)&inp->in6p_faddr, d, sizeof (d)),
5061                                     ntohs(inp->in6p_fport),
5062                                     (so->so_flags & SOF_NODEFUNCT) ?
5063                                     "not " : "");
5064                         } else {
5065                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5066                                     "is now marked as %seligible for "
5067                                     "defunct\n",
5068                                     __func__, proc_selfpid(),
5069                                     proc_best_name(current_proc()),
5070                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5071                                     SOCK_DOM(so), SOCK_TYPE(so),
5072                                     (so->so_flags & SOF_NODEFUNCT) ?
5073                                     "not " : "");
5074                         }
5075                         break;
5076
5077                 case SO_ISDEFUNCT:
5078                         /* This option is not settable */
5079                         error = EINVAL;
5080                         break;
5081
5082                 case SO_OPPORTUNISTIC:
5083                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5084                             sizeof (optval));
5085                         if (error == 0)
5086                                 error = so_set_opportunistic(so, optval);
5087                         break;
5088
5089                 case SO_FLUSH:
5090                         /* This option is handled by lower layer(s) */
5091                         error = 0;
5092                         break;
5093
5094                 case SO_RECV_ANYIF:
5095                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5096                             sizeof (optval));
5097                         if (error == 0)
5098                                 error = so_set_recv_anyif(so, optval);
5099                         break;
5100
5101                 case SO_TRAFFIC_MGT_BACKGROUND: {
5102                         /* This option is handled by lower layer(s) */
5103                         error = 0;
5104                         break;
5105                 }
5106
5107 #if FLOW_DIVERT
5108                 case SO_FLOW_DIVERT_TOKEN:
5109                         error = flow_divert_token_set(so, sopt);
5110                         break;
5111 #endif  /* FLOW_DIVERT */
5112
5113
5114                 case SO_DELEGATED:
5115                         if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5116                             sizeof (optval))) != 0)
5117                                 break;
5118
5119                         error = so_set_effective_pid(so, optval, sopt->sopt_p);
5120                         break;
5121
5122                 case SO_DELEGATED_UUID: {
5123                         uuid_t euuid;
5124
5125                         if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5126                             sizeof (euuid))) != 0)
5127                                 break;
5128
5129                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5130                         break;
5131                 }
5132
5133 #if NECP
5134                 case SO_NECP_ATTRIBUTES:
5135                         error = necp_set_socket_attributes(so, sopt);
5136                         break;
5137 #endif /* NECP */
5138
5139 #if MPTCP
5140                 case SO_MPTCP_FASTJOIN:
5141                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5142                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5143                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5144                                 error = ENOPROTOOPT;
5145                                 break;
5146                         }
5147
5148                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5149                             sizeof (optval));
5150                         if (error != 0)
5151                                 goto out;
5152                         if (optval == 0)
5153                                 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5154                         else
5155                                 so->so_flags |= SOF_MPTCP_FASTJOIN;
5156                         break;
5157 #endif /* MPTCP */
5158
5159                 case SO_EXTENDED_BK_IDLE:
5160                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5161                             sizeof (optval));
5162                         if (error == 0)
5163                                 error = so_set_extended_bk_idle(so, optval);
5164                         break;
5165
5166                 case SO_MARK_CELLFALLBACK:
5167                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5168                             sizeof(optval));
5169                         if (error != 0)
5170                                 goto out;
5171                         if (optval < 0) {
5172                                 error = EINVAL;
5173                                 goto out;
5174                         }
5175                         if (optval == 0)
5176                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5177                         else
5178                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5179                         break;
5180
5181                 case SO_NET_SERVICE_TYPE: {
5182                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5183                             sizeof(optval));
5184                         if (error != 0)
5185                                 goto out;
5186                         error = so_set_net_service_type(so, optval);
5187                         break;
5188                 }
5189
5190                 case SO_QOSMARKING_POLICY_OVERRIDE:
5191                         error = priv_check_cred(kauth_cred_get(),
5192                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5193                         if (error != 0)
5194                                 goto out;
5195                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5196                             sizeof(optval));
5197                         if (error != 0)
5198                                 goto out;
5199                         if (optval == 0)
5200                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5201                         else
5202                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5203                         break;
5204
5205                 default:
5206                         error = ENOPROTOOPT;
5207                         break;
5208                 }
5209                 if (error == 0 && so->so_proto != NULL &&
5210                     so->so_proto->pr_ctloutput != NULL) {
5211                         (void) so->so_proto->pr_ctloutput(so, sopt);
5212                 }
5213         }
5214 out:
5215         if (dolock)
5216                 socket_unlock(so, 1);
5217         return (error);
5218 }
5219
5220 /*
      * Helper routines for getsockopt.
      *
      * sooptcopyout() delivers a kernel-side option value to the requester
      * described by sopt.
      *
      * Documented get behavior is that a value is always returned, possibly
      * truncated to fit the caller's buffer: only min(len, sopt_valsize)
      * bytes of buf are delivered.  Traditional behavior, kept here, is that
      * sopt_valsize is then set to precisely the amount copied rather than
      * the total that was available.  The interface is not idempotent, so
      * the entire answer must be generated ahead of time.
      *
      * Nothing is copied when sopt_val is USER_ADDR_NULL.  Requests whose
      * sopt_p is kernproc are served with bcopy(); all others go through
      * copyout(), and its result is what this routine returns.
      */
5221 int
5222 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5223 {
5224         size_t  ncopy;
5225
5226         ncopy = min(len, sopt->sopt_valsize);
5227         sopt->sopt_valsize = ncopy;
5228
5229         if (sopt->sopt_val == USER_ADDR_NULL)
5230                 return (0);
5231         if (sopt->sopt_p == kernproc) {
5232                 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), ncopy);
5233                 return (0);
5234         }
5235         return (copyout(buf, sopt->sopt_val, ncopy));
5236 }
5248
5249 /*
      * Copy a timeval out to the requester in the layout that matches the
      * requesting process: struct user64_timeval when
      * proc_is64bit(sopt->sopt_p) is true, struct user32_timeval otherwise.
      *
      * As in sooptcopyout(), at most sopt_valsize bytes are delivered,
      * sopt_valsize is updated to the amount delivered, a USER_ADDR_NULL
      * sopt_val suppresses the copy, and kernproc requests use bcopy()
      * in place of copyout().  Returns 0 or the copyout() error.
      */
5250 static int
5251 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5252 {
5253         struct user64_timeval   wide;
5254         struct user32_timeval   narrow;
5255         const void              *src;
5256         size_t                  srclen, ncopy;
5257
5258         if (!proc_is64bit(sopt->sopt_p)) {
5259                 narrow.tv_sec = tv_p->tv_sec;
5260                 narrow.tv_usec = tv_p->tv_usec;
5261                 src = &narrow;
5262                 srclen = sizeof (narrow);
5263         } else {
5264                 wide.tv_sec = tv_p->tv_sec;
5265                 wide.tv_usec = tv_p->tv_usec;
5266                 src = &wide;
5267                 srclen = sizeof (wide);
5268         }
5269
5270         ncopy = min(srclen, sopt->sopt_valsize);
5271         sopt->sopt_valsize = ncopy;
5272
5273         if (sopt->sopt_val == USER_ADDR_NULL)
5274                 return (0);
5275         if (sopt->sopt_p == kernproc) {
5276                 bcopy(src, CAST_DOWN(caddr_t, sopt->sopt_val), ncopy);
5277                 return (0);
5278         }
5279         return (copyout(src, sopt->sopt_val, ncopy));
5280 }
5281
5282 /*
      * sogetoptlock: back end for reading a socket option.
      *
      * When dolock is non-zero the socket lock is taken on entry and dropped
      * on exit.  Socket filters are consulted first through
      * sflt_getsockopt().  Requests whose level is not SOL_SOCKET are passed
      * straight to the protocol's pr_ctloutput; SOL_SOCKET requests are
      * first offered to pru_socheckopt() and, if that returns 0, answered by
      * the switch below.
      *
5283  * Return:      0                       Success
5284  *              ENOPROTOOPT
5285  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5286  *      <pr_ctloutput>:???
5287  *      <sf_getoption>:???
5288  */
5289 int
5290 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5291 {
5292         int     error, optval;
5293         struct  linger l;
5294         struct  timeval tv;
5295 #if CONFIG_MACF_SOCKET
5296         struct mac extmac;
5297 #endif /* CONFIG_MACF_SOCKET */
5298
             /* Force the direction to GET regardless of what was passed in. */
5299         if (sopt->sopt_dir != SOPT_GET)
5300                 sopt->sopt_dir = SOPT_GET;
5301
5302         if (dolock)
5303                 socket_lock(so, 1);
5304
             /*
              * Any non-zero filter result ends processing here; EJUSTRETURN
              * is reported to the caller as success.
              */
5305         error = sflt_getsockopt(so, sopt);
5306         if (error != 0) {
5307                 if (error == EJUSTRETURN)
5308                         error = 0;
5309                 goto out;
5310         }
5311
5312         if (sopt->sopt_level != SOL_SOCKET) {
5313                 if (so->so_proto != NULL &&
5314                     so->so_proto->pr_ctloutput != NULL) {
5315                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5316                         goto out;
5317                 }
5318                 error = ENOPROTOOPT;
5319         } else {
5320                 /*
5321                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5322                  * the protocol layer, if needed.  A zero value returned from
5323                  * the handler means use default socket-level processing as
5324                  * done by the rest of this routine.  Otherwise, any other
5325                  * return value indicates that the option is unsupported.
5326                  */
5327                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5328                     pru_socheckopt(so, sopt)) != 0)
5329                         goto out;
5330
5331                 error = 0;
5332                 switch (sopt->sopt_name) {
5333                 case SO_LINGER:
5334                 case SO_LINGER_SEC:
                             /* SO_LINGER_SEC reports so_linger divided by hz. */
5335                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5336                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5337                             so->so_linger : so->so_linger / hz;
5338                         error = sooptcopyout(sopt, &l, sizeof (l));
5339                         break;
5340
                     /*
                      * Options whose name doubles as their bit in so_options:
                      * the value returned is that bit masked out of so_options.
                      */
5341                 case SO_USELOOPBACK:
5342                 case SO_DONTROUTE:
5343                 case SO_DEBUG:
5344                 case SO_KEEPALIVE:
5345                 case SO_REUSEADDR:
5346                 case SO_REUSEPORT:
5347                 case SO_BROADCAST:
5348                 case SO_OOBINLINE:
5349                 case SO_TIMESTAMP:
5350                 case SO_TIMESTAMP_MONOTONIC:
5351                 case SO_DONTTRUNC:
5352                 case SO_WANTMORE:
5353                 case SO_WANTOOBFLAG:
5354                 case SO_NOWAKEFROMSLEEP:
5355                 case SO_NOAPNFALLBK:
5356                         optval = so->so_options & sopt->sopt_name;
     /* Shared tail for every case that leaves a plain int in optval. */
5357 integer:
5358                         error = sooptcopyout(sopt, &optval, sizeof (optval));
5359                         break;
5360
5361                 case SO_TYPE:
5362                         optval = so->so_type;
5363                         goto integer;
5364
5365                 case SO_NREAD:
                             /*
                              * PR_ATOMIC protocols: sum m_len over the data-type
                              * mbufs reached from sb_mb via m_next.  Otherwise
                              * report sb_cc less sb_ctl.
                              */
5366                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5367                                 int pkt_total;
5368                                 struct mbuf *m1;
5369
5370                                 pkt_total = 0;
5371                                 m1 = so->so_rcv.sb_mb;
5372                                 while (m1 != NULL) {
5373                                         if (m1->m_type == MT_DATA ||
5374                                             m1->m_type == MT_HEADER ||
5375                                             m1->m_type == MT_OOBDATA)
5376                                                 pkt_total += m1->m_len;
5377                                         m1 = m1->m_next;
5378                                 }
5379                                 optval = pkt_total;
5380                         } else {
5381                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5382                         }
5383                         goto integer;
5384
5385                 case SO_NUMRCVPKT:
                             /*
                              * PR_ATOMIC protocols only: walk sb_mb via m_nextpkt
                              * and count the entries whose first mbuf is a data
                              * type.  Any other protocol gets EINVAL.
                              */
5386                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5387                                 int cnt = 0;
5388                                 struct mbuf *m1;
5389
5390                                 m1 = so->so_rcv.sb_mb;
5391                                 while (m1 != NULL) {
5392                                         if (m1->m_type == MT_DATA ||
5393                                             m1->m_type == MT_HEADER ||
5394                                             m1->m_type == MT_OOBDATA)
5395                                                 cnt += 1;
5396                                         m1 = m1->m_nextpkt;
5397                                 }
5398                                 optval = cnt;
5399                                 goto integer;
5400                         } else {
5401                                 error = EINVAL;
5402                                 break;
5403                         }
5404
5405                 case SO_NWRITE:
5406                         optval = so->so_snd.sb_cc;
5407                         goto integer;
5408
5409                 case SO_ERROR:
                             /* Reading the pending error also clears it. */
5410                         optval = so->so_error;
5411                         so->so_error = 0;
5412                         goto integer;
5413
5414                 case SO_SNDBUF: {
5415                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5416
                             /*
                              * For SB_UNIX send buffers with a connected peer,
                              * the peer's unp_cc is added to the reported value.
                              */
5417                         if (so->so_snd.sb_flags & SB_UNIX) {
5418                                 struct unpcb *unp =
5419                                     (struct unpcb *)(so->so_pcb);
5420                                 if (unp != NULL && unp->unp_conn != NULL) {
5421                                         hiwat += unp->unp_conn->unp_cc;
5422                                 }
5423                         }
5424
5425                         optval = hiwat;
5426                         goto integer;
5427                 }
5428                 case SO_RCVBUF:
5429                         optval = so->so_rcv.sb_hiwat;
5430                         goto integer;
5431
5432                 case SO_SNDLOWAT:
5433                         optval = so->so_snd.sb_lowat;
5434                         goto integer;
5435
5436                 case SO_RCVLOWAT:
5437                         optval = so->so_rcv.sb_lowat;
5438                         goto integer;
5439
5440                 case SO_SNDTIMEO:
5441                 case SO_RCVTIMEO:
                             /* Copied out in the requester's 32- or 64-bit layout. */
5442                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5443                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5444
5445                         error = sooptcopyout_timeval(sopt, &tv);
5446                         break;
5447
                     /*
                      * The flag-backed cases below return the raw masked bit
                      * from so_flags, not a value normalized to 0 or 1.
                      */
5448                 case SO_NOSIGPIPE:
5449                         optval = (so->so_flags & SOF_NOSIGPIPE);
5450                         goto integer;
5451
5452                 case SO_NOADDRERR:
5453                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5454                         goto integer;
5455
5456                 case SO_REUSESHAREUID:
5457                         optval = (so->so_flags & SOF_REUSESHAREUID);
5458                         goto integer;
5459
5460
5461                 case SO_NOTIFYCONFLICT:
5462                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5463                         goto integer;
5464
5465                 case SO_RESTRICTIONS:
5466                         optval = so_get_restrictions(so);
5467                         goto integer;
5468
5469                 case SO_AWDL_UNRESTRICTED:
                             /* Only answered for PF_INET and PF_INET6 sockets. */
5470                         if (SOCK_DOM(so) == PF_INET ||
5471                             SOCK_DOM(so) == PF_INET6) {
5472                                 optval = inp_get_awdl_unrestricted(
5473                                     sotoinpcb(so));
5474                                 goto integer;
5475                         } else
5476                                 error = EOPNOTSUPP;
5477                         break;
5478
5479                 case SO_INTCOPROC_ALLOW:
                             /* Only answered for PF_INET6 sockets. */
5480                         if (SOCK_DOM(so) == PF_INET6) {
5481                                 optval = inp_get_intcoproc_allowed(
5482                                     sotoinpcb(so));
5483                                 goto integer;
5484                         } else
5485                                 error = EOPNOTSUPP;
5486                         break;
5487
5488                 case SO_LABEL:
5489 #if CONFIG_MACF_SOCKET
                             /*
                              * extmac is copied in from the requester, handed to
                              * mac_socket_label_get(), then copied back out.
                              */
5490                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5491                             sizeof (extmac))) != 0 ||
5492                             (error = mac_socket_label_get(proc_ucred(
5493                             sopt->sopt_p), so, &extmac)) != 0)
5494                                 break;
5495
5496                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5497 #else
5498                         error = EOPNOTSUPP;
5499 #endif /* CONFIG_MACF_SOCKET */
5500                         break;
5501
5502                 case SO_PEERLABEL:
5503 #if CONFIG_MACF_SOCKET
                             /* Same sequence as SO_LABEL, for the peer label. */
5504                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5505                             sizeof (extmac))) != 0 ||
5506                             (error = mac_socketpeer_label_get(proc_ucred(
5507                             sopt->sopt_p), so, &extmac)) != 0)
5508                                 break;
5509
5510                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5511 #else
5512                         error = EOPNOTSUPP;
5513 #endif /* CONFIG_MACF_SOCKET */
5514                         break;
5515
5516 #ifdef __APPLE_API_PRIVATE
5517                 case SO_UPCALLCLOSEWAIT:
5518                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5519                         goto integer;
5520 #endif
5521                 case SO_RANDOMPORT:
5522                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
5523                         goto integer;
5524
5525                 case SO_NP_EXTENSIONS: {
5526                         struct so_np_extensions sonpx;
5527
5528                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5529                             SONPX_SETOPTSHUT : 0;
5530                         sonpx.npx_mask = SONPX_MASK_VALID;
5531
5532                         error = sooptcopyout(sopt, &sonpx,
5533                             sizeof (struct so_np_extensions));
5534                         break;
5535                 }
5536
5537                 case SO_TRAFFIC_CLASS:
5538                         optval = so->so_traffic_class;
5539                         goto integer;
5540
5541                 case SO_RECV_TRAFFIC_CLASS:
5542                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5543                         goto integer;
5544
5545                 case SO_TRAFFIC_CLASS_STATS:
5546                         error = sooptcopyout(sopt, &so->so_tc_stats,
5547                             sizeof (so->so_tc_stats));
5548                         break;
5549
5550 #if (DEVELOPMENT || DEBUG)
5551                 case SO_TRAFFIC_CLASS_DBG:
5552                         error = sogetopt_tcdbg(so, sopt);
5553                         break;
5554 #endif /* (DEVELOPMENT || DEBUG) */
5555
5556                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5557                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5558                         goto integer;
5559
5560                 case SO_DEFUNCTOK:
                             /* Reported as the logical inverse of SOF_NODEFUNCT. */
5561                         optval = !(so->so_flags & SOF_NODEFUNCT);
5562                         goto integer;
5563
5564                 case SO_ISDEFUNCT:
5565                         optval = (so->so_flags & SOF_DEFUNCT);
5566                         goto integer;
5567
5568                 case SO_OPPORTUNISTIC:
5569                         optval = so_get_opportunistic(so);
5570                         goto integer;
5571
5572                 case SO_FLUSH:
5573                         /* This option is not gettable */
5574                         error = EINVAL;
5575                         break;
5576
5577                 case SO_RECV_ANYIF:
5578                         optval = so_get_recv_anyif(so);
5579                         goto integer;
5580
5581                 case SO_TRAFFIC_MGT_BACKGROUND:
5582                         /* This option is handled by lower layer(s) */
                             /* pr_ctloutput's result is discarded; error stays 0. */
5583                         if (so->so_proto != NULL &&
5584                             so->so_proto->pr_ctloutput != NULL) {
5585                                 (void) so->so_proto->pr_ctloutput(so, sopt);
5586                         }
5587                         break;
5588
5589 #if FLOW_DIVERT
5590                 case SO_FLOW_DIVERT_TOKEN:
5591                         error = flow_divert_token_get(so, sopt);
5592                         break;
5593 #endif  /* FLOW_DIVERT */
5594
5595 #if NECP
5596                 case SO_NECP_ATTRIBUTES:
5597                         error = necp_get_socket_attributes(so, sopt);
5598                         break;
5599 #endif /* NECP */
5600
5601 #if CONTENT_FILTER
5602                 case SO_CFIL_SOCK_ID: {
5603                         cfil_sock_id_t sock_id;
5604
5605                         sock_id = cfil_sock_id_from_socket(so);
5606
5607                         error = sooptcopyout(sopt, &sock_id,
5608                                 sizeof(cfil_sock_id_t));
5609                         break;
5610                 }
5611 #endif  /* CONTENT_FILTER */
5612
5613 #if MPTCP
5614                 case SO_MPTCP_FASTJOIN:
                             /*
                              * Only valid on an MPTCP subflow or on a
                              * PF_MULTIPATH/IPPROTO_TCP socket; anything else
                              * gets ENOPROTOOPT.
                              */
5615                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5616                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5617                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5618                                 error = ENOPROTOOPT;
5619                                 break;
5620                         }
5621                         optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5622                         /* Fixed along with rdar://19391339 */
5623                         goto integer;
5624 #endif /* MPTCP */
5625
5626                 case SO_EXTENDED_BK_IDLE:
5627                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5628                         goto integer;
5629                 case SO_MARK_CELLFALLBACK:
                             /* Unlike the raw-bit cases, normalized to 0 or 1. */
5630                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5631                             ? 1 : 0;
5632                         goto integer;
5633                 case SO_NET_SERVICE_TYPE: {
                             /*
                              * NET_SERVICE_TYPE_BE is reported unless
                              * SOF1_TC_NET_SERV_TYPE is set on the socket.
                              */
5634                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5635                                 optval = so->so_netsvctype;
5636                         else
5637                                 optval = NET_SERVICE_TYPE_BE;
5638                         goto integer;
5639                 }
5640                 case SO_NETSVC_MARKING_LEVEL:
5641                         optval = so_get_netsvc_marking_level(so);
5642                         goto integer;
5643
5644                 default:
5645                         error = ENOPROTOOPT;
5646                         break;
5647                 }
5648         }
     /* Single exit: drop the lock if this routine took it. */
5649 out:
5650         if (dolock)
5651                 socket_unlock(so, 1);
5652         return (error);
5653 }
5654
5655 /*
      * Allocate an mbuf chain sized to carry sopt->sopt_valsize bytes of
      * option data and hand it back through *mp.  Each mbuf's m_len is preset
      * to the share of the option it will hold; a cluster is attached to an
      * mbuf whenever more than MLEN bytes remain to be placed.
      *
      * The size limits on our soopt_getm are different from those on
      * FreeBSD.  We limit the size of options to MCLBYTES.  This will have
      * to change if we need to define options that need more space than
      * MCLBYTES.
      *
      * Allocation uses M_WAIT unless the request comes from kernproc, in
      * which case M_DONTWAIT is used.
      *
      * Return:      0               Success; *mp holds the new chain
      *              EMSGSIZE        sopt_valsize is 0 or exceeds MCLBYTES
      *              ENOBUFS         No mbuf or cluster could be allocated
      *
      * On failure every mbuf allocated here is freed and *mp is left
      * untouched, so the caller never sees a pointer to freed memory.
      */
5656 int
5657 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5658 {
5659         struct mbuf *m;
5660         struct mbuf *top = NULL;
5661         struct mbuf **tailp = &top;
5662         int sopt_size;
5663         int how;
5664
             /*
              * Validate sopt_valsize at its own width.  Narrowing it to int
              * first would let a value too large for int wrap into the
              * accepted range and be treated as a small option.
              */
5665         if (sopt->sopt_valsize == 0 ||
5666             (size_t)sopt->sopt_valsize > (size_t)MCLBYTES)
5667                 return (EMSGSIZE);
5668         sopt_size = (int)sopt->sopt_valsize;
5669
5670         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5671
             /*
              * Build the chain privately through tailp; *mp is only written
              * once the whole chain exists.
              */
5672         do {
5673                 MGET(m, how, MT_DATA);
5674                 if (m == NULL)
5675                         goto nobufs;
5676                 if (sopt_size > MLEN) {
5677                         MCLGET(m, how);
5678                         if ((m->m_flags & M_EXT) == 0) {
                                     /* m is not linked into top yet. */
5679                                 m_free(m);
5680                                 goto nobufs;
5681                         }
5682                         m->m_len = min(MCLBYTES, sopt_size);
5683                 } else {
5684                         m->m_len = min(MLEN, sopt_size);
5685                 }
5686                 sopt_size -= m->m_len;
5687                 *tailp = m;
5688                 tailp = &m->m_next;
5689         } while (sopt_size > 0);
5690
5691         *mp = top;
5692         return (0);
5693
5694 nobufs:
5695         if (top != NULL)
5696                 m_freem(top);
5697         return (ENOBUFS);
5698 }
5711
5712 /*
      * Copy option data from sopt into the mbuf chain m.
      *
      * Each mbuf is filled to its preset m_len, and sopt_val/sopt_valsize
      * are advanced past the bytes consumed.  Requests from kernproc are
      * read with bcopy(); all others go through copyin().  A USER_ADDR_NULL
      * sopt_val is a successful no-op.
      *
      * If copyin() fails, the whole chain is freed and its error returned.
      * Stopping with mbufs still unfilled is treated as fatal: the chain
      * should have been allocated large enough at ip6_sooptmcopyin().
      */
5713 int
5714 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5715 {
5716         struct mbuf *const chain = m;
5717         int error;
5718
5719         if (sopt->sopt_val == USER_ADDR_NULL)
5720                 return (0);
5721
5722         for (; m != NULL && sopt->sopt_valsize >= m->m_len; m = m->m_next) {
5723                 if (sopt->sopt_p == kernproc) {
5724                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5725                             mtod(m, char *), m->m_len);
5726                 } else {
5727                         error = copyin(sopt->sopt_val, mtod(m, char *),
5728                             m->m_len);
5729                         if (error != 0) {
5730                                 m_freem(chain);
5731                                 return (error);
5732                         }
5733                 }
5734                 sopt->sopt_valsize -= m->m_len;
5735                 sopt->sopt_val += m->m_len;
5736         }
5737
5738         if (m != NULL) {
5739                 panic("soopt_mcopyin");
                     /* NOTREACHED */
5740         }
5741         return (0);
5742 }
5745
5746 /*
      * Copy the data held in mbuf chain m out to the buffer described by
      * sopt.
      *
      * Mbufs are written in order while the remaining sopt_valsize can hold
      * the next one; sopt_val is advanced as data is written.  Requests from
      * kernproc are written with bcopy(); all others go through copyout().
      * A USER_ADDR_NULL sopt_val is a successful no-op.
      *
      * On success sopt_valsize is set to the number of bytes copied.  If
      * copyout() fails its error is returned, and if the buffer is too small
      * for the whole chain EINVAL is returned; in both cases the chain is
      * freed.  The chain is not freed on the success paths.
      */
5747 int
5748 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5749 {
5750         struct mbuf *const chain = m;
5751         size_t copied = 0;
5752         int error;
5753
5754         if (sopt->sopt_val == USER_ADDR_NULL)
5755                 return (0);
5756
5757         for (; m != NULL && sopt->sopt_valsize >= m->m_len; m = m->m_next) {
5758                 if (sopt->sopt_p == kernproc) {
5759                         bcopy(mtod(m, char *),
5760                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5761                 } else {
5762                         error = copyout(mtod(m, char *), sopt->sopt_val,
5763                             m->m_len);
5764                         if (error != 0) {
5765                                 m_freem(chain);
5766                                 return (error);
5767                         }
5768                 }
5769                 sopt->sopt_valsize -= m->m_len;
5770                 sopt->sopt_val += m->m_len;
5771                 copied += m->m_len;
5772         }
5773
             /* enough soopt buffer should be given from user-land */
5774         if (m != NULL) {
5775                 m_freem(chain);
5776                 return (EINVAL);
5777         }
5778         sopt->sopt_valsize = copied;
5779         return (0);
5780 }
5782
5783 /*
      * Announce out-of-band data on so.
      *
      * SIGURG is sent to the process so_pgid when it is positive, or to
      * the process group -so_pgid when it is negative; nothing is signalled
      * when it is zero.  Selectors on the receive buffer are then woken,
      * and NOTE_OOB (with SO_FILT_HINT_LOCKED) is posted to the receive
      * buffer's knote list when SB_KNOTE is set.
      */
5784 void
5785 sohasoutofband(struct socket *so)
5786 {
5787         if (so->so_pgid > 0)
5788                 proc_signal(so->so_pgid, SIGURG);
5789         else if (so->so_pgid < 0)
5790                 gsignal(-so->so_pgid, SIGURG);
5791         selwakeup(&so->so_rcv.sb_sel);
5792         if ((so->so_rcv.sb_flags & SB_KNOTE) == 0)
5793                 return;
5794         KNOTE(&so->so_rcv.sb_sel.si_note, (NOTE_OOB | SO_FILT_HINT_LOCKED));
5795 }
5796
5797 /*
      * poll/select back end for a socket.
      *
      * With the socket locked, the last-owner and policy state are
      * refreshed, then each requested event class is tested: readable
      * (POLLIN/POLLRDNORM), writeable (POLLOUT/POLLWRNORM), and out-of-band
      * (POLLPRI/POLLRDBAND, true when so_oobmark is set or the socket is at
      * the mark).  The requested bits of every class that is ready are
      * returned.
      *
      * If nothing is ready, the caller is recorded on the receive and/or
      * send buffer selinfo, according to which classes were asked for, and
      * 0 is returned.  cred is unused.
      */
5798 int
5799 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5800 {
5801 #pragma unused(cred)
5802         const int in_mask = POLLIN | POLLRDNORM;
5803         const int out_mask = POLLOUT | POLLWRNORM;
5804         const int oob_mask = POLLPRI | POLLRDBAND;
5805         struct proc *self = current_proc();
5806         int ready = 0;
5807
5808         socket_lock(so, 1);
5809         so_update_last_owner_locked(so, PROC_NULL);
5810         so_update_policy(so);
5811
5812         if ((events & in_mask) != 0 && (soreadable(so)))
5813                 ready |= events & in_mask;
5814         if ((events & out_mask) != 0 && (sowriteable(so)))
5815                 ready |= events & out_mask;
5816         if ((events & oob_mask) != 0 &&
5817             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))
5818                 ready |= events & oob_mask;
5819
5820         if (ready != 0)
5821                 goto done;
5822
             /*
              * Nothing is ready, so register for wakeups.  In both cases
              * Darwin sets the SB_SEL flag first, whereas BSD calls
              * selrecord first.
              */
5823         if ((events & (in_mask | oob_mask)) != 0) {
5824                 so->so_rcv.sb_flags |= SB_SEL;
5825                 selrecord(self, &so->so_rcv.sb_sel, wql);
5826         }
5827         if ((events & out_mask) != 0) {
5828                 so->so_snd.sb_flags |= SB_SEL;
5829                 selrecord(self, &so->so_snd.sb_sel, wql);
5830         }
5831
5832 done:
5833         socket_unlock(so, 1);
5834         return (ready);
5835 }
5843
5844 /*
      * kqueue attach entry point for sockets.
      *
      * The socket is taken from kn->kn_fp and locked; its last-owner and
      * policy state are refreshed.  kn_filter is then mapped to the matching
      * socket filter id (read, write, sock or except) and that filter's
      * f_attach is called with the socket still locked.  Its result is
      * returned after the lock is dropped.
      *
      * When the MAC check (CONFIG_MACF_SOCKET builds) refuses the request,
      * or kn_filter is not one of the four supported filters, the lock is
      * dropped, kn_flags is set to EV_ERROR, kn_data is set to EPERM or
      * EINVAL respectively, and 0 is returned.  fp is unused.
      */
5845 int
5846 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5847 {
5848 #pragma unused(fp)
5849 #if !CONFIG_MACF_SOCKET
5850 #pragma unused(ctx)
5851 #endif /* !CONFIG_MACF_SOCKET */
5852         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5853         int reject_errno;
5854         int result;
5855
5856         socket_lock(so, 1);
5857         so_update_last_owner_locked(so, PROC_NULL);
5858         so_update_policy(so);
5859
5860 #if CONFIG_MACF_SOCKET
5861         if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5862             kn, so) != 0) {
5863                 reject_errno = EPERM;
5864                 goto reject;
5865         }
5866 #endif /* CONFIG_MACF_SOCKET */
5867
5868         switch (kn->kn_filter) {
5869         case EVFILT_READ:
5870                 kn->kn_filtid = EVFILTID_SOREAD;
5871                 break;
5872         case EVFILT_WRITE:
5873                 kn->kn_filtid = EVFILTID_SOWRITE;
5874                 break;
5875         case EVFILT_SOCK:
5876                 kn->kn_filtid = EVFILTID_SCK;
5877                 break;
5878         case EVFILT_EXCEPT:
5879                 kn->kn_filtid = EVFILTID_SOEXCEPT;
5880                 break;
5881         default:
5882                 reject_errno = EINVAL;
5883                 goto reject;
5884         }
5885
             /* call the appropriate sub-filter attach with the socket locked */
5886         result = knote_fops(kn)->f_attach(kn);
5887         socket_unlock(so, 1);
5888         return result;
5889
5890 reject:
5891         socket_unlock(so, 1);
5892         kn->kn_flags = EV_ERROR;
5893         kn->kn_data = reject_errno;
5894         return 0;
5895 }
5898
5899 static int
5900 filt_soread_common(struct knote *kn, struct socket *so)
5901 {
5902         if (so->so_options & SO_ACCEPTCONN) {
5903                 int is_not_empty;
5904
5905                 /*
5906                  * Radar 6615193 handle the listen case dynamically
5907                  * for kqueue read filter. This allows to call listen()
5908                  * after registering the kqueue EVFILT_READ.
5909                  */
5910
5911                 kn->kn_data = so->so_qlen;
5912                 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
5913
5914                 return (is_not_empty);
5915         }
5916
5917         /* socket isn't a listener */
5918         /*
5919          * NOTE_LOWAT specifies new low water mark in data, i.e.
5920          * the bytes of protocol data. We therefore exclude any
5921          * control bytes.
5922          */
5923         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5924
5925         if (kn->kn_sfflags & NOTE_OOB) {
5926                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
5927                         kn->kn_fflags |= NOTE_OOB;
5928                         kn->kn_data -= so->so_oobmark;
5929                         return (1);
5930                 }
5931         }
5932
5933         if ((so->so_state & SS_CANTRCVMORE)
5934 #if CONTENT_FILTER
5935             && cfil_sock_data_pending(&so->so_rcv) == 0
5936 #endif /* CONTENT_FILTER */
5937            ) {
5938                 kn->kn_flags |= EV_EOF;
5939                 kn->kn_fflags = so->so_error;
5940                 return (1);
5941         }
5942
5943         if (so->so_error) {     /* temporary udp error */
5944                 return (1);
5945         }
5946
5947         int64_t lowwat = so->so_rcv.sb_lowat;
5948         /*
5949          * Ensure that when NOTE_LOWAT is used, the derived
5950          * low water mark is bounded by socket's rcv buf's
5951          * high and low water mark values.
5952          */
5953         if (kn->kn_sfflags & NOTE_LOWAT) {
5954                 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5955                         lowwat = so->so_rcv.sb_hiwat;
5956                 else if (kn->kn_sdata > lowwat)
5957                         lowwat = kn->kn_sdata;
5958         }
5959
5960         /*
5961          * The order below is important. Since NOTE_LOWAT
5962          * overrides sb_lowat, check for NOTE_LOWAT case
5963          * first.
5964          */
5965         if (kn->kn_sfflags & NOTE_LOWAT)
5966                 return (kn->kn_data >= lowwat);
5967
5968         return (so->so_rcv.sb_cc >= lowwat);
5969 }
5970
5971 static int
5972 filt_sorattach(struct knote *kn)
5973 {
5974         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5975
5976         /* socket locked */
5977
5978         /*
5979          * If the caller explicitly asked for OOB results (e.g. poll())
5980          * from EVFILT_READ, then save that off in the hookid field
5981          * and reserve the kn_flags EV_OOBAND bit for output only.
5982          */
5983         if (kn->kn_filter == EVFILT_READ &&
5984             kn->kn_flags & EV_OOBAND) {
5985                 kn->kn_flags &= ~EV_OOBAND;
5986                 kn->kn_hookid = EV_OOBAND;
5987         } else {
5988                 kn->kn_hookid = 0;
5989         }
5990         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
5991                 so->so_rcv.sb_flags |= SB_KNOTE;
5992
5993         /* indicate if event is already fired */
5994         return filt_soread_common(kn, so);
5995 }
5996
5997 static void
5998 filt_sordetach(struct knote *kn)
5999 {
6000         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6001
6002         socket_lock(so, 1);
6003         if (so->so_rcv.sb_flags & SB_KNOTE)
6004                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6005                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6006         socket_unlock(so, 1);
6007 }
6008
6009 /*ARGSUSED*/
6010 static int
6011 filt_soread(struct knote *kn, long hint)
6012 {
6013         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6014         int retval;
6015
6016         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6017                 socket_lock(so, 1);
6018
6019         retval = filt_soread_common(kn, so);
6020
6021         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6022                 socket_unlock(so, 1);
6023
6024         return retval;
6025 }
6026
6027 static int
6028 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6029 {
6030         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6031         int retval;
6032
6033         socket_lock(so, 1);
6034
6035         /* save off the new input fflags and data */
6036         kn->kn_sfflags = kev->fflags;
6037         kn->kn_sdata = kev->data;
6038         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6039                 kn->kn_udata = kev->udata;
6040
6041         /* determine if changes result in fired events */
6042         retval = filt_soread_common(kn, so);
6043
6044         socket_unlock(so, 1);
6045
6046         return retval;
6047 }
6048
6049 static int
6050 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6051 {
6052 #pragma unused(data)
6053         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6054         int retval;
6055
6056         socket_lock(so, 1);
6057         retval = filt_soread_common(kn, so);
6058         if (retval) {
6059                 *kev = kn->kn_kevent;
6060                 if (kn->kn_flags & EV_CLEAR) {
6061                         kn->kn_fflags = 0;
6062                         kn->kn_data = 0;
6063                 }
6064         }
6065         socket_unlock(so, 1);
6066
6067         return retval;
6068 }
6069
6070 int
6071 so_wait_for_if_feedback(struct socket *so)
6072 {
6073         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6074             (so->so_state & SS_ISCONNECTED)) {
6075                 struct inpcb *inp = sotoinpcb(so);
6076                 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6077                         return (1);
6078         }
6079         return (0);
6080 }
6081
6082 static int
6083 filt_sowrite_common(struct knote *kn, struct socket *so)
6084 {
6085         int ret = 0;
6086
6087         kn->kn_data = sbspace(&so->so_snd);
6088         if (so->so_state & SS_CANTSENDMORE) {
6089                 kn->kn_flags |= EV_EOF;
6090                 kn->kn_fflags = so->so_error;
6091                 return 1;
6092         }
6093         if (so->so_error) {     /* temporary udp error */
6094                 return 1;
6095         }
6096         if (!socanwrite(so)) {
6097                 return 0;
6098         }
6099         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6100                 return 1;
6101         }
6102         int64_t lowwat = so->so_snd.sb_lowat;
6103         if (kn->kn_sfflags & NOTE_LOWAT) {
6104                 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6105                         lowwat = so->so_snd.sb_hiwat;
6106                 else if (kn->kn_sdata > lowwat)
6107                         lowwat = kn->kn_sdata;
6108         }
6109         if (kn->kn_data >= lowwat) {
6110                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6111 #if (DEBUG || DEVELOPMENT)
6112                     && so_notsent_lowat_check == 1
6113 #endif /* DEBUG || DEVELOPMENT */
6114                     ) {
6115                         if ((SOCK_DOM(so) == PF_INET ||
6116                             SOCK_DOM(so) == PF_INET6) &&
6117                             so->so_type == SOCK_STREAM) {
6118                                 ret = tcp_notsent_lowat_check(so);
6119                         }
6120 #if MPTCP
6121                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6122                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6123                                 ret = mptcp_notsent_lowat_check(so);
6124                         }
6125 #endif
6126                         else {
6127                                 return 1;
6128                         }
6129                 } else {
6130                         ret = 1;
6131                 }
6132         }
6133         if (so_wait_for_if_feedback(so))
6134                 ret = 0;
6135         return (ret);
6136 }
6137
6138 static int
6139 filt_sowattach(struct knote *kn)
6140 {
6141         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6142
6143         /* socket locked */
6144         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6145                 so->so_snd.sb_flags |= SB_KNOTE;
6146
6147         /* determine if its already fired */
6148         return filt_sowrite_common(kn, so);
6149 }
6150
6151 static void
6152 filt_sowdetach(struct knote *kn)
6153 {
6154         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6155         socket_lock(so, 1);
6156
6157         if (so->so_snd.sb_flags & SB_KNOTE)
6158                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6159                         so->so_snd.sb_flags &= ~SB_KNOTE;
6160         socket_unlock(so, 1);
6161 }
6162
6163 /*ARGSUSED*/
6164 static int
6165 filt_sowrite(struct knote *kn, long hint)
6166 {
6167         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6168         int ret;
6169
6170         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6171                 socket_lock(so, 1);
6172
6173         ret = filt_sowrite_common(kn, so);
6174
6175         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6176                 socket_unlock(so, 1);
6177
6178         return ret;
6179 }
6180
6181 static int
6182 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6183 {
6184         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6185         int ret;
6186
6187         socket_lock(so, 1);
6188
6189         /*save off the new input fflags and data */
6190         kn->kn_sfflags = kev->fflags;
6191         kn->kn_sdata = kev->data;
6192         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6193                 kn->kn_udata = kev->udata;
6194
6195         /* determine if these changes result in a triggered event */
6196         ret = filt_sowrite_common(kn, so);
6197
6198         socket_unlock(so, 1);
6199
6200         return ret;
6201 }
6202
6203 static int
6204 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6205 {
6206 #pragma unused(data)
6207         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6208         int ret;
6209
6210         socket_lock(so, 1);
6211         ret = filt_sowrite_common(kn, so);
6212         if (ret) {
6213                 *kev = kn->kn_kevent;
6214                 if (kn->kn_flags & EV_CLEAR) {
6215                         kn->kn_fflags = 0;
6216                         kn->kn_data = 0;
6217                 }
6218         }
6219         socket_unlock(so, 1);
6220         return ret;
6221 }
6222
6223 static int
6224 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6225 {
6226         int ret = 0;
6227         uint32_t level_trigger = 0;
6228
6229         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6230                 kn->kn_fflags |= NOTE_CONNRESET;
6231         }
6232         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6233                 kn->kn_fflags |= NOTE_TIMEOUT;
6234         }
6235         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6236                 kn->kn_fflags |= NOTE_NOSRCADDR;
6237         }
6238         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6239                 kn->kn_fflags |= NOTE_IFDENIED;
6240         }
6241         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6242                 kn->kn_fflags |= NOTE_KEEPALIVE;
6243         }
6244         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6245                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6246         }
6247         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6248                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6249         }
6250         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6251             (so->so_state & SS_ISCONNECTED)) {
6252                 kn->kn_fflags |= NOTE_CONNECTED;
6253                 level_trigger |= NOTE_CONNECTED;
6254         }
6255         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6256             (so->so_state & SS_ISDISCONNECTED)) {
6257                 kn->kn_fflags |= NOTE_DISCONNECTED;
6258                 level_trigger |= NOTE_DISCONNECTED;
6259         }
6260         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6261                 if (so->so_proto != NULL &&
6262                     (so->so_proto->pr_flags & PR_EVCONNINFO))
6263                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6264         }
6265
6266         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6267             tcp_notify_ack_active(so)) {
6268                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6269         }
6270
6271         if ((so->so_state & SS_CANTRCVMORE)
6272 #if CONTENT_FILTER
6273             && cfil_sock_data_pending(&so->so_rcv) == 0
6274 #endif /* CONTENT_FILTER */
6275             ) {
6276                 kn->kn_fflags |= NOTE_READCLOSED;
6277                 level_trigger |= NOTE_READCLOSED;
6278         }
6279
6280         if (so->so_state & SS_CANTSENDMORE) {
6281                 kn->kn_fflags |= NOTE_WRITECLOSED;
6282                 level_trigger |= NOTE_WRITECLOSED;
6283         }
6284
6285         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6286             (so->so_flags & SOF_SUSPENDED)) {
6287                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6288
6289                 /* If resume event was delivered before, reset it */
6290                 kn->kn_hookid &= ~NOTE_RESUME;
6291
6292                 kn->kn_fflags |= NOTE_SUSPEND;
6293                 level_trigger |= NOTE_SUSPEND;
6294         }
6295
6296         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6297             (so->so_flags & SOF_SUSPENDED) == 0) {
6298                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6299
6300                 /* If suspend event was delivered before, reset it */
6301                 kn->kn_hookid &= ~NOTE_SUSPEND;
6302
6303                 kn->kn_fflags |= NOTE_RESUME;
6304                 level_trigger |= NOTE_RESUME;
6305         }
6306
6307         if (so->so_error != 0) {
6308                 ret = 1;
6309                 kn->kn_data = so->so_error;
6310                 kn->kn_flags |= EV_EOF;
6311         } else {
6312                 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6313         }
6314
6315         /* Reset any events that are not requested on this knote */
6316         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6317         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6318
6319         /* Find the level triggerred events that are already delivered */
6320         level_trigger &= kn->kn_hookid;
6321         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6322
6323         /* Do not deliver level triggerred events more than once */
6324         if ((kn->kn_fflags & ~level_trigger) != 0)
6325                 ret = 1;
6326
6327         return (ret);
6328 }
6329
6330 static int
6331 filt_sockattach(struct knote *kn)
6332 {
6333         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6334
6335         /* socket locked */
6336         kn->kn_hookid = 0;
6337         if (KNOTE_ATTACH(&so->so_klist, kn))
6338                 so->so_flags |= SOF_KNOTE;
6339
6340         /* determine if event already fired */
6341         return filt_sockev_common(kn, so, 0);
6342 }
6343
6344 static void
6345 filt_sockdetach(struct knote *kn)
6346 {
6347         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6348         socket_lock(so, 1);
6349
6350         if ((so->so_flags & SOF_KNOTE) != 0)
6351                 if (KNOTE_DETACH(&so->so_klist, kn))
6352                         so->so_flags &= ~SOF_KNOTE;
6353         socket_unlock(so, 1);
6354 }
6355
6356 static int
6357 filt_sockev(struct knote *kn, long hint)
6358 {
6359         int ret = 0, locked = 0;
6360         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6361         long ev_hint = (hint & SO_FILT_HINT_EV);
6362
6363         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6364                 socket_lock(so, 1);
6365                 locked = 1;
6366         }
6367
6368         ret = filt_sockev_common(kn, so, ev_hint);
6369
6370         if (locked)
6371                 socket_unlock(so, 1);
6372
6373         return ret;
6374 }
6375
6376
6377
6378 /*
6379  *      filt_socktouch - update event state
6380  */
6381 static int
6382 filt_socktouch(
6383         struct knote *kn,
6384         struct kevent_internal_s *kev)
6385 {
6386         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6387         uint32_t changed_flags;
6388         int ret;
6389
6390         socket_lock(so, 1);
6391
6392         /* save off the [result] data and fflags */
6393         changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6394
6395         /* save off the new input fflags and data */
6396         kn->kn_sfflags = kev->fflags;
6397         kn->kn_sdata = kev->data;
6398         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6399                 kn->kn_udata = kev->udata;
6400
6401         /* restrict the current results to the (smaller?) set of new interest */
6402         /*
6403          * For compatibility with previous implementations, we leave kn_fflags
6404          * as they were before.
6405          */
6406         //kn->kn_fflags &= kev->fflags;
6407
6408         /*
6409          * Since we keep track of events that are already
6410          * delivered, if any of those events are not requested
6411          * anymore the state related to them can be reset
6412          */
6413         kn->kn_hookid &=
6414             ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6415
6416         /* determine if we have events to deliver */
6417         ret = filt_sockev_common(kn, so, 0);
6418
6419         socket_unlock(so, 1);
6420
6421         return ret;
6422 }
6423
6424 /*
6425  *      filt_sockprocess - query event fired state and return data
6426  */
6427 static int
6428 filt_sockprocess(
6429         struct knote *kn,
6430         struct filt_process_s *data,
6431         struct kevent_internal_s *kev)
6432 {
6433 #pragma unused(data)
6434
6435         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6436         int ret = 0;
6437
6438         socket_lock(so, 1);
6439
6440         ret = filt_sockev_common(kn, so, 0);
6441         if (ret) {
6442                 *kev = kn->kn_kevent;
6443
6444                 /*
6445                  * Store the state of the events being delivered. This
6446                  * state can be used to deliver level triggered events
6447                  * ateast once and still avoid waking up the application
6448                  * multiple times as long as the event is active.
6449                  */
6450                 if (kn->kn_fflags != 0)
6451                         kn->kn_hookid |= (kn->kn_fflags &
6452                                           EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6453
6454                 /*
6455                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6456                  * only one of them and remember the last one that was
6457                  * delivered last
6458                  */
6459                 if (kn->kn_fflags & NOTE_SUSPEND)
6460                         kn->kn_hookid &= ~NOTE_RESUME;
6461                 if (kn->kn_fflags & NOTE_RESUME)
6462                         kn->kn_hookid &= ~NOTE_SUSPEND;
6463
6464                 if (kn->kn_flags & EV_CLEAR) {
6465                         kn->kn_data = 0;
6466                         kn->kn_fflags = 0;
6467                 }
6468         }
6469
6470         socket_unlock(so, 1);
6471
6472         return ret;
6473 }
6474
6475 void
6476 get_sockev_state(struct socket *so, u_int32_t *statep)
6477 {
6478         u_int32_t state = *(statep);
6479
6480         /*
6481          * If the state variable is already used by a previous event,
6482          * reset it.
6483          */
6484         if (state != 0)
6485                 return;
6486
6487         if (so->so_state & SS_ISCONNECTED)
6488                 state |= SOCKEV_CONNECTED;
6489         else
6490                 state &= ~(SOCKEV_CONNECTED);
6491         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6492         *(statep) = state;
6493 }
6494
6495 #define SO_LOCK_HISTORY_STR_LEN \
6496         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6497
6498 __private_extern__ const char *
6499 solockhistory_nr(struct socket *so)
6500 {
6501         size_t n = 0;
6502         int i;
6503         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6504
6505         bzero(lock_history_str, sizeof (lock_history_str));
6506         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6507                 n += snprintf(lock_history_str + n,
6508                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6509                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6510                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6511         }
6512         return (lock_history_str);
6513 }
6514
6515 int
6516 socket_lock(struct socket *so, int refcount)
6517 {
6518         int error = 0;
6519         void *lr_saved;
6520
6521         lr_saved = __builtin_return_address(0);
6522
6523         if (so->so_proto->pr_lock) {
6524                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6525         } else {
6526 #ifdef MORE_LOCKING_DEBUG
6527                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6528                     LCK_MTX_ASSERT_NOTOWNED);
6529 #endif
6530                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6531                 if (refcount)
6532                         so->so_usecount++;
6533                 so->lock_lr[so->next_lock_lr] = lr_saved;
6534                 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6535         }
6536
6537         return (error);
6538 }
6539
6540 int
6541 socket_unlock(struct socket *so, int refcount)
6542 {
6543         int error = 0;
6544         void *lr_saved;
6545         lck_mtx_t *mutex_held;
6546
6547         lr_saved = __builtin_return_address(0);
6548
6549         if (so->so_proto == NULL) {
6550                 panic("%s: null so_proto so=%p\n", __func__, so);
6551                 /* NOTREACHED */
6552         }
6553
6554         if (so && so->so_proto->pr_unlock) {
6555                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6556         } else {
6557                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6558 #ifdef MORE_LOCKING_DEBUG
6559                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6560 #endif
6561                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6562                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6563
6564                 if (refcount) {
6565                         if (so->so_usecount <= 0) {
6566                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6567                                     "lrh=%s", __func__, so->so_usecount, so,
6568                                     SOCK_DOM(so), so->so_type,
6569                                     SOCK_PROTO(so), solockhistory_nr(so));
6570                                 /* NOTREACHED */
6571                         }
6572
6573                         so->so_usecount--;
6574                         if (so->so_usecount == 0)
6575                                 sofreelastref(so, 1);
6576                 }
6577                 lck_mtx_unlock(mutex_held);
6578         }
6579
6580         return (error);
6581 }
6582
6583 /* Called with socket locked, will unlock socket */
6584 void
6585 sofree(struct socket *so)
6586 {
6587         lck_mtx_t *mutex_held;
6588
6589         if (so->so_proto->pr_getlock != NULL)
6590                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6591         else
6592                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6593         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6594
6595         sofreelastref(so, 0);
6596 }
6597
/*
 * soreference - take one use-count reference on the socket.
 *
 * Locks with a reference and unlocks without dropping it, so the net
 * effect is one additional reference.
 */
void
soreference(struct socket *so)
{
        (void) socket_lock(so, 1);      /* lock and add a reference */
        (void) socket_unlock(so, 0);    /* unlock, reference retained */
}
6604
/*
 * sodereference - drop one use-count reference on the socket.
 *
 * Locks without taking a reference and unlocks while releasing one, so
 * the net effect is one reference fewer.
 */
void
sodereference(struct socket *so)
{
        (void) socket_lock(so, 0);      /* lock only */
        (void) socket_unlock(so, 1);    /* unlock and drop a reference */
}
6611
6612 /*
6613  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6614  * possibility of using jumbo clusters.  Caller must ensure to hold
6615  * the socket lock.
6616  */
6617 void
6618 somultipages(struct socket *so, boolean_t set)
6619 {
6620         if (set)
6621                 so->so_flags |= SOF_MULTIPAGES;
6622         else
6623                 so->so_flags &= ~SOF_MULTIPAGES;
6624 }
6625
6626 void
6627 soif2kcl(struct socket *so, boolean_t set)
6628 {
6629         if (set)
6630                 so->so_flags1 |= SOF1_IF_2KCL;
6631         else
6632                 so->so_flags1 &= ~SOF1_IF_2KCL;
6633 }
6634
6635 int
6636 so_isdstlocal(struct socket *so) {
6637
6638         struct inpcb *inp = (struct inpcb *)so->so_pcb;
6639
6640         if (SOCK_DOM(so) == PF_INET)
6641                 return (inaddr_local(inp->inp_faddr));
6642         else if (SOCK_DOM(so) == PF_INET6)
6643                 return (in6addr_local(&inp->in6p_faddr));
6644
6645         return (0);
6646 }
6647
6648 int
6649 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6650 {
6651         struct sockbuf *rcv, *snd;
6652         int err = 0, defunct;
6653
6654         rcv = &so->so_rcv;
6655         snd = &so->so_snd;
6656
6657         defunct = (so->so_flags & SOF_DEFUNCT);
6658         if (defunct) {
6659                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6660                         panic("%s: SB_DROP not set", __func__);
6661                         /* NOTREACHED */
6662                 }
6663                 goto done;
6664         }
6665
6666         if (so->so_flags & SOF_NODEFUNCT) {
6667                 if (noforce) {
6668                         err = EOPNOTSUPP;
6669                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6670                             "name %s level %d) so 0x%llx [%d,%d] "
6671                             "is not eligible for defunct "
6672                             "(%d)\n", __func__, proc_selfpid(),
6673                             proc_best_name(current_proc()), proc_pid(p),
6674                             proc_best_name(p), level,
6675                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6676                             SOCK_DOM(so), SOCK_TYPE(so), err);
6677                         return (err);
6678                 }
6679                 so->so_flags &= ~SOF_NODEFUNCT;
6680                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6681                     "so 0x%llx [%d,%d] defunct by force\n", __func__,
6682                     proc_selfpid(), proc_best_name(current_proc()),
6683                     proc_pid(p), proc_best_name(p), level,
6684                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6685                     SOCK_DOM(so), SOCK_TYPE(so));
6686         } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6687                 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6688                 struct ifnet *ifp = inp->inp_last_outifp;
6689
6690                 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6691                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6692                 } else if (so->so_flags & SOF_DELEGATED) {
6693                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6694                 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6695                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6696                 } else if (noforce) {
6697                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6698
6699                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6700                         so->so_extended_bk_start = net_uptime();
6701                         OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6702
6703                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6704
6705                         err = EOPNOTSUPP;
6706                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6707                             "level %d) extend bk idle so 0x%llx rcv hw %d "
6708                             "cc %d\n",
6709                             __func__, proc_selfpid(),
6710                             proc_best_name(current_proc()), proc_pid(p),
6711                             proc_best_name(p), level,
6712                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6713                             so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6714                         return (err);
6715                 } else {
6716                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6717                 }
6718         }
6719
6720         so->so_flags |= SOF_DEFUNCT;
6721
6722         /* Prevent further data from being appended to the socket buffers */
6723         snd->sb_flags |= SB_DROP;
6724         rcv->sb_flags |= SB_DROP;
6725
6726         /* Flush any existing data in the socket buffers */
6727         if (rcv->sb_cc != 0) {
6728                 rcv->sb_flags &= ~SB_SEL;
6729                 selthreadclear(&rcv->sb_sel);
6730                 sbrelease(rcv);
6731         }
6732         if (snd->sb_cc != 0) {
6733                 snd->sb_flags &= ~SB_SEL;
6734                 selthreadclear(&snd->sb_sel);
6735                 sbrelease(snd);
6736         }
6737
6738 done:
6739         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6740             "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6741             proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6742             level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6743             SOCK_TYPE(so), defunct ? "is already" : "marked as",
6744             (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6745
6746         return (err);
6747 }
6748
6749 int
6750 sodefunct(struct proc *p, struct socket *so, int level)
6751 {
6752         struct sockbuf *rcv, *snd;
6753
6754         if (!(so->so_flags & SOF_DEFUNCT)) {
6755                 panic("%s improperly called", __func__);
6756                 /* NOTREACHED */
6757         }
6758         if (so->so_state & SS_DEFUNCT)
6759                 goto done;
6760
6761         rcv = &so->so_rcv;
6762         snd = &so->so_snd;
6763
6764         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6765                 char s[MAX_IPv6_STR_LEN];
6766                 char d[MAX_IPv6_STR_LEN];
6767                 struct inpcb *inp = sotoinpcb(so);
6768
6769                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6770                     "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6771                     "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6772                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6773                     proc_pid(p), proc_best_name(p), level,
6774                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6775                     (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6776                     inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6777                     (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6778                     s, sizeof (s)), ntohs(inp->in6p_lport),
6779                     inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6780                     (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6781                     d, sizeof (d)), ntohs(inp->in6p_fport),
6782                     (uint32_t)rcv->sb_sel.si_flags,
6783                     (uint32_t)snd->sb_sel.si_flags,
6784                     rcv->sb_flags, snd->sb_flags);
6785         } else {
6786                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6787                     "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6788                     "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6789                     proc_selfpid(), proc_best_name(current_proc()),
6790                     proc_pid(p), proc_best_name(p), level,
6791                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6792                     SOCK_DOM(so), SOCK_TYPE(so),
6793                     (uint32_t)rcv->sb_sel.si_flags,
6794                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6795                     snd->sb_flags);
6796         }
6797
6798         /*
6799          * Unwedge threads blocked on sbwait() and sb_lock().
6800          */
6801         sbwakeup(rcv);
6802         sbwakeup(snd);
6803
6804         so->so_flags1 |= SOF1_DEFUNCTINPROG;
6805         if (rcv->sb_flags & SB_LOCK)
6806                 sbunlock(rcv, TRUE);    /* keep socket locked */
6807         if (snd->sb_flags & SB_LOCK)
6808                 sbunlock(snd, TRUE);    /* keep socket locked */
6809
6810         /*
6811          * Flush the buffers and disconnect.  We explicitly call shutdown
6812          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6813          * states are set for the socket.  This would also flush out data
6814          * hanging off the receive list of this socket.
6815          */
6816         (void) soshutdownlock_final(so, SHUT_RD);
6817         (void) soshutdownlock_final(so, SHUT_WR);
6818         (void) sodisconnectlocked(so);
6819
6820         /*
6821          * Explicitly handle connectionless-protocol disconnection
6822          * and release any remaining data in the socket buffers.
6823          */
6824         if (!(so->so_flags & SS_ISDISCONNECTED))
6825                 (void) soisdisconnected(so);
6826
6827         if (so->so_error == 0)
6828                 so->so_error = EBADF;
6829
6830         if (rcv->sb_cc != 0) {
6831                 rcv->sb_flags &= ~SB_SEL;
6832                 selthreadclear(&rcv->sb_sel);
6833                 sbrelease(rcv);
6834         }
6835         if (snd->sb_cc != 0) {
6836                 snd->sb_flags &= ~SB_SEL;
6837                 selthreadclear(&snd->sb_sel);
6838                 sbrelease(snd);
6839         }
6840         so->so_state |= SS_DEFUNCT;
6841         OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6842
6843 done:
6844         return (0);
6845 }
6846
6847 int
6848 soresume(struct proc *p, struct socket *so, int locked)
6849 {
6850         if (locked == 0)
6851                 socket_lock(so, 1);
6852
6853         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6854                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6855                     "[%d,%d] resumed from bk idle\n",
6856                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6857                     proc_pid(p), proc_best_name(p),
6858                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6859                     SOCK_DOM(so), SOCK_TYPE(so));
6860
6861                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6862                 so->so_extended_bk_start = 0;
6863                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6864
6865                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6866                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6867                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6868         }
6869         if (locked == 0)
6870                 socket_unlock(so, 1);
6871
6872         return (0);
6873 }
6874
6875 /*
6876  * Does not attempt to account for sockets that are delegated from
6877  * the current process
6878  */
6879 int
6880 so_set_extended_bk_idle(struct socket *so, int optval)
6881 {
6882         int error = 0;
6883
6884         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6885             SOCK_PROTO(so) != IPPROTO_TCP) {
6886                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6887                 error = EOPNOTSUPP;
6888         } else if (optval == 0) {
6889                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6890
6891                 soresume(current_proc(), so, 1);
6892         } else {
6893                 struct proc *p = current_proc();
6894                 int i;
6895                 struct filedesc *fdp;
6896                 int count = 0;
6897
6898                 proc_fdlock(p);
6899
6900                 fdp = p->p_fd;
6901                 for (i = 0; i < fdp->fd_nfiles; i++) {
6902                         struct fileproc *fp = fdp->fd_ofiles[i];
6903                         struct socket *so2;
6904
6905                         if (fp == NULL ||
6906                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6907                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6908                                 continue;
6909
6910                         so2 = (struct socket *)fp->f_fglob->fg_data;
6911                         if (so != so2 &&
6912                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6913                                 count++;
6914                         if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6915                                 break;
6916                 }
6917                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6918                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6919                         error = EBUSY;
6920                 } else if (so->so_flags & SOF_DELEGATED) {
6921                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6922                         error = EBUSY;
6923                 } else {
6924                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6925                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6926                 }
6927                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
6928                     "%s marked for extended bk idle\n",
6929                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6930                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6931                     SOCK_DOM(so), SOCK_TYPE(so),
6932                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6933                     "is" : "not");
6934
6935                 proc_fdunlock(p);
6936         }
6937
6938         return (error);
6939 }
6940
6941 static void
6942 so_stop_extended_bk_idle(struct socket *so)
6943 {
6944         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6945         so->so_extended_bk_start = 0;
6946
6947         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6948         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6949         /*
6950          * Force defunct
6951          */
6952         sosetdefunct(current_proc(), so,
6953             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6954         if (so->so_flags & SOF_DEFUNCT) {
6955                 sodefunct(current_proc(), so,
6956                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6957         }
6958 }
6959
6960 void
6961 so_drain_extended_bk_idle(struct socket *so)
6962 {
6963         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6964                 /*
6965                  * Only penalize sockets that have outstanding data
6966                  */
6967                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6968                         so_stop_extended_bk_idle(so);
6969
6970                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6971                 }
6972         }
6973 }
6974
6975 /*
6976  * Return values tells if socket is still in extended background idle
6977  */
6978 int
6979 so_check_extended_bk_idle_time(struct socket *so)
6980 {
6981         int ret = 1;
6982
6983         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6984                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
6985                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6986                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6987                     SOCK_DOM(so), SOCK_TYPE(so));
6988                 if (net_uptime() - so->so_extended_bk_start >
6989                     soextbkidlestat.so_xbkidle_time) {
6990                         so_stop_extended_bk_idle(so);
6991
6992                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
6993
6994                         ret = 0;
6995                 } else {
6996                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
6997
6998                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6999                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7000                 }
7001         }
7002
7003         return (ret);
7004 }
7005
7006 void
7007 resume_proc_sockets(proc_t p)
7008 {
7009         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7010                 struct filedesc *fdp;
7011                 int i;
7012
7013                 proc_fdlock(p);
7014                 fdp = p->p_fd;
7015                 for (i = 0; i < fdp->fd_nfiles; i++) {
7016                         struct fileproc *fp;
7017                         struct socket *so;
7018
7019                         fp = fdp->fd_ofiles[i];
7020                         if (fp == NULL ||
7021                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7022                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7023                                 continue;
7024
7025                         so = (struct socket *)fp->f_fglob->fg_data;
7026                         (void) soresume(p, so, 0);
7027                 }
7028                 proc_fdunlock(p);
7029
7030                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7031         }
7032 }
7033
7034 __private_extern__ int
7035 so_set_recv_anyif(struct socket *so, int optval)
7036 {
7037         int ret = 0;
7038
7039 #if INET6
7040         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7041 #else
7042         if (SOCK_DOM(so) == PF_INET) {
7043 #endif /* !INET6 */
7044                 if (optval)
7045                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7046                 else
7047                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7048         }
7049
7050         return (ret);
7051 }
7052
7053 __private_extern__ int
7054 so_get_recv_anyif(struct socket *so)
7055 {
7056         int ret = 0;
7057
7058 #if INET6
7059         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7060 #else
7061         if (SOCK_DOM(so) == PF_INET) {
7062 #endif /* !INET6 */
7063                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7064         }
7065
7066         return (ret);
7067 }
7068
7069 int
7070 so_set_restrictions(struct socket *so, uint32_t vals)
7071 {
7072         int nocell_old, nocell_new;
7073         int noexpensive_old, noexpensive_new;
7074
7075         /*
7076          * Deny-type restrictions are trapdoors; once set they cannot be
7077          * unset for the lifetime of the socket.  This allows them to be
7078          * issued by a framework on behalf of the application without
7079          * having to worry that they can be undone.
7080          *
7081          * Note here that socket-level restrictions overrides any protocol
7082          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7083          * socket restriction issued on the socket has a higher precendence
7084          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7085          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7086          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7087          */
7088         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7089         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7090         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7091             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7092             SO_RESTRICT_DENY_EXPENSIVE));
7093         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7094         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7095
7096         /* we can only set, not clear restrictions */
7097         if ((nocell_new - nocell_old) == 0 &&
7098             (noexpensive_new - noexpensive_old) == 0)
7099                 return (0);
7100 #if INET6
7101         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7102 #else
7103         if (SOCK_DOM(so) == PF_INET) {
7104 #endif /* !INET6 */
7105                 if (nocell_new - nocell_old != 0) {
7106                         /*
7107                          * if deny cellular is now set, do what's needed
7108                          * for INPCB
7109                          */
7110                         inp_set_nocellular(sotoinpcb(so));
7111                 }
7112                 if (noexpensive_new - noexpensive_old != 0) {
7113                         inp_set_noexpensive(sotoinpcb(so));
7114                 }
7115         }
7116
7117         return (0);
7118 }
7119
7120 uint32_t
7121 so_get_restrictions(struct socket *so)
7122 {
7123         return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7124             SO_RESTRICT_DENY_OUT |
7125             SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7126 }
7127
7128 struct sockaddr_entry *
7129 sockaddrentry_alloc(int how)
7130 {
7131         struct sockaddr_entry *se;
7132
7133         se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
7134         if (se != NULL)
7135                 bzero(se, se_zone_size);
7136
7137         return (se);
7138 }
7139
7140 void
7141 sockaddrentry_free(struct sockaddr_entry *se)
7142 {
7143         if (se->se_addr != NULL) {
7144                 FREE(se->se_addr, M_SONAME);
7145                 se->se_addr = NULL;
7146         }
7147         zfree(se_zone, se);
7148 }
7149
7150 struct sockaddr_entry *
7151 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
7152 {
7153         struct sockaddr_entry *dst_se;
7154
7155         dst_se = sockaddrentry_alloc(how);
7156         if (dst_se != NULL) {
7157                 int len = src_se->se_addr->sa_len;
7158
7159                 MALLOC(dst_se->se_addr, struct sockaddr *,
7160                     len, M_SONAME, how | M_ZERO);
7161                 if (dst_se->se_addr != NULL) {
7162                         bcopy(src_se->se_addr, dst_se->se_addr, len);
7163                 } else {
7164                         sockaddrentry_free(dst_se);
7165                         dst_se = NULL;
7166                 }
7167         }
7168
7169         return (dst_se);
7170 }
7171
7172 struct sockaddr_list *
7173 sockaddrlist_alloc(int how)
7174 {
7175         struct sockaddr_list *sl;
7176
7177         sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
7178         if (sl != NULL) {
7179                 bzero(sl, sl_zone_size);
7180                 TAILQ_INIT(&sl->sl_head);
7181         }
7182         return (sl);
7183 }
7184
7185 void
7186 sockaddrlist_free(struct sockaddr_list *sl)
7187 {
7188         struct sockaddr_entry *se, *tse;
7189
7190         TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
7191                 sockaddrlist_remove(sl, se);
7192                 sockaddrentry_free(se);
7193         }
7194         VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
7195         zfree(sl_zone, sl);
7196 }
7197
7198 void
7199 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
7200 {
7201         VERIFY(!(se->se_flags & SEF_ATTACHED));
7202         se->se_flags |= SEF_ATTACHED;
7203         TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
7204         sl->sl_cnt++;
7205         VERIFY(sl->sl_cnt != 0);
7206 }
7207
7208 void
7209 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
7210 {
7211         VERIFY(se->se_flags & SEF_ATTACHED);
7212         se->se_flags &= ~SEF_ATTACHED;
7213         VERIFY(sl->sl_cnt != 0);
7214         sl->sl_cnt--;
7215         TAILQ_REMOVE(&sl->sl_head, se, se_link);
7216 }
7217
7218 struct sockaddr_list *
7219 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
7220 {
7221         struct sockaddr_entry *src_se, *tse;
7222         struct sockaddr_list *dst_sl;
7223
7224         dst_sl = sockaddrlist_alloc(how);
7225         if (dst_sl == NULL)
7226                 return (NULL);
7227
7228         TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
7229                 struct sockaddr_entry *dst_se;
7230
7231                 if (src_se->se_addr == NULL)
7232                         continue;
7233
7234                 dst_se = sockaddrentry_dup(src_se, how);
7235                 if (dst_se == NULL) {
7236                         sockaddrlist_free(dst_sl);
7237                         return (NULL);
7238                 }
7239
7240                 sockaddrlist_insert(dst_sl, dst_se);
7241         }
7242         VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
7243
7244         return (dst_sl);
7245 }
7246
7247 int
7248 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7249 {
7250         struct proc *ep = PROC_NULL;
7251         int error = 0;
7252
7253         /* pid 0 is reserved for kernel */
7254         if (epid == 0) {
7255                 error = EINVAL;
7256                 goto done;
7257         }
7258
7259         /*
7260          * If this is an in-kernel socket, prevent its delegate
7261          * association from changing unless the socket option is
7262          * coming from within the kernel itself.
7263          */
7264         if (so->last_pid == 0 && p != kernproc) {
7265                 error = EACCES;
7266                 goto done;
7267         }
7268
7269         /*
7270          * If this is issued by a process that's recorded as the
7271          * real owner of the socket, or if the pid is the same as
7272          * the process's own pid, then proceed.  Otherwise ensure
7273          * that the issuing process has the necessary privileges.
7274          */
7275         if (epid != so->last_pid || epid != proc_pid(p)) {
7276                 if ((error = priv_check_cred(kauth_cred_get(),
7277                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7278                         error = EACCES;
7279                         goto done;
7280                 }
7281         }
7282
7283         /* Find the process that corresponds to the effective pid */
7284         if ((ep = proc_find(epid)) == PROC_NULL) {
7285                 error = ESRCH;
7286                 goto done;
7287         }
7288
7289         /*
7290          * If a process tries to delegate the socket to itself, then
7291          * there's really nothing to do; treat it as a way for the
7292          * delegate association to be cleared.  Note that we check
7293          * the passed-in proc rather than calling proc_selfpid(),
7294          * as we need to check the process issuing the socket option
7295          * which could be kernproc.  Given that we don't allow 0 for
7296          * effective pid, it means that a delegated in-kernel socket
7297          * stays delegated during its lifetime (which is probably OK.)
7298          */
7299         if (epid == proc_pid(p)) {
7300                 so->so_flags &= ~SOF_DELEGATED;
7301                 so->e_upid = 0;
7302                 so->e_pid = 0;
7303                 uuid_clear(so->e_uuid);
7304         } else {
7305                 so->so_flags |= SOF_DELEGATED;
7306                 so->e_upid = proc_uniqueid(ep);
7307                 so->e_pid = proc_pid(ep);
7308                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7309         }
7310 done:
7311         if (error == 0 && net_io_policy_log) {
7312                 uuid_string_t buf;
7313
7314                 uuid_unparse(so->e_uuid, buf);
7315                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7316                     "euuid %s%s\n", __func__, proc_name_address(p),
7317                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7318                     SOCK_DOM(so), SOCK_TYPE(so),
7319                     so->e_pid, proc_name_address(ep), buf,
7320                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7321         } else if (error != 0 && net_io_policy_log) {
7322                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7323                     "ERROR (%d)\n", __func__, proc_name_address(p),
7324                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7325                     SOCK_DOM(so), SOCK_TYPE(so),
7326                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7327                     proc_name_address(ep), error);
7328         }
7329
7330         /* Update this socket's policy upon success */
7331         if (error == 0) {
7332                 so->so_policy_gencnt *= -1;
7333                 so_update_policy(so);
7334 #if NECP
7335                 so_update_necp_policy(so, NULL, NULL);
7336 #endif /* NECP */
7337         }
7338
7339         if (ep != PROC_NULL)
7340                 proc_rele(ep);
7341
7342         return (error);
7343 }
7344
7345 int
7346 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7347 {
7348         uuid_string_t buf;
7349         uuid_t uuid;
7350         int error = 0;
7351
7352         /* UUID must not be all-zeroes (reserved for kernel) */
7353         if (uuid_is_null(euuid)) {
7354                 error = EINVAL;
7355                 goto done;
7356         }
7357
7358         /*
7359          * If this is an in-kernel socket, prevent its delegate
7360          * association from changing unless the socket option is
7361          * coming from within the kernel itself.
7362          */
7363         if (so->last_pid == 0 && p != kernproc) {
7364                 error = EACCES;
7365                 goto done;
7366         }
7367
7368         /* Get the UUID of the issuing process */
7369         proc_getexecutableuuid(p, uuid, sizeof (uuid));
7370
7371         /*
7372          * If this is issued by a process that's recorded as the
7373          * real owner of the socket, or if the uuid is the same as
7374          * the process's own uuid, then proceed.  Otherwise ensure
7375          * that the issuing process has the necessary privileges.
7376          */
7377         if (uuid_compare(euuid, so->last_uuid) != 0 ||
7378             uuid_compare(euuid, uuid) != 0) {
7379                 if ((error = priv_check_cred(kauth_cred_get(),
7380                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7381                         error = EACCES;
7382                         goto done;
7383                 }
7384         }
7385
7386         /*
7387          * If a process tries to delegate the socket to itself, then
7388          * there's really nothing to do; treat it as a way for the
7389          * delegate association to be cleared.  Note that we check
7390          * the uuid of the passed-in proc rather than that of the
7391          * current process, as we need to check the process issuing
7392          * the socket option which could be kernproc itself.  Given
7393          * that we don't allow 0 for effective uuid, it means that
7394          * a delegated in-kernel socket stays delegated during its
7395          * lifetime (which is okay.)
7396          */
7397         if (uuid_compare(euuid, uuid) == 0) {
7398                 so->so_flags &= ~SOF_DELEGATED;
7399                 so->e_upid = 0;
7400                 so->e_pid = 0;
7401                 uuid_clear(so->e_uuid);
7402         } else {
7403                 so->so_flags |= SOF_DELEGATED;
7404                 /*
7405                  * Unlike so_set_effective_pid(), we only have the UUID
7406                  * here and the process ID is not known.  Inherit the
7407                  * real {pid,upid} of the socket.
7408                  */
7409                 so->e_upid = so->last_upid;
7410                 so->e_pid = so->last_pid;
7411                 uuid_copy(so->e_uuid, euuid);
7412         }
7413
7414 done:
7415         if (error == 0 && net_io_policy_log) {
7416                 uuid_unparse(so->e_uuid, buf);
7417                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7418                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7419                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7420                     SOCK_TYPE(so), so->e_pid, buf,
7421                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7422         } else if (error != 0 && net_io_policy_log) {
7423                 uuid_unparse(euuid, buf);
7424                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7425                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7426                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7427                     SOCK_TYPE(so), buf, error);
7428         }
7429
7430         /* Update this socket's policy upon success */
7431         if (error == 0) {
7432                 so->so_policy_gencnt *= -1;
7433                 so_update_policy(so);
7434 #if NECP
7435                 so_update_necp_policy(so, NULL, NULL);
7436 #endif /* NECP */
7437         }
7438
7439         return (error);
7440 }
7441
7442 void
7443 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7444     uint32_t ev_datalen)
7445 {
7446         struct kev_msg ev_msg;
7447
7448         /*
7449          * A netpolicy event always starts with a netpolicy_event_data
7450          * structure, but the caller can provide for a longer event
7451          * structure to post, depending on the event code.
7452          */
7453         VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7454
7455         bzero(&ev_msg, sizeof (ev_msg));
7456         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7457         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7458         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7459         ev_msg.event_code       = ev_code;
7460
7461         ev_msg.dv[0].data_ptr   = ev_data;
7462         ev_msg.dv[0].data_length = ev_datalen;
7463
7464         kev_post_msg(&ev_msg);
7465 }
7466
7467 void
7468 socket_post_kev_msg(uint32_t ev_code,
7469     struct kev_socket_event_data *ev_data,
7470     uint32_t ev_datalen)
7471 {
7472         struct kev_msg ev_msg;
7473
7474         bzero(&ev_msg, sizeof(ev_msg));
7475         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7476         ev_msg.kev_class = KEV_NETWORK_CLASS;
7477         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7478         ev_msg.event_code = ev_code;
7479
7480         ev_msg.dv[0].data_ptr = ev_data;
7481         ev_msg.dv[0]. data_length = ev_datalen;
7482
7483         kev_post_msg(&ev_msg);
7484 }
7485
7486 void
7487 socket_post_kev_msg_closed(struct socket *so)
7488 {
7489         struct kev_socket_closed ev;
7490         struct sockaddr *socksa = NULL, *peersa = NULL;
7491         int err;
7492         bzero(&ev, sizeof(ev));
7493         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7494         if (err == 0) {
7495                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7496                     &peersa);
7497                 if (err == 0) {
7498                         memcpy(&ev.ev_data.kev_sockname, socksa,
7499                             min(socksa->sa_len,
7500                             sizeof (ev.ev_data.kev_sockname)));
7501                         memcpy(&ev.ev_data.kev_peername, peersa,
7502                             min(peersa->sa_len,
7503                             sizeof (ev.ev_data.kev_peername)));
7504                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
7505                             &ev.ev_data, sizeof (ev));
7506                 }
7507         }
7508         if (socksa != NULL)
7509                 FREE(socksa, M_SONAME);
7510         if (peersa != NULL)
7511                 FREE(peersa, M_SONAME);
7512 }