bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/ntstat.h>
 102 #include <net/content_filter.h>
 103 #include <netinet/in.h>
 104 #include <netinet/in_pcb.h>
 105 #include <netinet/in_tclass.h>
 106 #include <netinet/tcp_var.h>
 107 #include <netinet/ip6.h>
 108 #include <netinet6/ip6_var.h>
 109 #include <netinet/flow_divert.h>
 110 #include <kern/zalloc.h>
 111 #include <kern/locks.h>
 112 #include <machine/limits.h>
 113 #include <libkern/OSAtomic.h>
 114 #include <pexpert/pexpert.h>
 115 #include <kern/assert.h>
 116 #include <kern/task.h>
 117 #include <kern/policy_internal.h>
 118
 119 #include <sys/kpi_mbuf.h>
 120 #include <sys/mcache.h>
 121 #include <sys/unpcb.h>
 122
 123 #if CONFIG_MACF
 124 #include <security/mac.h>
 125 #include <security/mac_framework.h>
 126 #endif /* MAC */
 127
 128 #if MULTIPATH
 129 #include <netinet/mp_pcb.h>
 130 #include <netinet/mptcp_var.h>
 131 #endif /* MULTIPATH */
 132
 133 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 134
 135 #if DEBUG || DEVELOPMENT
 136 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 137 #else
 138 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 139 #endif
 140
 141 /* TODO: this should be in a header file somewhere */
 142 extern char *proc_name_address(void *p);
 143 extern char *proc_best_name(proc_t);
 144
 145 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 146 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 147 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 148 static u_int32_t        cached_sock_count = 0;
 149 STAILQ_HEAD(, socket)   so_cache_head;
 150 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 151 static u_int32_t        so_cache_time;
 152 static int              socketinit_done;
 153 static struct zone      *so_cache_zone;
 154
 155 static lck_grp_t        *so_cache_mtx_grp;
 156 static lck_attr_t       *so_cache_mtx_attr;
 157 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 158 static lck_mtx_t        *so_cache_mtx;
 159
 160 #include <machine/limits.h>
 161
 162 static int      filt_sorattach(struct knote *kn);
 163 static void     filt_sordetach(struct knote *kn);
 164 static int      filt_soread(struct knote *kn, long hint);
 165 static int      filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
 166 static int      filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 167
 168 static int      filt_sowattach(struct knote *kn);
 169 static void     filt_sowdetach(struct knote *kn);
 170 static int      filt_sowrite(struct knote *kn, long hint);
 171 static int      filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
 172 static int      filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 173
 174 static int      filt_sockattach(struct knote *kn);
 175 static void     filt_sockdetach(struct knote *kn);
 176 static int      filt_sockev(struct knote *kn, long hint);
 177 static int      filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
 178 static int      filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 179
 180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 182
 183 struct filterops soread_filtops = {
 184         .f_isfd = 1,
 185         .f_attach = filt_sorattach,
 186         .f_detach = filt_sordetach,
 187         .f_event = filt_soread,
 188         .f_touch = filt_sortouch,
 189         .f_process = filt_sorprocess,
 190 };
 191
 192 struct filterops sowrite_filtops = {
 193         .f_isfd = 1,
 194         .f_attach = filt_sowattach,
 195         .f_detach = filt_sowdetach,
 196         .f_event = filt_sowrite,
 197         .f_touch = filt_sowtouch,
 198         .f_process = filt_sowprocess,
 199 };
 200
 201 struct filterops sock_filtops = {
 202         .f_isfd = 1,
 203         .f_attach = filt_sockattach,
 204         .f_detach = filt_sockdetach,
 205         .f_event = filt_sockev,
 206         .f_touch = filt_socktouch,
 207         .f_process = filt_sockprocess,
 208 };
 209
 210 struct filterops soexcept_filtops = {
 211         .f_isfd = 1,
 212         .f_attach = filt_sorattach,
 213         .f_detach = filt_sordetach,
 214         .f_event = filt_soread,
 215         .f_touch = filt_sortouch,
 216         .f_process = filt_sorprocess,
 217 };
 218
 219 SYSCTL_DECL(_kern_ipc);
 220
 221 #define EVEN_MORE_LOCKING_DEBUG 0
 222
 223 int socket_debug = 0;
 224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 225         CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 226
 227 static unsigned long sodefunct_calls = 0;
 228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 229     &sodefunct_calls, "");
 230
 231 static int socket_zone = M_SOCKET;
 232 so_gen_t        so_gencnt;      /* generation count for sockets */
 233
 234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 236
 237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 246
 247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 248
 249 int somaxconn = SOMAXCONN;
 250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 251         CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 252
 253 /* Should we get a maximum also ??? */
 254 static int sosendmaxchain = 65536;
 255 static int sosendminchain = 16384;
 256 static int sorecvmincopy  = 16384;
 257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 258         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 260         CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 261
 262 /*
 263  * Set to enable jumbo clusters (if available) for large writes when
 264  * the socket is marked with SOF_MULTIPAGES; see below.
 265  */
 266 int sosendjcl = 1;
 267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 268         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 269
 270 /*
 271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 272  * writes on the socket for all protocols on any network interfaces,
 273  * depending upon sosendjcl above.  Be extra careful when setting this
 274  * to 1, because sending down packets that cross physical pages down to
 275  * broken drivers (those that falsely assume that the physical pages
 276  * are contiguous) might lead to system panics or silent data corruption.
 277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 279  * capable.  Set this to 1 only for testing/debugging purposes.
 280  */
 281 int sosendjcl_ignore_capab = 0;
 282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 283         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 284
 285 /*
 286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 287  * writes on the socket for all protocols on any network interfaces.
 288  * Be extra careful when setting this to 1, because sending down packets with
 289  * clusters larger that 2 KB might lead to system panics or data corruption.
 290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 291  * on the outgoing interface
 292  * Set this to 1  for testing/debugging purposes only.
 293  */
 294 int sosendbigcl_ignore_capab = 0;
 295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 296         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 297
 298 int sodefunctlog = 0;
 299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 300         &sodefunctlog, 0, "");
 301
 302 int sothrottlelog = 0;
 303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 304         &sothrottlelog, 0, "");
 305
 306 int sorestrictrecv = 1;
 307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 308         &sorestrictrecv, 0, "Enable inbound interface restrictions");
 309
 310 int sorestrictsend = 1;
 311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 312         &sorestrictsend, 0, "Enable outbound interface restrictions");
 313
 314 int soreserveheadroom = 1;
 315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 316         &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 317
 318 #if (DEBUG || DEVELOPMENT)
 319 int so_notsent_lowat_check = 1;
 320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
 321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 322 #endif /* DEBUG || DEVELOPMENT */
 323
 324 extern struct inpcbinfo tcbinfo;
 325
 326 /* TODO: these should be in header file */
 327 extern int get_inpcb_str_size(void);
 328 extern int get_tcp_str_size(void);
 329
 330 static unsigned int sl_zone_size;               /* size of sockaddr_list */
 331 static struct zone *sl_zone;                    /* zone for sockaddr_list */
 332
 333 static unsigned int se_zone_size;               /* size of sockaddr_entry */
 334 static struct zone *se_zone;                    /* zone for sockaddr_entry */
 335
 336 vm_size_t       so_cache_zone_element_size;
 337
 338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 339     user_ssize_t *);
 340 static void cached_sock_alloc(struct socket **, int);
 341 static void cached_sock_free(struct socket *);
 342
 343 /*
 344  * Maximum of extended background idle sockets per process
 345  * Set to zero to disable further setting of the option
 346  */
 347
 348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 349 #define SO_IDLE_BK_IDLE_TIME            600
 350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 351
 352 struct soextbkidlestat soextbkidlestat;
 353
 354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 355         CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 356         "Maximum of extended background idle sockets per process");
 357
 358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 359         &soextbkidlestat.so_xbkidle_time, 0,
 360         "Time in seconds to keep extended background idle sockets");
 361
 362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 363         &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 364         "High water mark for extended background idle sockets");
 365
 366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 367         &soextbkidlestat, soextbkidlestat, "");
 368
 369 int so_set_extended_bk_idle(struct socket *, int);
 370
 371 /*
 372  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 373  * setting the DSCP code on the packet based on the service class; see
 374  * <rdar://problem/11277343> for details.
 375  */
 376 __private_extern__ u_int32_t sotcdb = 0;
 377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 378         &sotcdb, 0, "");
 379
 380 void
 381 socketinit(void)
 382 {
 383         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 384         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 385
 386 #ifdef __LP64__
 387         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 388         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 389         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 393 #else
 394         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 395         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 396         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 400 #endif
 401
 402         if (socketinit_done) {
 403                 printf("socketinit: already called...\n");
 404                 return;
 405         }
 406         socketinit_done = 1;
 407
 408         PE_parse_boot_argn("socket_debug", &socket_debug,
 409             sizeof (socket_debug));
 410
 411         /*
 412          * allocate lock group attribute and group for socket cache mutex
 413          */
 414         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 415         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 416             so_cache_mtx_grp_attr);
 417
 418         /*
 419          * allocate the lock attribute for socket cache mutex
 420          */
 421         so_cache_mtx_attr = lck_attr_alloc_init();
 422
 423         /* cached sockets mutex */
 424         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 425         if (so_cache_mtx == NULL) {
 426                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 427                 /* NOTREACHED */
 428         }
 429         STAILQ_INIT(&so_cache_head);
 430
 431         so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
 432             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 433
 434         so_cache_zone = zinit(so_cache_zone_element_size,
 435             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 436         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 437         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 438
 439         sl_zone_size = sizeof (struct sockaddr_list);
 440         if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
 441             "sockaddr_list")) == NULL) {
 442                 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
 443                 /* NOTREACHED */
 444         }
 445         zone_change(sl_zone, Z_CALLERACCT, FALSE);
 446         zone_change(sl_zone, Z_EXPAND, TRUE);
 447
 448         se_zone_size = sizeof (struct sockaddr_entry);
 449         if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
 450             "sockaddr_entry")) == NULL) {
 451                 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
 452                 /* NOTREACHED */
 453         }
 454         zone_change(se_zone, Z_CALLERACCT, FALSE);
 455         zone_change(se_zone, Z_EXPAND, TRUE);
 456
 457         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 458         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 459         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 460         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 461
 462         in_pcbinit();
 463         sflt_init();
 464         socket_tclass_init();
 465 #if MULTIPATH
 466         mp_pcbinit();
 467 #endif /* MULTIPATH */
 468 }
 469
 470 static void
 471 cached_sock_alloc(struct socket **so, int waitok)
 472 {
 473         caddr_t temp;
 474         uintptr_t offset;
 475
 476         lck_mtx_lock(so_cache_mtx);
 477
 478         if (!STAILQ_EMPTY(&so_cache_head)) {
 479                 VERIFY(cached_sock_count > 0);
 480
 481                 *so = STAILQ_FIRST(&so_cache_head);
 482                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 483                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 484
 485                 cached_sock_count--;
 486                 lck_mtx_unlock(so_cache_mtx);
 487
 488                 temp = (*so)->so_saved_pcb;
 489                 bzero((caddr_t)*so, sizeof (struct socket));
 490
 491                 (*so)->so_saved_pcb = temp;
 492         } else {
 493
 494                 lck_mtx_unlock(so_cache_mtx);
 495
 496                 if (waitok)
 497                         *so = (struct socket *)zalloc(so_cache_zone);
 498                 else
 499                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 500
 501                 if (*so == NULL)
 502                         return;
 503
 504                 bzero((caddr_t)*so, sizeof (struct socket));
 505
 506                 /*
 507                  * Define offsets for extra structures into our
 508                  * single block of memory. Align extra structures
 509                  * on longword boundaries.
 510                  */
 511
 512                 offset = (uintptr_t)*so;
 513                 offset += sizeof (struct socket);
 514
 515                 offset = ALIGN(offset);
 516
 517                 (*so)->so_saved_pcb = (caddr_t)offset;
 518                 offset += get_inpcb_str_size();
 519
 520                 offset = ALIGN(offset);
 521
 522                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 523                     (caddr_t)offset;
 524         }
 525
 526         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 527 }
 528
 529 static void
 530 cached_sock_free(struct socket *so)
 531 {
 532
 533         lck_mtx_lock(so_cache_mtx);
 534
 535         so_cache_time = net_uptime();
 536         if (++cached_sock_count > max_cached_sock_count) {
 537                 --cached_sock_count;
 538                 lck_mtx_unlock(so_cache_mtx);
 539                 zfree(so_cache_zone, so);
 540         } else {
 541                 if (so_cache_hw < cached_sock_count)
 542                         so_cache_hw = cached_sock_count;
 543
 544                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 545
 546                 so->cache_timestamp = so_cache_time;
 547                 lck_mtx_unlock(so_cache_mtx);
 548         }
 549 }
 550
 551 void
 552 so_update_last_owner_locked(struct socket *so, proc_t self)
 553 {
 554         if (so->last_pid != 0) {
 555                 /*
 556                  * last_pid and last_upid should remain zero for sockets
 557                  * created using sock_socket. The check above achieves that
 558                  */
 559                 if (self == PROC_NULL)
 560                         self = current_proc();
 561
 562                 if (so->last_upid != proc_uniqueid(self) ||
 563                     so->last_pid != proc_pid(self)) {
 564                         so->last_upid = proc_uniqueid(self);
 565                         so->last_pid = proc_pid(self);
 566                         proc_getexecutableuuid(self, so->last_uuid,
 567                             sizeof (so->last_uuid));
 568                 }
 569                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 570         }
 571 }
 572
 573 void
 574 so_update_policy(struct socket *so)
 575 {
 576         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 577                 (void) inp_update_policy(sotoinpcb(so));
 578 }
 579
 580 #if NECP
 581 static void
 582 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 583     struct sockaddr *override_remote_addr)
 584 {
 585         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 586                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 587                     override_remote_addr, 0);
 588 }
 589 #endif /* NECP */
 590
 591 boolean_t
 592 so_cache_timer(void)
 593 {
 594         struct socket   *p;
 595         int             n_freed = 0;
 596         boolean_t rc = FALSE;
 597
 598         lck_mtx_lock(so_cache_mtx);
 599         so_cache_timeouts++;
 600         so_cache_time = net_uptime();
 601
 602         while (!STAILQ_EMPTY(&so_cache_head)) {
 603                 VERIFY(cached_sock_count > 0);
 604                 p = STAILQ_FIRST(&so_cache_head);
 605                 if ((so_cache_time - p->cache_timestamp) <
 606                         SO_CACHE_TIME_LIMIT)
 607                         break;
 608
 609                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 610                 --cached_sock_count;
 611
 612                 zfree(so_cache_zone, p);
 613
 614                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 615                         so_cache_max_freed++;
 616                         break;
 617                 }
 618         }
 619
 620         /* Schedule again if there is more to cleanup */
 621         if (!STAILQ_EMPTY(&so_cache_head))
 622                 rc = TRUE;
 623
 624         lck_mtx_unlock(so_cache_mtx);
 625         return (rc);
 626 }
 627
 628 /*
 629  * Get a socket structure from our zone, and initialize it.
 630  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 631  * Note that it would probably be better to allocate socket
 632  * and PCB at the same time, but I'm not convinced that all
 633  * the protocols can be easily modified to do this.
 634  */
 635 struct socket *
 636 soalloc(int waitok, int dom, int type)
 637 {
 638         struct socket *so;
 639
 640         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 641                 cached_sock_alloc(&so, waitok);
 642         } else {
 643                 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
 644                     M_WAITOK);
 645                 if (so != NULL)
 646                         bzero(so, sizeof (*so));
 647         }
 648         if (so != NULL) {
 649                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 650                 so->so_zone = socket_zone;
 651 #if CONFIG_MACF_SOCKET
 652                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 653                 if (mac_socket_label_init(so, !waitok) != 0) {
 654                         sodealloc(so);
 655                         return (NULL);
 656                 }
 657 #endif /* MAC_SOCKET */
 658         }
 659
 660         return (so);
 661 }
 662
 663 int
 664 socreate_internal(int dom, struct socket **aso, int type, int proto,
 665     struct proc *p, uint32_t flags, struct proc *ep)
 666 {
 667         struct protosw *prp;
 668         struct socket *so;
 669         int error = 0;
 670
 671 #if TCPDEBUG
 672         extern int tcpconsdebug;
 673 #endif
 674
 675         VERIFY(aso != NULL);
 676         *aso = NULL;
 677
 678         if (proto != 0)
 679                 prp = pffindproto(dom, proto, type);
 680         else
 681                 prp = pffindtype(dom, type);
 682
 683         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 684                 if (pffinddomain(dom) == NULL)
 685                         return (EAFNOSUPPORT);
 686                 if (proto != 0) {
 687                         if (pffindprotonotype(dom, proto) != NULL)
 688                                 return (EPROTOTYPE);
 689                 }
 690                 return (EPROTONOSUPPORT);
 691         }
 692         if (prp->pr_type != type)
 693                 return (EPROTOTYPE);
 694         so = soalloc(1, dom, type);
 695         if (so == NULL)
 696                 return (ENOBUFS);
 697
 698         if (flags & SOCF_ASYNC)
 699                 so->so_state |= SS_NBIO;
 700 #if MULTIPATH
 701         if (flags & SOCF_MP_SUBFLOW) {
 702                 /*
 703                  * A multipath subflow socket is used internally in the kernel,
 704                  * therefore it does not have a file desciptor associated by
 705                  * default.
 706                  */
 707                 so->so_state |= SS_NOFDREF;
 708                 so->so_flags |= SOF_MP_SUBFLOW;
 709         }
 710 #endif /* MULTIPATH */
 711
 712         TAILQ_INIT(&so->so_incomp);
 713         TAILQ_INIT(&so->so_comp);
 714         so->so_type = type;
 715         so->last_upid = proc_uniqueid(p);
 716         so->last_pid = proc_pid(p);
 717         proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
 718         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 719
 720         if (ep != PROC_NULL && ep != p) {
 721                 so->e_upid = proc_uniqueid(ep);
 722                 so->e_pid = proc_pid(ep);
 723                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
 724                 so->so_flags |= SOF_DELEGATED;
 725         }
 726
 727         so->so_cred = kauth_cred_proc_ref(p);
 728         if (!suser(kauth_cred_get(), NULL))
 729                 so->so_state |= SS_PRIV;
 730
 731         so->so_proto = prp;
 732         so->so_rcv.sb_flags |= SB_RECV;
 733         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 734         so->next_lock_lr = 0;
 735         so->next_unlock_lr = 0;
 736
 737 #if CONFIG_MACF_SOCKET
 738         mac_socket_label_associate(kauth_cred_get(), so);
 739 #endif /* MAC_SOCKET */
 740
 741         /*
 742          * Attachment will create the per pcb lock if necessary and
 743          * increase refcount for creation, make sure it's done before
 744          * socket is inserted in lists.
 745          */
 746         so->so_usecount++;
 747
 748         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 749         if (error != 0) {
 750                 /*
 751                  * Warning:
 752                  * If so_pcb is not zero, the socket will be leaked,
 753                  * so protocol attachment handler must be coded carefuly
 754                  */
 755                 so->so_state |= SS_NOFDREF;
 756                 VERIFY(so->so_usecount > 0);
 757                 so->so_usecount--;
 758                 sofreelastref(so, 1);   /* will deallocate the socket */
 759                 return (error);
 760         }
 761
 762         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 763         TAILQ_INIT(&so->so_evlist);
 764
 765         /* Attach socket filters for this protocol */
 766         sflt_initsock(so);
 767 #if TCPDEBUG
 768         if (tcpconsdebug == 2)
 769                 so->so_options |= SO_DEBUG;
 770 #endif
 771         so_set_default_traffic_class(so);
 772
 773         /*
 774          * If this thread or task is marked to create backgrounded sockets,
 775          * mark the socket as background.
 776          */
 777         if (proc_get_effective_thread_policy(current_thread(),
 778             TASK_POLICY_NEW_SOCKETS_BG)) {
 779                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 780                 so->so_background_thread = current_thread();
 781         }
 782
 783         switch (dom) {
 784         /*
 785          * Don't mark Unix domain, system or multipath sockets as
 786          * eligible for defunct by default.
 787          */
 788         case PF_LOCAL:
 789         case PF_SYSTEM:
 790         case PF_MULTIPATH:
 791                 so->so_flags |= SOF_NODEFUNCT;
 792                 break;
 793         default:
 794                 break;
 795         }
 796
 797         /*
 798          * Entitlements can't be checked at socket creation time except if the
 799          * application requested a feature guarded by a privilege (c.f., socket
 800          * delegation).
 801          * The priv(9) and the Sandboxing APIs are designed with the idea that
 802          * a privilege check should only be triggered by a userland request.
 803          * A privilege check at socket creation time is time consuming and
 804          * could trigger many authorisation error messages from the security
 805          * APIs.
 806          */
 807
 808         *aso = so;
 809
 810         return (0);
 811 }
 812
 813 /*
 814  * Returns:     0                       Success
 815  *              EAFNOSUPPORT
 816  *              EPROTOTYPE
 817  *              EPROTONOSUPPORT
 818  *              ENOBUFS
 819  *      <pru_attach>:ENOBUFS[AF_UNIX]
 820  *      <pru_attach>:ENOBUFS[TCP]
 821  *      <pru_attach>:ENOMEM[TCP]
 822  *      <pru_attach>:???                [other protocol families, IPSEC]
 823  */
 824 int
 825 socreate(int dom, struct socket **aso, int type, int proto)
 826 {
 827         return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
 828             PROC_NULL));
 829 }
 830
 831 int
 832 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 833 {
 834         int error = 0;
 835         struct proc *ep = PROC_NULL;
 836
 837         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 838                 error = ESRCH;
 839                 goto done;
 840         }
 841
 842         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 843
 844         /*
 845          * It might not be wise to hold the proc reference when calling
 846          * socreate_internal since it calls soalloc with M_WAITOK
 847          */
 848 done:
 849         if (ep != PROC_NULL)
 850                 proc_rele(ep);
 851
 852         return (error);
 853 }
 854
 855 /*
 856  * Returns:     0                       Success
 857  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 858  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 859  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 860  *      <pru_bind>:EINVAL               Invalid argument
 861  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 862  *      <pru_bind>:EACCES               Permission denied
 863  *      <pru_bind>:EADDRINUSE           Address in use
 864  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 865  *      <pru_bind>:EPERM                Operation not permitted
 866  *      <pru_bind>:???
 867  *      <sf_bind>:???
 868  *
 869  * Notes:       It's not possible to fully enumerate the return codes above,
 870  *              since socket filter authors and protocol family authors may
 871  *              not choose to limit their error returns to those listed, even
 872  *              though this may result in some software operating incorrectly.
 873  *
 874  *              The error codes which are enumerated above are those known to
 875  *              be returned by the tcp_usr_bind function supplied.
 876  */
 877 int
 878 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 879 {
 880         struct proc *p = current_proc();
 881         int error = 0;
 882
 883         if (dolock)
 884                 socket_lock(so, 1);
 885         VERIFY(so->so_usecount > 1);
 886
 887         so_update_last_owner_locked(so, p);
 888         so_update_policy(so);
 889
 890 #if NECP
 891         so_update_necp_policy(so, nam, NULL);
 892 #endif /* NECP */
 893
 894         /*
 895          * If this is a bind request on a socket that has been marked
 896          * as inactive, reject it now before we go any further.
 897          */
 898         if (so->so_flags & SOF_DEFUNCT) {
 899                 error = EINVAL;
 900                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 901                     __func__, proc_pid(p), proc_best_name(p),
 902                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 903                     SOCK_DOM(so), SOCK_TYPE(so), error);
 904                 goto out;
 905         }
 906
 907         /* Socket filter */
 908         error = sflt_bind(so, nam);
 909
 910         if (error == 0)
 911                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 912 out:
 913         if (dolock)
 914                 socket_unlock(so, 1);
 915
 916         if (error == EJUSTRETURN)
 917                 error = 0;
 918
 919         return (error);
 920 }
 921
 922 void
 923 sodealloc(struct socket *so)
 924 {
 925         kauth_cred_unref(&so->so_cred);
 926
 927         /* Remove any filters */
 928         sflt_termsock(so);
 929
 930 #if CONTENT_FILTER
 931         cfil_sock_detach(so);
 932 #endif /* CONTENT_FILTER */
 933
 934         /* Delete the state allocated for msg queues on a socket */
 935         if (so->so_flags & SOF_ENABLE_MSGS) {
 936                 FREE(so->so_msg_state, M_TEMP);
 937                 so->so_msg_state = NULL;
 938         }
 939         VERIFY(so->so_msg_state == NULL);
 940
 941         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 942
 943 #if CONFIG_MACF_SOCKET
 944         mac_socket_label_destroy(so);
 945 #endif /* MAC_SOCKET */
 946
 947         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 948                 cached_sock_free(so);
 949         } else {
 950                 FREE_ZONE(so, sizeof (*so), so->so_zone);
 951         }
 952 }
 953
 954 /*
 955  * Returns:     0                       Success
 956  *              EINVAL
 957  *              EOPNOTSUPP
 958  *      <pru_listen>:EINVAL[AF_UNIX]
 959  *      <pru_listen>:EINVAL[TCP]
 960  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 961  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 962  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 963  *      <pru_listen>:EACCES[TCP]        Permission denied
 964  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 965  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 966  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 967  *      <sf_listen>:???
 968  *
 969  * Notes:       Other <pru_listen> returns depend on the protocol family; all
 970  *              <sf_listen> returns depend on what the filter author causes
 971  *              their filter to return.
 972  */
 973 int
 974 solisten(struct socket *so, int backlog)
 975 {
 976         struct proc *p = current_proc();
 977         int error = 0;
 978
 979         socket_lock(so, 1);
 980
 981         so_update_last_owner_locked(so, p);
 982         so_update_policy(so);
 983
 984 #if NECP
 985         so_update_necp_policy(so, NULL, NULL);
 986 #endif /* NECP */
 987
 988         if (so->so_proto == NULL) {
 989                 error = EINVAL;
 990                 goto out;
 991         }
 992         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 993                 error = EOPNOTSUPP;
 994                 goto out;
 995         }
 996
 997         /*
 998          * If the listen request is made on a socket that is not fully
 999          * disconnected, or on a socket that has been marked as inactive,
1000          * reject the request now.
1001          */
1002         if ((so->so_state &
1003             (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1004             (so->so_flags & SOF_DEFUNCT)) {
1005                 error = EINVAL;
1006                 if (so->so_flags & SOF_DEFUNCT) {
1007                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1008                             "(%d)\n", __func__, proc_pid(p),
1009                             proc_best_name(p),
1010                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1011                             SOCK_DOM(so), SOCK_TYPE(so), error);
1012                 }
1013                 goto out;
1014         }
1015
1016         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1017                 error = EPERM;
1018                 goto out;
1019         }
1020
1021         error = sflt_listen(so);
1022         if (error == 0)
1023                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1024
1025         if (error) {
1026                 if (error == EJUSTRETURN)
1027                         error = 0;
1028                 goto out;
1029         }
1030
1031         if (TAILQ_EMPTY(&so->so_comp))
1032                 so->so_options |= SO_ACCEPTCONN;
1033         /*
1034          * POSIX: The implementation may have an upper limit on the length of
1035          * the listen queue-either global or per accepting socket. If backlog
1036          * exceeds this limit, the length of the listen queue is set to the
1037          * limit.
1038          *
1039          * If listen() is called with a backlog argument value that is less
1040          * than 0, the function behaves as if it had been called with a backlog
1041          * argument value of 0.
1042          *
1043          * A backlog argument of 0 may allow the socket to accept connections,
1044          * in which case the length of the listen queue may be set to an
1045          * implementation-defined minimum value.
1046          */
1047         if (backlog <= 0 || backlog > somaxconn)
1048                 backlog = somaxconn;
1049
1050         so->so_qlimit = backlog;
1051 out:
1052         socket_unlock(so, 1);
1053         return (error);
1054 }
1055
1056 void
1057 sofreelastref(struct socket *so, int dealloc)
1058 {
1059         struct socket *head = so->so_head;
1060
1061         /* Assume socket is locked */
1062
1063         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1064                 selthreadclear(&so->so_snd.sb_sel);
1065                 selthreadclear(&so->so_rcv.sb_sel);
1066                 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1067                 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1068                 so->so_event = sonullevent;
1069                 return;
1070         }
1071         if (head != NULL) {
1072                 /*
1073                  * Need to lock the listener when the protocol has
1074                  * per socket locks
1075                  */
1076                 if (head->so_proto->pr_getlock != NULL)
1077                         socket_lock(head, 1);
1078
1079                 if (so->so_state & SS_INCOMP) {
1080                         so->so_state &= ~SS_INCOMP;
1081                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1082                         head->so_incqlen--;
1083                         head->so_qlen--;
1084                         so->so_head = NULL;
1085                 } else if (so->so_state & SS_COMP) {
1086                         /*
1087                          * We must not decommission a socket that's
1088                          * on the accept(2) queue.  If we do, then
1089                          * accept(2) may hang after select(2) indicated
1090                          * that the listening socket was ready.
1091                          */
1092                         selthreadclear(&so->so_snd.sb_sel);
1093                         selthreadclear(&so->so_rcv.sb_sel);
1094                         so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1095                         so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1096                         so->so_event = sonullevent;
1097                         if (head->so_proto->pr_getlock != NULL)
1098                                 socket_unlock(head, 1);
1099                         return;
1100                 } else {
1101                         panic("sofree: not queued");
1102                 }
1103                 if (head->so_proto->pr_getlock != NULL)
1104                         socket_unlock(head, 1);
1105         }
1106         sowflush(so);
1107         sorflush(so);
1108
1109 #if FLOW_DIVERT
1110         if (so->so_flags & SOF_FLOW_DIVERT) {
1111                 flow_divert_detach(so);
1112         }
1113 #endif  /* FLOW_DIVERT */
1114
1115         /* 3932268: disable upcall */
1116         so->so_rcv.sb_flags &= ~SB_UPCALL;
1117         so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1118         so->so_event = sonullevent;
1119
1120         if (dealloc)
1121                 sodealloc(so);
1122 }
1123
1124 void
1125 soclose_wait_locked(struct socket *so)
1126 {
1127         lck_mtx_t *mutex_held;
1128
1129         if (so->so_proto->pr_getlock != NULL)
1130                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1131         else
1132                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1133         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1134
1135         /*
1136          * Double check here and return if there's no outstanding upcall;
1137          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1138          */
1139         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1140                 return;
1141         so->so_rcv.sb_flags &= ~SB_UPCALL;
1142         so->so_snd.sb_flags &= ~SB_UPCALL;
1143         so->so_flags |= SOF_CLOSEWAIT;
1144         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1145             "soclose_wait_locked", NULL);
1146         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1147         so->so_flags &= ~SOF_CLOSEWAIT;
1148 }
1149
1150 /*
1151  * Close a socket on last file table reference removal.
1152  * Initiate disconnect if connected.
1153  * Free socket when disconnect complete.
1154  */
1155 int
1156 soclose_locked(struct socket *so)
1157 {
1158         int error = 0;
1159         lck_mtx_t *mutex_held;
1160         struct timespec ts;
1161
1162         if (so->so_usecount == 0) {
1163                 panic("soclose: so=%p refcount=0\n", so);
1164                 /* NOTREACHED */
1165         }
1166
1167         sflt_notify(so, sock_evt_closing, NULL);
1168
1169         if (so->so_upcallusecount)
1170                 soclose_wait_locked(so);
1171
1172 #if CONTENT_FILTER
1173         /*
1174          * We have to wait until the content filters are done
1175          */
1176         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1177                 cfil_sock_close_wait(so);
1178                 cfil_sock_is_closed(so);
1179                 cfil_sock_detach(so);
1180         }
1181 #endif /* CONTENT_FILTER */
1182
1183         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1184                 soresume(current_proc(), so, 1);
1185                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1186         }
1187
1188         if ((so->so_options & SO_ACCEPTCONN)) {
1189                 struct socket *sp;
1190
1191                 /*
1192                  * We do not want new connection to be added
1193                  * to the connection queues
1194                  */
1195                 so->so_options &= ~SO_ACCEPTCONN;
1196
1197                 while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
1198                         int socklock = 0;
1199
1200                         /*
1201                          * Radar 5350314
1202                          * skip sockets thrown away by tcpdropdropblreq
1203                          * they will get cleanup by the garbage collection.
1204                          * otherwise, remove the incomp socket from the queue
1205                          * and let soabort trigger the appropriate cleanup.
1206                          */
1207                         if (sp->so_flags & SOF_OVERFLOW)
1208                                 continue;
1209
1210                         if (so->so_proto->pr_getlock != NULL) {
1211                                 /*
1212                                  * Lock ordering for consistency with the
1213                                  * rest of the stack, we lock the socket
1214                                  * first and then grab the head.
1215                                  */
1216                                 socket_unlock(so, 0);
1217                                 socket_lock(sp, 1);
1218                                 socket_lock(so, 0);
1219                                 socklock = 1;
1220                         }
1221
1222                         /*
1223                          * Radar 27945981
1224                          * The extra reference for the list insure the
1225                          * validity of the socket pointer when we perform the
1226                          * unlock of the head above
1227                          */
1228                         if (sp->so_state & SS_INCOMP) {
1229                                 sp->so_state &= ~SS_INCOMP;
1230                                 sp->so_head = NULL;
1231                                 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1232                                 so->so_incqlen--;
1233                                 so->so_qlen--;
1234
1235                                 (void) soabort(sp);
1236                         }
1237
1238                         if (socklock != 0)
1239                                 socket_unlock(sp, 1);
1240                 }
1241
1242                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1243                         int socklock = 0;
1244
1245                         /* Dequeue from so_comp since sofree() won't do it */
1246                         if (so->so_proto->pr_getlock != NULL) {
1247                                 /*
1248                                  * Lock ordering for consistency with the
1249                                  * rest of the stack, we lock the socket
1250                                  * first and then grab the head.
1251                                  */
1252                                 socket_unlock(so, 0);
1253                                 socket_lock(sp, 1);
1254                                 socket_lock(so, 0);
1255                                 socklock = 1;
1256                         }
1257
1258                         if (sp->so_state & SS_COMP) {
1259                                 sp->so_state &= ~SS_COMP;
1260                                 sp->so_head = NULL;
1261                                 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1262                                 so->so_qlen--;
1263
1264                                 (void) soabort(sp);
1265                         }
1266
1267                         if (socklock)
1268                                 socket_unlock(sp, 1);
1269                         }
1270                 }
1271         if (so->so_pcb == NULL) {
1272                 /* 3915887: mark the socket as ready for dealloc */
1273                 so->so_flags |= SOF_PCBCLEARING;
1274                 goto discard;
1275         }
1276         if (so->so_state & SS_ISCONNECTED) {
1277                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1278                         error = sodisconnectlocked(so);
1279                         if (error)
1280                                 goto drop;
1281                 }
1282                 if (so->so_options & SO_LINGER) {
1283                         if ((so->so_state & SS_ISDISCONNECTING) &&
1284                             (so->so_state & SS_NBIO))
1285                                 goto drop;
1286                         if (so->so_proto->pr_getlock != NULL)
1287                                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1288                         else
1289                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1290                         while (so->so_state & SS_ISCONNECTED) {
1291                                 ts.tv_sec = (so->so_linger/100);
1292                                 ts.tv_nsec = (so->so_linger % 100) *
1293                                     NSEC_PER_USEC * 1000 * 10;
1294                                 error = msleep((caddr_t)&so->so_timeo,
1295                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1296                                 if (error) {
1297                                         /*
1298                                          * It's OK when the time fires,
1299                                          * don't report an error
1300                                          */
1301                                         if (error == EWOULDBLOCK)
1302                                                 error = 0;
1303                                         break;
1304                                 }
1305                         }
1306                 }
1307         }
1308 drop:
1309         if (so->so_usecount == 0) {
1310                 panic("soclose: usecount is zero so=%p\n", so);
1311                 /* NOTREACHED */
1312         }
1313         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1314                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1315                 if (error == 0)
1316                         error = error2;
1317         }
1318         if (so->so_usecount <= 0) {
1319                 panic("soclose: usecount is zero so=%p\n", so);
1320                 /* NOTREACHED */
1321         }
1322 discard:
1323         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1324             (so->so_state & SS_NOFDREF)) {
1325                 panic("soclose: NOFDREF");
1326                 /* NOTREACHED */
1327         }
1328         so->so_state |= SS_NOFDREF;
1329
1330         if (so->so_flags & SOF_MP_SUBFLOW)
1331                 so->so_flags &= ~SOF_MP_SUBFLOW;
1332
1333         if ((so->so_flags & SOF_KNOTE) != 0)
1334                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1335
1336         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1337         evsofree(so);
1338
1339         VERIFY(so->so_usecount > 0);
1340         so->so_usecount--;
1341         sofree(so);
1342         return (error);
1343 }
1344
1345 int
1346 soclose(struct socket *so)
1347 {
1348         int error = 0;
1349         socket_lock(so, 1);
1350
1351         if (so->so_retaincnt == 0) {
1352                 error = soclose_locked(so);
1353         } else {
1354                 /*
1355                  * if the FD is going away, but socket is
1356                  * retained in kernel remove its reference
1357                  */
1358                 so->so_usecount--;
1359                 if (so->so_usecount < 2)
1360                         panic("soclose: retaincnt non null and so=%p "
1361                             "usecount=%d\n", so, so->so_usecount);
1362         }
1363         socket_unlock(so, 1);
1364         return (error);
1365 }
1366
1367 /*
1368  * Must be called at splnet...
1369  */
1370 /* Should already be locked */
1371 int
1372 soabort(struct socket *so)
1373 {
1374         int error;
1375
1376 #ifdef MORE_LOCKING_DEBUG
1377         lck_mtx_t *mutex_held;
1378
1379         if (so->so_proto->pr_getlock != NULL)
1380                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1381         else
1382                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1383         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1384 #endif
1385
1386         if ((so->so_flags & SOF_ABORTED) == 0) {
1387                 so->so_flags |= SOF_ABORTED;
1388                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1389                 if (error) {
1390                         sofree(so);
1391                         return (error);
1392                 }
1393         }
1394         return (0);
1395 }
1396
1397 int
1398 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1399 {
1400         int error;
1401
1402         if (dolock)
1403                 socket_lock(so, 1);
1404
1405         so_update_last_owner_locked(so, PROC_NULL);
1406         so_update_policy(so);
1407 #if NECP
1408         so_update_necp_policy(so, NULL, NULL);
1409 #endif /* NECP */
1410
1411         if ((so->so_state & SS_NOFDREF) == 0)
1412                 panic("soaccept: !NOFDREF");
1413         so->so_state &= ~SS_NOFDREF;
1414         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1415
1416         if (dolock)
1417                 socket_unlock(so, 1);
1418         return (error);
1419 }
1420
/*
 * soaccept
 *
 * Same as soacceptlock() with dolock set, i.e. the socket lock is taken
 * and released by the callee.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);

	return (error);
}
1426
1427 int
1428 soacceptfilter(struct socket *so, struct socket *head)
1429 {
1430         struct sockaddr *local = NULL, *remote = NULL;
1431         int error = 0;
1432
1433         /*
1434          * Hold the lock even if this socket has not been made visible
1435          * to the filter(s).  For sockets with global locks, this protects
1436          * against the head or peer going away
1437          */
1438         socket_lock(so, 1);
1439         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1440             sogetaddr_locked(so, &local, 0) != 0) {
1441                 so->so_state &= ~SS_NOFDREF;
1442                 socket_unlock(so, 1);
1443                 soclose(so);
1444                 /* Out of resources; try it again next time */
1445                 error = ECONNABORTED;
1446                 goto done;
1447         }
1448
1449         error = sflt_accept(head, so, local, remote);
1450
1451         /*
1452          * If we get EJUSTRETURN from one of the filters, mark this socket
1453          * as inactive and return it anyway.  This newly accepted socket
1454          * will be disconnected later before we hand it off to the caller.
1455          */
1456         if (error == EJUSTRETURN) {
1457                 error = 0;
1458                 (void) sosetdefunct(current_proc(), so,
1459                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1460         }
1461
1462         if (error != 0) {
1463                 /*
1464                  * This may seem like a duplication to the above error
1465                  * handling part when we return ECONNABORTED, except
1466                  * the following is done while holding the lock since
1467                  * the socket has been exposed to the filter(s) earlier.
1468                  */
1469                 so->so_state &= ~SS_COMP;
1470                 socket_unlock(so, 1);
1471                 soclose(so);
1472                 /* Propagate socket filter's error code to the caller */
1473         } else {
1474                 socket_unlock(so, 1);
1475         }
1476 done:
1477         /* Callee checks for NULL pointer */
1478         sock_freeaddr(remote);
1479         sock_freeaddr(local);
1480         return (error);
1481 }
1482
1483 /*
1484  * Returns:     0                       Success
1485  *              EOPNOTSUPP              Operation not supported on socket
1486  *              EISCONN                 Socket is connected
1487  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1488  *      <pru_connect>:EINVAL            Invalid argument
1489  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1490  *      <pru_connect>:EACCES            Permission denied
1491  *      <pru_connect>:EADDRINUSE        Address in use
1492  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1493  *      <pru_connect>:EPERM             Operation not permitted
1494  *      <sf_connect_out>:???            [anything a filter writer might set]
1495  */
1496 int
1497 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1498 {
1499         int error;
1500         struct proc *p = current_proc();
1501
1502         if (dolock)
1503                 socket_lock(so, 1);
1504
1505         so_update_last_owner_locked(so, p);
1506         so_update_policy(so);
1507
1508 #if NECP
1509         so_update_necp_policy(so, NULL, nam);
1510 #endif /* NECP */
1511
1512         /*
1513          * If this is a listening socket or if this is a previously-accepted
1514          * socket that has been marked as inactive, reject the connect request.
1515          */
1516         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1517                 error = EOPNOTSUPP;
1518                 if (so->so_flags & SOF_DEFUNCT) {
1519                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1520                             "(%d)\n", __func__, proc_pid(p),
1521                             proc_best_name(p),
1522                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1523                             SOCK_DOM(so), SOCK_TYPE(so), error);
1524                 }
1525                 if (dolock)
1526                         socket_unlock(so, 1);
1527                 return (error);
1528         }
1529
1530         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1531                 if (dolock)
1532                         socket_unlock(so, 1);
1533                 return (EPERM);
1534         }
1535
1536         /*
1537          * If protocol is connection-based, can only connect once.
1538          * Otherwise, if connected, try to disconnect first.
1539          * This allows user to disconnect by connecting to, e.g.,
1540          * a null address.
1541          */
1542         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1543             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1544             (error = sodisconnectlocked(so)))) {
1545                 error = EISCONN;
1546         } else {
1547                 /*
1548                  * Run connect filter before calling protocol:
1549                  *  - non-blocking connect returns before completion;
1550                  */
1551                 error = sflt_connectout(so, nam);
1552                 if (error != 0) {
1553                         if (error == EJUSTRETURN)
1554                                 error = 0;
1555                 } else {
1556                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1557                             (so, nam, p);
1558                 }
1559         }
1560         if (dolock)
1561                 socket_unlock(so, 1);
1562         return (error);
1563 }
1564
/*
 * soconnect
 *
 * Same as soconnectlock() with dolock set, i.e. the socket lock is
 * taken and released by the callee.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);

	return (error);
}
1570
1571 /*
1572  * Returns:     0                       Success
1573  *      <pru_connect2>:EINVAL[AF_UNIX]
1574  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1575  *      <pru_connect2>:???              [other protocol families]
1576  *
1577  * Notes:       <pru_connect2> is not supported by [TCP].
1578  */
1579 int
1580 soconnect2(struct socket *so1, struct socket *so2)
1581 {
1582         int error;
1583
1584         socket_lock(so1, 1);
1585         if (so2->so_proto->pr_lock)
1586                 socket_lock(so2, 1);
1587
1588         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1589
1590         socket_unlock(so1, 1);
1591         if (so2->so_proto->pr_lock)
1592                 socket_unlock(so2, 1);
1593         return (error);
1594 }
1595
1596 int
1597 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1598     struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1599     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1600     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1601 {
1602         int error;
1603
1604         so_update_last_owner_locked(so, p);
1605         so_update_policy(so);
1606
1607         /*
1608          * If this is a listening socket or if this is a previously-accepted
1609          * socket that has been marked as inactive, reject the connect request.
1610          */
1611         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1612                 error = EOPNOTSUPP;
1613                 if (so->so_flags & SOF_DEFUNCT) {
1614                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1615                             "(%d)\n", __func__, proc_pid(p),
1616                             proc_best_name(p),
1617                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1618                             SOCK_DOM(so), SOCK_TYPE(so), error);
1619                 }
1620                 return (error);
1621         }
1622
1623         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1624                 return (EPERM);
1625
1626         /*
1627          * If protocol is connection-based, can only connect once
1628          * unless PR_MULTICONN is set.  Otherwise, if connected,
1629          * try to disconnect first.  This allows user to disconnect
1630          * by connecting to, e.g., a null address.
1631          */
1632         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1633             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1634             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1635             (error = sodisconnectlocked(so)) != 0)) {
1636                 error = EISCONN;
1637         } else {
1638                 /*
1639                  * Run connect filter before calling protocol:
1640                  *  - non-blocking connect returns before completion;
1641                  */
1642                 error = sflt_connectxout(so, dst_sl);
1643                 if (error != 0) {
1644                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1645                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1646                         if (error == EJUSTRETURN)
1647                                 error = 0;
1648                 } else {
1649                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1650                             (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1651                             flags, arg, arglen, auio, bytes_written);
1652                 }
1653         }
1654
1655         return (error);
1656 }
1657
1658 int
1659 sodisconnectlocked(struct socket *so)
1660 {
1661         int error;
1662
1663         if ((so->so_state & SS_ISCONNECTED) == 0) {
1664                 error = ENOTCONN;
1665                 goto bad;
1666         }
1667         if (so->so_state & SS_ISDISCONNECTING) {
1668                 error = EALREADY;
1669                 goto bad;
1670         }
1671
1672         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1673         if (error == 0)
1674                 sflt_notify(so, sock_evt_disconnected, NULL);
1675
1676 bad:
1677         return (error);
1678 }
1679
/*
 * Locking version of sodisconnectlocked(): takes the socket lock,
 * performs the disconnect, releases the lock and returns the result.
 */
int
sodisconnect(struct socket *so)
{
        int rc;

        socket_lock(so, 1);
        rc = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return (rc);
}
1691
1692 int
1693 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1694 {
1695         int error;
1696
1697         /*
1698          * Call the protocol disconnectx handler; let it handle all
1699          * matters related to the connection state of this session.
1700          */
1701         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1702         if (error == 0) {
1703                 /*
1704                  * The event applies only for the session, not for
1705                  * the disconnection of individual subflows.
1706                  */
1707                 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1708                         sflt_notify(so, sock_evt_disconnected, NULL);
1709         }
1710         return (error);
1711 }
1712
1713 int
1714 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1715 {
1716         int error;
1717
1718         socket_lock(so, 1);
1719         error = sodisconnectxlocked(so, aid, cid);
1720         socket_unlock(so, 1);
1721         return (error);
1722 }
1723
1724 int
1725 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1726 {
1727         return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1728 }
1729
/*
 * Map MSG_* flags to the wait argument passed to sblock():
 * 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) == 0) ? SBL_WAIT : 0)
1731
1732 /*
1733  * sosendcheck will lock the socket buffer if it isn't locked and
1734  * verify that there is space for the data being inserted.
1735  *
1736  * Returns:     0                       Success
1737  *              EPIPE
1738  *      sblock:EWOULDBLOCK
1739  *      sblock:EINTR
1740  *      sbwait:EBADF
1741  *      sbwait:EINTR
1742  *      [so_error]:???
1743  */
1744 int
1745 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1746     int32_t clen, int32_t atomic, int flags, int *sblocked,
1747     struct mbuf *control)
1748 {
1749         int     error = 0;
1750         int32_t space;
1751         int     assumelock = 0;
1752
1753 restart:
1754         if (*sblocked == 0) {
1755                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1756                     so->so_send_filt_thread != 0 &&
1757                     so->so_send_filt_thread == current_thread()) {
1758                         /*
1759                          * We're being called recursively from a filter,
1760                          * allow this to continue. Radar 4150520.
1761                          * Don't set sblocked because we don't want
1762                          * to perform an unlock later.
1763                          */
1764                         assumelock = 1;
1765                 } else {
1766                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1767                         if (error) {
1768                                 if (so->so_flags & SOF_DEFUNCT)
1769                                         goto defunct;
1770                                 return (error);
1771                         }
1772                         *sblocked = 1;
1773                 }
1774         }
1775
1776         /*
1777          * If a send attempt is made on a socket that has been marked
1778          * as inactive (disconnected), reject the request.
1779          */
1780         if (so->so_flags & SOF_DEFUNCT) {
1781 defunct:
1782                 error = EPIPE;
1783                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1784                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1785                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1786                     SOCK_DOM(so), SOCK_TYPE(so), error);
1787                 return (error);
1788         }
1789
1790         if (so->so_state & SS_CANTSENDMORE) {
1791 #if CONTENT_FILTER
1792                 /*
1793                  * Can re-inject data of half closed connections
1794                  */
1795                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1796                         so->so_snd.sb_cfil_thread == current_thread() &&
1797                         cfil_sock_data_pending(&so->so_snd) != 0)
1798                         CFIL_LOG(LOG_INFO,
1799                                 "so %llx ignore SS_CANTSENDMORE",
1800                                 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1801                 else
1802 #endif /* CONTENT_FILTER */
1803                         return (EPIPE);
1804         }
1805         if (so->so_error) {
1806                 error = so->so_error;
1807                 so->so_error = 0;
1808                 return (error);
1809         }
1810
1811         if ((so->so_state & SS_ISCONNECTED) == 0) {
1812                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1813                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1814                             (resid != 0 || clen == 0) &&
1815                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1816 #if MPTCP
1817                                 /*
1818                                  * MPTCP Fast Join sends data before the
1819                                  * socket is truly connected.
1820                                  */
1821                                 if ((so->so_flags & (SOF_MP_SUBFLOW |
1822                                         SOF_MPTCP_FASTJOIN)) !=
1823                                     (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1824 #endif /* MPTCP */
1825                                 return (ENOTCONN);
1826                         }
1827                 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1828                         return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1829                             ENOTCONN : EDESTADDRREQ);
1830                 }
1831         }
1832
1833         if (so->so_flags & SOF_ENABLE_MSGS)
1834                 space = msgq_sbspace(so, control);
1835         else
1836                 space = sbspace(&so->so_snd);
1837
1838         if (flags & MSG_OOB)
1839                 space += 1024;
1840         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1841             clen > so->so_snd.sb_hiwat)
1842                 return (EMSGSIZE);
1843
1844         if ((space < resid + clen &&
1845             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1846             space < clen)) ||
1847             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1848                 /*
1849                  * don't block the connectx call when there's more data
1850                  * than can be copied.
1851                  */
1852                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1853                         if (space == 0) {
1854                                 return (EWOULDBLOCK);
1855                         }
1856                         if (space < (int32_t)so->so_snd.sb_lowat) {
1857                                 return (0);
1858                         }
1859                 }
1860                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1861                     assumelock) {
1862                         return (EWOULDBLOCK);
1863                 }
1864                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
1865                 *sblocked = 0;
1866                 error = sbwait(&so->so_snd);
1867                 if (error) {
1868                         if (so->so_flags & SOF_DEFUNCT)
1869                                 goto defunct;
1870                         return (error);
1871                 }
1872                 goto restart;
1873         }
1874         return (0);
1875 }
1876
1877 /*
1878  * Send on a socket.
1879  * If send must go all at once and message is larger than
1880  * send buffering, then hard error.
1881  * Lock against other senders.
1882  * If must go all at once and not enough room now, then
1883  * inform user that this would block and do nothing.
1884  * Otherwise, if nonblocking, send as much as possible.
1885  * The data to be sent is described by "uio" if nonzero,
1886  * otherwise by the mbuf chain "top" (which must be null
1887  * if uio is not).  Data provided in mbuf chain must be small
1888  * enough to send all at once.
1889  *
1890  * Returns nonzero on error, timeout or signal; callers
1891  * must check for short counts if EINTR/ERESTART are returned.
1892  * Data and control buffers are freed on return.
1893  * Experiment:
1894  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1895  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1896  *  point at the mbuf chain being constructed and go from there.
1897  *
1898  * Returns:     0                       Success
1899  *              EOPNOTSUPP
1900  *              EINVAL
1901  *              ENOBUFS
1902  *      uiomove:EFAULT
1903  *      sosendcheck:EPIPE
1904  *      sosendcheck:EWOULDBLOCK
1905  *      sosendcheck:EINTR
1906  *      sosendcheck:EBADF
1907  *      sosendcheck:EINTR
1908  *      sosendcheck:???                 [value from so_error]
1909  *      <pru_send>:ECONNRESET[TCP]
1910  *      <pru_send>:EINVAL[TCP]
1911  *      <pru_send>:ENOBUFS[TCP]
1912  *      <pru_send>:EADDRINUSE[TCP]
1913  *      <pru_send>:EADDRNOTAVAIL[TCP]
1914  *      <pru_send>:EAFNOSUPPORT[TCP]
1915  *      <pru_send>:EACCES[TCP]
1916  *      <pru_send>:EAGAIN[TCP]
1917  *      <pru_send>:EPERM[TCP]
1918  *      <pru_send>:EMSGSIZE[TCP]
1919  *      <pru_send>:EHOSTUNREACH[TCP]
1920  *      <pru_send>:ENETUNREACH[TCP]
1921  *      <pru_send>:ENETDOWN[TCP]
1922  *      <pru_send>:ENOMEM[TCP]
1923  *      <pru_send>:ENOBUFS[TCP]
1924  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
1925  *      <pru_send>:EINVAL[AF_UNIX]
1926  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
1927  *      <pru_send>:EPIPE[AF_UNIX]
1928  *      <pru_send>:ENOTCONN[AF_UNIX]
1929  *      <pru_send>:EISCONN[AF_UNIX]
1930  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
1931  *      <sf_data_out>:???               [whatever a filter author chooses]
1932  *
1933  * Notes:       Other <pru_send> returns depend on the protocol family; all
1934  *              <sf_data_out> returns depend on what the filter author causes
1935  *              their filter to return.
1936  */
1937 int
1938 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1939     struct mbuf *top, struct mbuf *control, int flags)
1940 {
1941         struct mbuf **mp;
1942         struct mbuf *m, *freelist = NULL;
1943         user_ssize_t space, len, resid, orig_resid;
1944         int clen = 0, error, dontroute, mlen, sendflags;
1945         int atomic = sosendallatonce(so) || top;
1946         int sblocked = 0;
1947         struct proc *p = current_proc();
1948         struct mbuf *control_copy = NULL;
1949         uint16_t headroom = 0;
1950         boolean_t en_tracing = FALSE;
1951
1952         if (uio != NULL)
1953                 resid = uio_resid(uio);
1954         else
1955                 resid = top->m_pkthdr.len;
1956
1957         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1958             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1959
1960         socket_lock(so, 1);
1961
1962         /*
1963          * Trace only when tracing is enabled, the socket is a network
1964          * (vs. unix) socket, and the last output interface is non-loopback.
1965          */
1966         if (ENTR_SHOULDTRACE &&
1967             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1968                 struct inpcb *inp = sotoinpcb(so);
1969                 if (inp->inp_last_outifp != NULL &&
1970                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1971                         en_tracing = TRUE;
1972                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1973                             VM_KERNEL_ADDRPERM(so),
1974                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1975                             (int64_t)resid);
1976                         orig_resid = resid;
1977                 }
1978         }
1979
1980         /*
1981          * Re-injection should not affect process accounting
1982          */
1983         if ((flags & MSG_SKIPCFIL) == 0) {
1984                 so_update_last_owner_locked(so, p);
1985                 so_update_policy(so);
1986
1987 #if NECP
1988                 so_update_necp_policy(so, NULL, addr);
1989 #endif /* NECP */
1990         }
1991
1992         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1993                 error = EOPNOTSUPP;
1994                 socket_unlock(so, 1);
1995                 goto out;
1996         }
1997
1998         /*
1999          * In theory resid should be unsigned.
2000          * However, space must be signed, as it might be less than 0
2001          * if we over-committed, and we must use a signed comparison
2002          * of space and resid.  On the other hand, a negative resid
2003          * causes us to loop sending 0-length segments to the protocol.
2004          *
2005          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2006          * But it will be used by sockets doing message delivery.
2007          *
2008          * Note: We limit resid to be a positive int value as we use
2009          * imin() to set bytes_to_copy -- radr://14558484
2010          */
2011         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2012             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2013                 error = EINVAL;
2014                 socket_unlock(so, 1);
2015                 goto out;
2016         }
2017
2018         dontroute = (flags & MSG_DONTROUTE) &&
2019             (so->so_options & SO_DONTROUTE) == 0 &&
2020             (so->so_proto->pr_flags & PR_ATOMIC);
2021         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2022
2023         if (control != NULL)
2024                 clen = control->m_len;
2025
2026         if (soreserveheadroom != 0)
2027                 headroom = so->so_pktheadroom;
2028
2029         do {
2030                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2031                     &sblocked, control);
2032                 if (error)
2033                         goto release;
2034
2035                 mp = &top;
2036                 if (so->so_flags & SOF_ENABLE_MSGS)
2037                         space = msgq_sbspace(so, control);
2038                 else
2039                         space = sbspace(&so->so_snd) - clen;
2040                 space += ((flags & MSG_OOB) ? 1024 : 0);
2041
2042                 do {
2043                         if (uio == NULL) {
2044                                 /*
2045                                  * Data is prepackaged in "top".
2046                                  */
2047                                 resid = 0;
2048                                 if (flags & MSG_EOR)
2049                                         top->m_flags |= M_EOR;
2050                         } else {
2051                                 int chainlength;
2052                                 int bytes_to_copy;
2053                                 boolean_t jumbocl;
2054                                 boolean_t bigcl;
2055                                 int bytes_to_alloc;
2056
2057                                 bytes_to_copy = imin(resid, space);
2058
2059                                 bytes_to_alloc = bytes_to_copy;
2060                                 if (top == NULL)
2061                                         bytes_to_alloc += headroom;
2062
2063                                 if (sosendminchain > 0)
2064                                         chainlength = 0;
2065                                 else
2066                                         chainlength = sosendmaxchain;
2067
2068                                 /*
2069                                  * Use big 4 KB cluster when the outgoing interface
2070                                  * does not prefer 2 KB clusters
2071                                  */
2072                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2073                                     sosendbigcl_ignore_capab;
2074
2075                                 /*
2076                                  * Attempt to use larger than system page-size
2077                                  * clusters for large writes only if there is
2078                                  * a jumbo cluster pool and if the socket is
2079                                  * marked accordingly.
2080                                  */
2081                                 jumbocl = sosendjcl && njcl > 0 &&
2082                                     ((so->so_flags & SOF_MULTIPAGES) ||
2083                                     sosendjcl_ignore_capab) &&
2084                                     bigcl;
2085
2086                                 socket_unlock(so, 0);
2087
2088                                 do {
2089                                         int num_needed;
2090                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2091
2092                                         /*
2093                                          * Try to maintain a local cache of mbuf
2094                                          * clusters needed to complete this
2095                                          * write.  The list is further limited to
2096                                          * the number that are currently needed
2097                                          * to fill the socket.  This mechanism
2098                                          * allows a large number of mbufs/
2099                                          * clusters to be grabbed under a single
2100                                          * mbuf lock.  If we can't get any
2101                                          * clusters, then fall back to trying
2102                                          * for mbufs.  If we fail early (or
2103                                          * miscalculate the number needed), make
2104                                          * sure to release any clusters we
2105                                          * haven't yet consumed.
2106                                          */
2107                                         if (freelist == NULL &&
2108                                             bytes_to_alloc > MBIGCLBYTES &&
2109                                             jumbocl) {
2110                                                 num_needed =
2111                                                     bytes_to_alloc / M16KCLBYTES;
2112
2113                                                 if ((bytes_to_alloc -
2114                                                     (num_needed * M16KCLBYTES))
2115                                                     >= MINCLSIZE)
2116                                                         num_needed++;
2117
2118                                                 freelist =
2119                                                     m_getpackets_internal(
2120                                                     (unsigned int *)&num_needed,
2121                                                     hdrs_needed, M_WAIT, 0,
2122                                                     M16KCLBYTES);
2123                                                 /*
2124                                                  * Fall back to 4K cluster size
2125                                                  * if allocation failed
2126                                                  */
2127                                         }
2128
2129                                         if (freelist == NULL &&
2130                                             bytes_to_alloc > MCLBYTES &&
2131                                             bigcl) {
2132                                                 num_needed =
2133                                                     bytes_to_alloc / MBIGCLBYTES;
2134
2135                                                 if ((bytes_to_alloc -
2136                                                     (num_needed * MBIGCLBYTES)) >=
2137                                                     MINCLSIZE)
2138                                                         num_needed++;
2139
2140                                                 freelist =
2141                                                     m_getpackets_internal(
2142                                                     (unsigned int *)&num_needed,
2143                                                     hdrs_needed, M_WAIT, 0,
2144                                                     MBIGCLBYTES);
2145                                                 /*
2146                                                  * Fall back to cluster size
2147                                                  * if allocation failed
2148                                                  */
2149                                         }
2150
2151                                         /*
2152                                          * Allocate a cluster as we want to
2153                                          * avoid splitting the data into more
2154                                          * than one segment, and using MINCLSIZE
2155                                          * would lead us to allocate two mbufs.
2156                                          */
2157                                         if (soreserveheadroom != 0 &&
2158                                             freelist == NULL &&
2159                                             ((top == NULL &&
2160                                             bytes_to_alloc > _MHLEN) ||
2161                                             bytes_to_alloc > _MLEN)) {
2162                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2163                                                     MCLBYTES;
2164                                                 freelist =
2165                                                     m_getpackets_internal(
2166                                                     (unsigned int *)&num_needed,
2167                                                     hdrs_needed, M_WAIT, 0,
2168                                                     MCLBYTES);
2169                                                 /*
2170                                                  * Fall back to a single mbuf
2171                                                  * if allocation failed
2172                                                  */
2173                                         } else if (freelist == NULL &&
2174                                             bytes_to_alloc > MINCLSIZE) {
2175                                                 num_needed =
2176                                                     bytes_to_alloc / MCLBYTES;
2177
2178                                                 if ((bytes_to_alloc -
2179                                                     (num_needed * MCLBYTES)) >=
2180                                                     MINCLSIZE)
2181                                                         num_needed++;
2182
2183                                                 freelist =
2184                                                     m_getpackets_internal(
2185                                                     (unsigned int *)&num_needed,
2186                                                     hdrs_needed, M_WAIT, 0,
2187                                                     MCLBYTES);
2188                                                 /*
2189                                                  * Fall back to a single mbuf
2190                                                  * if allocation failed
2191                                                  */
2192                                         }
2193                                         /*
2194                                          * For datagram protocols, leave
2195                                          * headroom for protocol headers
2196                                          * in the first cluster of the chain
2197                                          */
2198                                         if (freelist != NULL && atomic &&
2199                                             top == NULL && headroom > 0) {
2200                                                 freelist->m_data += headroom;
2201                                         }
2202
2203                                         /*
2204                                          * Fall back to regular mbufs without
2205                                          * reserving the socket headroom
2206                                          */
2207                                         if (freelist == NULL) {
2208                                                 if (top == NULL)
2209                                                         MGETHDR(freelist,
2210                                                             M_WAIT, MT_DATA);
2211                                                 else
2212                                                         MGET(freelist,
2213                                                             M_WAIT, MT_DATA);
2214
2215                                                 if (freelist == NULL) {
2216                                                         error = ENOBUFS;
2217                                                         socket_lock(so, 0);
2218                                                         goto release;
2219                                                 }
2220                                                 /*
2221                                                  * For datagram protocols,
2222                                                  * leave room for protocol
2223                                                  * headers in first mbuf.
2224                                                  */
2225                                                 if (atomic && top == NULL &&
2226                                                     bytes_to_copy < MHLEN) {
2227                                                         MH_ALIGN(freelist,
2228                                                             bytes_to_copy);
2229                                                 }
2230                                         }
2231                                         m = freelist;
2232                                         freelist = m->m_next;
2233                                         m->m_next = NULL;
2234
2235                                         if ((m->m_flags & M_EXT))
2236                                                 mlen = m->m_ext.ext_size -
2237                                                     m_leadingspace(m);
2238                                         else if ((m->m_flags & M_PKTHDR))
2239                                                 mlen =
2240                                                     MHLEN - m_leadingspace(m);
2241                                         else
2242                                                 mlen = MLEN - m_leadingspace(m);
2243                                         len = imin(mlen, bytes_to_copy);
2244
2245                                         chainlength += len;
2246
2247                                         space -= len;
2248
2249                                         error = uiomove(mtod(m, caddr_t),
2250                                             len, uio);
2251
2252                                         resid = uio_resid(uio);
2253
2254                                         m->m_len = len;
2255                                         *mp = m;
2256                                         top->m_pkthdr.len += len;
2257                                         if (error)
2258                                                 break;
2259                                         mp = &m->m_next;
2260                                         if (resid <= 0) {
2261                                                 if (flags & MSG_EOR)
2262                                                         top->m_flags |= M_EOR;
2263                                                 break;
2264                                         }
2265                                         bytes_to_copy = min(resid, space);
2266
2267                                 } while (space > 0 &&
2268                                     (chainlength < sosendmaxchain || atomic ||
2269                                     resid < MINCLSIZE));
2270
2271                                 socket_lock(so, 0);
2272
2273                                 if (error)
2274                                         goto release;
2275                         }
2276
2277                         if (flags & (MSG_HOLD|MSG_SEND)) {
2278                                 /* Enqueue for later, go away if HOLD */
2279                                 struct mbuf *mb1;
2280                                 if (so->so_temp && (flags & MSG_FLUSH)) {
2281                                         m_freem(so->so_temp);
2282                                         so->so_temp = NULL;
2283                                 }
2284                                 if (so->so_temp)
2285                                         so->so_tail->m_next = top;
2286                                 else
2287                                         so->so_temp = top;
2288                                 mb1 = top;
2289                                 while (mb1->m_next)
2290                                         mb1 = mb1->m_next;
2291                                 so->so_tail = mb1;
2292                                 if (flags & MSG_HOLD) {
2293                                         top = NULL;
2294                                         goto release;
2295                                 }
2296                                 top = so->so_temp;
2297                         }
2298                         if (dontroute)
2299                                 so->so_options |= SO_DONTROUTE;
2300
2301                         /*
2302                          * Compute flags here, for pru_send and NKEs
2303                          *
2304                          * If the user set MSG_EOF, the protocol
2305                          * understands this flag and nothing left to
2306                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2307                          */
2308                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2309                             ((flags & MSG_EOF) &&
2310                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2311                             (resid <= 0)) ? PRUS_EOF :
2312                             /* If there is more to send set PRUS_MORETOCOME */
2313                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2314
2315                         if ((flags & MSG_SKIPCFIL) == 0) {
2316                                 /*
2317                                  * Socket filter processing
2318                                  */
2319                                 error = sflt_data_out(so, addr, &top,
2320                                     &control, (sendflags & MSG_OOB) ?
2321                                     sock_data_filt_flag_oob : 0);
2322                                 if (error) {
2323                                         if (error == EJUSTRETURN) {
2324                                                 error = 0;
2325                                                 clen = 0;
2326                                                 control = NULL;
2327                                                 top = NULL;
2328                                         }
2329                                         goto release;
2330                                 }
2331 #if CONTENT_FILTER
2332                                 /*
2333                                  * Content filter processing
2334                                  */
2335                                 error = cfil_sock_data_out(so, addr, top,
2336                                     control, (sendflags & MSG_OOB) ?
2337                                     sock_data_filt_flag_oob : 0);
2338                                 if (error) {
2339                                         if (error == EJUSTRETURN) {
2340                                                 error = 0;
2341                                                 clen = 0;
2342                                                 control = NULL;
2343                                                 top = NULL;
2344                                                 }
2345                                         goto release;
2346                                 }
2347 #endif /* CONTENT_FILTER */
2348                         }
2349                         if (so->so_flags & SOF_ENABLE_MSGS) {
2350                                 /*
2351                                  * Make a copy of control mbuf,
2352                                  * so that msg priority can be
2353                                  * passed to subsequent mbufs.
2354                                  */
2355                                 control_copy = m_dup(control, M_NOWAIT);
2356                         }
2357                         error = (*so->so_proto->pr_usrreqs->pru_send)
2358                             (so, sendflags, top, addr, control, p);
2359
2360                         if (flags & MSG_SEND)
2361                                 so->so_temp = NULL;
2362
2363                         if (dontroute)
2364                                 so->so_options &= ~SO_DONTROUTE;
2365
2366                         clen = 0;
2367                         control = control_copy;
2368                         control_copy = NULL;
2369                         top = NULL;
2370                         mp = &top;
2371                         if (error)
2372                                 goto release;
2373                 } while (resid && space > 0);
2374         } while (resid);
2375
2376 release:
2377         if (sblocked)
2378                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2379         else
2380                 socket_unlock(so, 1);
2381 out:
2382         if (top != NULL)
2383                 m_freem(top);
2384         if (control != NULL)
2385                 m_freem(control);
2386         if (freelist != NULL)
2387                 m_freem_list(freelist);
2388         if (control_copy != NULL)
2389                 m_freem(control_copy);
2390
2391         /*
2392          * One write has been done. This was enough. Get back to "normal"
2393          * behavior.
2394          */
2395         if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2396                 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2397
2398         if (en_tracing) {
2399                 /* resid passed here is the bytes left in uio */
2400                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2401                     VM_KERNEL_ADDRPERM(so),
2402                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2403                     (int64_t)(orig_resid - resid));
2404         }
2405         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2406             so->so_snd.sb_cc, space, error);
2407
2408         return (error);
2409 }
2410
2411 /*
2412  * Supports only connected sockets (no address) without ancillary data
2413  * (control mbuf) for atomic protocols.
      *
      * Copies every non-empty uio of uioarray into its own mbuf packet and
      * hands the packets, linked through m_nextpkt, to the protocol's
      * pru_send_list entry point.  Empty uios are skipped.  The uios are
      * processed in batches; a batch is closed once its accumulated length
      * exceeds sosendmaxchain.
      *
      * Parameters:
      *      so              socket to send on; must be of type SOCK_DGRAM and
      *                      sosendallatonce(so) must be true
      *      uioarray        array of uio pointers, one datagram per entry
      *      uiocnt          number of entries in uioarray
      *      flags           only MSG_DONTWAIT and MSG_NBIO are accepted
      *
      * The socket lock is taken and released inside this function; it is
      * dropped while user data is copied into the mbufs.
      *
      * Returns:     0                       Success
      *              EINVAL                  wrong socket type, non-atomic
      *                                      protocol, unsupported flag or
      *                                      total length outside [0, INT_MAX]
      *              EPROTONOSUPPORT         protocol has no pru_send_list
      *              ENOMEM                  mbuf allocation failed
      *      sosendcheck:???
      *      uiomove:???
      *      sflt_data_out:???
      *      cfil_sock_data_out:???
      *      <pru_send_list>:???
2414  */
2415 int
2416 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2417 {
2418         struct mbuf *m, *freelist = NULL;
2419         user_ssize_t len, resid;
2420         int error, dontroute, mlen;
2421         int atomic = sosendallatonce(so);
2422         int sblocked = 0;
2423         struct proc *p = current_proc();
              /* [uiofirst, uiolast) is the range of uioarray in the current batch */
2424         u_int uiofirst = 0;
2425         u_int uiolast = 0;
2426         struct mbuf *top = NULL;
2427         uint16_t headroom = 0;
2428         boolean_t bigcl;
2429
2430         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2431             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2432
2433         if (so->so_type != SOCK_DGRAM) {
2434                 error = EINVAL;
2435                 goto out;
2436         }
2437         if (atomic == 0) {
2438                 error = EINVAL;
2439                 goto out;
2440         }
2441         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2442                 error = EPROTONOSUPPORT;
2443                 goto out;
2444         }
2445         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2446                 error = EINVAL;
2447                 goto out;
2448         }
2449         resid = uio_array_resid(uioarray, uiocnt);
2450
2451         /*
2452          * In theory resid should be unsigned.
2453          * However, space must be signed, as it might be less than 0
2454          * if we over-committed, and we must use a signed comparison
2455          * of space and resid.  On the other hand, a negative resid
2456          * causes us to loop sending 0-length segments to the protocol.
2457          *
2458          * Note: We limit resid to be a positive int value as we use
2459          * imin() to set bytes_to_copy -- radr://14558484
2460          */
2461         if (resid < 0 || resid > INT_MAX) {
2462                 error = EINVAL;
2463                 goto out;
2464         }
2465
2466         socket_lock(so, 1);
2467         so_update_last_owner_locked(so, p);
2468         so_update_policy(so);
2469
2470 #if NECP
2471         so_update_necp_policy(so, NULL, NULL);
2472 #endif /* NECP */
2473
2474         dontroute = (flags & MSG_DONTROUTE) &&
2475             (so->so_options & SO_DONTROUTE) == 0 &&
2476             (so->so_proto->pr_flags & PR_ATOMIC);
2477         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2478
2479         error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2480             &sblocked, NULL);
2481         if (error)
2482                 goto release;
2483
2484         /*
2485          * Use big 4 KB clusters when the outgoing interface does not prefer
2486          * 2 KB clusters
2487          */
2488         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2489
2490         if (soreserveheadroom != 0)
2491                 headroom = so->so_pktheadroom;
2492
2493         do {
2494                 int i;
2495                 int num_needed = 0;
2496                 int chainlength;
2497                 size_t maxpktlen = 0;
2498                 int bytes_to_alloc;
2499
2500                 if (sosendminchain > 0)
2501                         chainlength = 0;
2502                 else
2503                         chainlength = sosendmaxchain;
2504
                      /*
                       * The socket lock is not held while the batch is sized,
                       * allocated and filled from user space; it is re-taken
                       * on every path that leaves this unlocked section.
                       */
2505                 socket_unlock(so, 0);
2506
2507                 /*
2508                  * Find a set of uio that fit in a reasonable number
2509                  * of mbuf packets
2510                  */
2511                 for (i = uiofirst; i < uiocnt; i++) {
2512                         struct uio *auio = uioarray[i];
2513
2514                         len = uio_resid(auio);
2515
2516                         /* Do nothing for empty messages */
2517                         if (len == 0)
2518                                 continue;
2519
2520                         num_needed += 1;
                              /*
                               * uiolast is an exclusive index into uioarray,
                               * not a count of non-empty uios: it must step
                               * over any empty uio skipped above, otherwise
                               * the copy loop below stops before reaching this
                               * uio and a zero-length packet is sent.
                               */
2521                         uiolast = i + 1;
2522
2523                         if (len > maxpktlen)
2524                                 maxpktlen = len;
2525
2526                         chainlength += len;
2527                         if (chainlength > sosendmaxchain)
2528                                 break;
2529                 }
2530                 /*
2531                  * Nothing left to send
2532                  */
2533                 if (num_needed == 0) {
2534                         socket_lock(so, 0);
2535                         break;
2536                 }
2537                 /*
2538                  * Allocate buffer large enough to include headroom space for
2539                  * network and link header
2540                  *
2541                  */
2542                 bytes_to_alloc = maxpktlen + headroom;
2543
2544                 /*
2545                  * Allocate a single contiguous buffer of the smallest available
2546                  * size when possible
2547                  */
2548                 if (bytes_to_alloc > MCLBYTES &&
2549                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2550                         freelist = m_getpackets_internal(
2551                             (unsigned int *)&num_needed,
2552                             num_needed, M_WAIT, 1,
2553                             MBIGCLBYTES);
2554                 } else if (bytes_to_alloc > _MHLEN &&
2555                     bytes_to_alloc <= MCLBYTES) {
2556                         freelist = m_getpackets_internal(
2557                             (unsigned int *)&num_needed,
2558                             num_needed, M_WAIT, 1,
2559                             MCLBYTES);
2560                 } else {
2561                         freelist = m_allocpacket_internal(
2562                             (unsigned int *)&num_needed,
2563                             bytes_to_alloc, NULL, M_WAIT, 1, 0);
2564                 }
2565
2566                 if (freelist == NULL) {
2567                         socket_lock(so, 0);
2568                         error = ENOMEM;
2569                         goto release;
2570                 }
2571                 /*
2572                  * Copy each uio of the set into its own mbuf packet
2573                  */
2574                 for (i = uiofirst, m = freelist;
2575                     i < uiolast && m != NULL;
2576                     i++) {
2577                         int bytes_to_copy;
2578                         struct mbuf *n;
2579                         struct uio *auio = uioarray[i];
2580
2581                         bytes_to_copy = uio_resid(auio);
2582
2583                         /* Do nothing for empty messages */
2584                         if (bytes_to_copy == 0)
2585                                 continue;
2586                         /*
2587                          * Leave headroom for protocol headers
2588                          * in the first mbuf of the chain
2589                          */
2590                         m->m_data += headroom;
2591
2592                         for (n = m; n != NULL; n = n->m_next) {
                                      /*
                                       * Capacity is a property of the mbuf
                                       * being filled (n), not of the head of
                                       * the packet (m): only the head carries
                                       * the headroom and the packet header.
                                       */
2593                                 if ((n->m_flags & M_EXT))
2594                                         mlen = n->m_ext.ext_size -
2595                                             m_leadingspace(n);
2596                                 else if ((n->m_flags & M_PKTHDR))
2597                                         mlen =
2598                                             MHLEN - m_leadingspace(n);
2599                                 else
2600                                         mlen = MLEN - m_leadingspace(n);
2601                                 len = imin(mlen, bytes_to_copy);
2602
2603                                 /*
2604                                  * Note: uiomove() decrements the iovec
2605                                  * length
2606                                  */
2607                                 error = uiomove(mtod(n, caddr_t),
2608                                     len, auio);
2609                                 if (error != 0)
2610                                         break;
2611                                 n->m_len = len;
2612                                 m->m_pkthdr.len += len;
2613
2614                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2615
2616                                 bytes_to_copy -= len;
2617                                 resid -= len;
2618                         }
2619                         if (m->m_pkthdr.len == 0) {
2620                                 printf(
2621                                     "%s:%d so %llx pkt %llx type %u len null\n",
2622                                     __func__, __LINE__,
2623                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2624                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2625                                     m->m_type);
2626                         }
2627                         if (error != 0)
2628                                 break;
2629                         m = m->m_nextpkt;
2630                 }
2631
2632                 socket_lock(so, 0);
2633
                      /* On a copy error freelist still owns the batch; freed at out */
2634                 if (error)
2635                         goto release;
2636                 top = freelist;
2637                 freelist = NULL;
2638
2639                 if (dontroute)
2640                         so->so_options |= SO_DONTROUTE;
2641
2642                 if ((flags & MSG_SKIPCFIL) == 0) {
                              /*
                               * Address of the m_nextpkt link of the last
                               * packet kept on the list.  NULL while no packet
                               * has been kept, in which case "top" is the link
                               * to update when a packet is removed.
                               */
2643                         struct mbuf **prevnextp = NULL;
2644
2645                         for (i = uiofirst, m = top;
2646                             i < uiolast && m != NULL;
2647                             i++) {
2648                                 struct mbuf *nextpkt = m->m_nextpkt;
2649
2650                                 /*
2651                                  * Socket filter processing
2652                                  */
2653                                 error = sflt_data_out(so, NULL, &m,
2654                                     NULL, 0);
2655                                 if (error != 0 && error != EJUSTRETURN)
2656                                         goto release;
2657
2658 #if CONTENT_FILTER
2659                                 if (error == 0) {
2660                                         /*
2661                                          * Content filter processing
2662                                          */
2663                                         error = cfil_sock_data_out(so, NULL, m,
2664                                             NULL, 0);
2665                                         if (error != 0 && error != EJUSTRETURN)
2666                                                 goto release;
2667                                 }
2668 #endif /* CONTENT_FILTER */
2669                                 /*
2670                                  * Remove packet from the list when
2671                                  * swallowed by a filter
2672                                  */
2673                                 if (error == EJUSTRETURN) {
2674                                         error = 0;
2675                                         if (prevnextp != NULL)
2676                                                 *prevnextp = nextpkt;
2677                                         else
2678                                                 top = nextpkt;
2679                                 } else {
                                              /*
                                               * The packet stays on the list
                                               * and becomes the predecessor
                                               * of the next one examined.
                                               */
2680                                         prevnextp = &m->m_nextpkt;
2681                                 }
2682
2683                                 m = nextpkt;
2684                         }
2685                 }
                      /* top is NULL when every packet was swallowed by a filter */
2686                 if (top != NULL)
2687                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2688                             (so, 0, top, NULL, NULL, p);
2689
2690                 if (dontroute)
2691                         so->so_options &= ~SO_DONTROUTE;
2692
2693                 top = NULL;
2694                 uiofirst = uiolast;
2695         } while (resid > 0 && error == 0);
2696 release:
2697         if (sblocked)
2698                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2699         else
2700                 socket_unlock(so, 1);
2701 out:
              /*
               * "top" heads a list of packets linked through m_nextpkt (it is
               * taken from freelist), so release the whole list rather than
               * only its first packet.
               */
2702         if (top != NULL)
2703                 m_freem_list(top);
2704         if (freelist != NULL)
2705                 m_freem_list(freelist);
2706
2707         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2708             so->so_snd.sb_cc, 0, error);
2709
2710         return (error);
2711 }
2712
2713 /*
2714  * Handle the MT_SONAME (address) mbuf that starts the record *mp on the
      * receive buffer of so.
      *
      * When psa is not NULL, a copy of the address made by dup_sockaddr() is
      * stored in *psa.  With MSG_PEEK the address mbuf is left in the socket
      * buffer and only skipped; otherwise it is released and the remainder of
      * the record (or the next record) becomes the head of so_rcv.
      *
      * Parameters:
      *      p               calling process; used for the MAC check
      *      so              socket being received on
      *      psa             where to store the duplicated address, or NULL
      *      flags           MSG_PEEK and MSG_NEEDSA are examined
      *      mp              in: address mbuf; out: mbuf to continue with
      *      nextrecordp     in/out: record following the current one
      *      canwait         passed through to dup_sockaddr()
      *
      * Returns:     0               Success
      *              ERESTART        record rejected by the MAC policy and
      *                              freed; sbunlock() has been called on
      *                              so_rcv with the socket kept locked
      *              ENOTCONN        socket found defunct after the MAC check;
      *                              record freed
      *              EWOULDBLOCK     dup_sockaddr() returned NULL and
      *                              MSG_NEEDSA is set
      *
      * NOTE(review): on the ERESTART and ENOTCONN paths the record has been
      * freed but *mp is still set to it; callers must not use *mp on error.
      *
      * May return ERESTART when packet is dropped by MAC policy check
2715  */
2716 static int
2717 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2718     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2719 {
2720         int error = 0;
2721         struct mbuf *m = *mp;
2722         struct mbuf *nextrecord = *nextrecordp;
2723
2724         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2725 #if CONFIG_MACF_SOCKET_SUBSET
2726         /*
2727          * Call the MAC framework for policy checking if we're in
2728          * the user process context and the socket isn't connected.
2729          */
2730         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
                      /* Remember the head of the record; m is used as a cursor */
2731                 struct mbuf *m0 = m;
2732                 /*
2733                  * Dequeue this record (temporarily) from the receive
2734                  * list since we're about to drop the socket's lock
2735                  * where a new record may arrive and be appended to
2736                  * the list.  Upon MAC policy failure, the record
2737                  * will be freed.  Otherwise, we'll add it back to
2738                  * the head of the list.  We cannot rely on SB_LOCK
2739                  * because append operation uses the socket's lock.
2740                  */
2741                 do {
2742                         m->m_nextpkt = NULL;
2743                         sbfree(&so->so_rcv, m);
2744                         m = m->m_next;
2745                 } while (m != NULL);
2746                 m = m0;
2747                 so->so_rcv.sb_mb = nextrecord;
2748                 SB_EMPTY_FIXUP(&so->so_rcv);
2749                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2750                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2751                 socket_unlock(so, 0);
2752
2753                 if (mac_socket_check_received(proc_ucred(p), so,
2754                     mtod(m, struct sockaddr *)) != 0) {
2755                         /*
2756                          * MAC policy failure; free this record and
2757                          * process the next record (or block until
2758                          * one is available).  We have adjusted sb_cc
2759                          * and sb_mbcnt above so there is no need to
2760                          * call sbfree() again.
2761                          */
2762                         m_freem(m);
2763                         /*
2764                          * Clear SB_LOCK but don't unlock the socket.
2765                          * Process the next record or wait for one.
2766                          */
2767                         socket_lock(so, 0);
2768                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2769                         error = ERESTART;
2770                         goto done;
2771                 }
2772                 socket_lock(so, 0);
2773                 /*
2774                  * If the socket has been defunct'd, drop it.
2775                  */
2776                 if (so->so_flags & SOF_DEFUNCT) {
2777                         m_freem(m);
2778                         error = ENOTCONN;
2779                         goto done;
2780                 }
2781                 /*
2782                  * Re-adjust the socket receive list and re-enqueue
2783                  * the record in front of any packets which may have
2784                  * been appended while we dropped the lock.
2785                  */
                      /* Re-account every mbuf of the record; m ends on the last one */
2786                 for (m = m0; m->m_next != NULL; m = m->m_next)
2787                         sballoc(&so->so_rcv, m);
2788                 sballoc(&so->so_rcv, m);
                      /* Buffer is otherwise empty: this record is also the last one */
2789                 if (so->so_rcv.sb_mb == NULL) {
2790                         so->so_rcv.sb_lastrecord = m0;
2791                         so->so_rcv.sb_mbtail = m;
2792                 }
2793                 m = m0;
                      /* Whatever is now at the head becomes the next record */
2794                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2795                 so->so_rcv.sb_mb = m;
2796                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2797                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2798         }
2799 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2800         if (psa != NULL) {
2801                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
                      /* A failed copy is an error only when the caller needs it */
2802                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2803                         error = EWOULDBLOCK;
2804                         goto done;
2805                 }
2806         }
2807         if (flags & MSG_PEEK) {
                      /* Peeking: step past the address, leave the buffer intact */
2808                 m = m->m_next;
2809         } else {
2810                 sbfree(&so->so_rcv, m);
2811                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2812                         panic("%s: about to create invalid socketbuf",
2813                             __func__);
2814                         /* NOTREACHED */
2815                 }
                      /*
                       * Release the address mbuf.  Presumably MFREE() stores
                       * its successor in sb_mb -- confirm against the macro.
                       */
2816                 MFREE(m, so->so_rcv.sb_mb);
2817                 m = so->so_rcv.sb_mb;
2818                 if (m != NULL) {
                              /* Rest of the record is the new head of the buffer */
2819                         m->m_nextpkt = nextrecord;
2820                 } else {
                              /* Record held only the address: move to next record */
2821                         so->so_rcv.sb_mb = nextrecord;
2822                         SB_EMPTY_FIXUP(&so->so_rcv);
2823                 }
2824         }
2825 done:
              /* Report the cursor and next record back, on error paths too */
2826         *mp = m;
2827         *nextrecordp = nextrecord;
2828
2829         return (error);
2830 }
2831
2832 /*
2833  * Process one or more MT_CONTROL mbufs present before any data mbufs
2834  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2835  * just copy the data; if !MSG_PEEK, we call into the protocol to
2836  * perform externalization.
      *
      * Parameters:
      *      so              socket being received on
      *      controlp        where to chain the control mbufs returned to the
      *                      caller, or NULL; when NULL and !MSG_PEEK the
      *                      control mbufs are freed
      *      flags           only MSG_PEEK is examined
      *      mp              in: first MT_CONTROL mbuf; out: first mbuf after
      *                      the control mbufs (may be NULL)
      *      nextrecordp     in/out: record following the current one
      *
      * Returns:     0               Success
      *              ENOBUFS         m_copy() failed while peeking
      *      <pr_domain->dom_externalize>:???
      *
      * The error variable is assigned again for every control mbuf handled
      * in the !MSG_PEEK case, so the value returned is the one produced for
      * the last control mbuf.
2837  */
2838 static int
2839 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2840     struct mbuf **mp, struct mbuf **nextrecordp)
2841 {
2842         int error = 0;
              /* cm: private chain of unlinked control mbufs; cme: its tail link */
2843         struct mbuf *cm = NULL, *cmn;
2844         struct mbuf **cme = &cm;
2845         struct sockbuf *sb_rcv = &so->so_rcv;
2846         struct mbuf **msgpcm = NULL;
2847         struct mbuf *m = *mp;
2848         struct mbuf *nextrecord = *nextrecordp;
2849         struct protosw *pr = so->so_proto;
2850
2851         /*
2852          * Externalizing the control messages would require us to
2853          * drop the socket's lock below.  Once we re-acquire the
2854          * lock, the mbuf chain might change.  In order to preserve
2855          * consistency, we unlink all control messages from the
2856          * first mbuf chain in one shot and link them separately
2857          * onto a different chain.
2858          */
2859         do {
2860                 if (flags & MSG_PEEK) {
2861                         if (controlp != NULL) {
                                      /*
                                       * NOTE(review): controlp is advanced
                                       * below to the m_next link of the mbuf
                                       * just copied, so this test looks true
                                       * on every pass and msgpcm would follow
                                       * the tail link rather than stay on the
                                       * head of the copies -- confirm.  It
                                       * also stays NULL if the caller passes
                                       * a non-NULL *controlp.
                                       */
2862                                 if (*controlp == NULL) {
2863                                         msgpcm = controlp;
2864                                 }
2865                                 *controlp = m_copy(m, 0, m->m_len);
2866
2867                                 /*
2868                                  * If we failed to allocate an mbuf,
2869                                  * release any previously allocated
2870                                  * mbufs for control data. Return
2871                                  * an error. Keep the mbufs in the
2872                                  * socket as this is using
2873                                  * MSG_PEEK flag.
2874                                  */
2875                                 if (*controlp == NULL) {
2876                                         m_freem(*msgpcm);
2877                                         error = ENOBUFS;
2878                                         goto done;
2879                                 }
2880                                 controlp = &(*controlp)->m_next;
2881                         }
2882                         m = m->m_next;
2883                 } else {
                              /*
                               * Unlink m from the head of the socket buffer
                               * and append it to the private cm chain.
                               */
2884                         m->m_nextpkt = NULL;
2885                         sbfree(sb_rcv, m);
2886                         sb_rcv->sb_mb = m->m_next;
2887                         m->m_next = NULL;
2888                         *cme = m;
2889                         cme = &(*cme)->m_next;
2890                         m = sb_rcv->sb_mb;
2891                 }
2892         } while (m != NULL && m->m_type == MT_CONTROL);
2893
2894         if (!(flags & MSG_PEEK)) {
                      /*
                       * Reconnect what is left of the record (or, if nothing
                       * is left, the next record) as the head of the buffer.
                       */
2895                 if (sb_rcv->sb_mb != NULL) {
2896                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
2897                 } else {
2898                         sb_rcv->sb_mb = nextrecord;
2899                         SB_EMPTY_FIXUP(sb_rcv);
2900                 }
2901                 if (nextrecord == NULL)
2902                         sb_rcv->sb_lastrecord = m;
2903         }
2904
2905         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2906         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2907
              /* cm is only populated in the !MSG_PEEK branch above */
2908         while (cm != NULL) {
2909                 int cmsg_type;
2910
                      /* Detach cm so it can be handed on as a single mbuf */
2911                 cmn = cm->m_next;
2912                 cm->m_next = NULL;
2913                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2914
2915                 /*
2916                  * Call the protocol to externalize SCM_RIGHTS message
2917                  * and return the modified message to the caller upon
2918                  * success.  Otherwise, all other control messages are
2919                  * returned unmodified to the caller.  Note that we
2920                  * only get into this loop if MSG_PEEK is not set.
2921                  */
2922                 if (pr->pr_domain->dom_externalize != NULL &&
2923                     cmsg_type == SCM_RIGHTS) {
2924                         /*
2925                          * Release socket lock: see 3903171.  This
2926                          * would also allow more records to be appended
2927                          * to the socket buffer.  We still have SB_LOCK
2928                          * set on it, so we can be sure that the head
2929                          * of the mbuf chain won't change.
2930                          */
2931                         socket_unlock(so, 0);
2932                         error = (*pr->pr_domain->dom_externalize)(cm);
2933                         socket_lock(so, 0);
2934                 } else {
2935                         error = 0;
2936                 }
2937
                      /*
                       * Hand the mbuf to the caller when it asked for control
                       * data and externalization did not fail; free it
                       * otherwise.
                       */
2938                 if (controlp != NULL && error == 0) {
2939                         *controlp = cm;
2940                         controlp = &(*controlp)->m_next;
2941                 } else {
2942                         (void) m_free(cm);
2943                 }
2944                 cm = cmn;
2945         }
2946         /*
2947          * Update the value of nextrecord in case we received new
2948          * records when the socket was unlocked above for
2949          * externalizing SCM_RIGHTS.
2950          */
2951         if (m != NULL)
2952                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2953         else
2954                 nextrecord = sb_rcv->sb_mb;
2955
2956 done:
2957         *mp = m;
2958         *nextrecordp = nextrecord;
2959
2960         return (error);
2961 }
2962
2963 /*
2964  * Implement receive operations on a socket.
2965  * We depend on the way that records are added to the sockbuf
2966  * by sbappend*.  In particular, each record (mbufs linked through m_next)
2967  * must begin with an address if the protocol so specifies,
2968  * followed by an optional mbuf or mbufs containing ancillary data,
2969  * and then zero or more mbufs of data.
2970  * In order to avoid blocking network interrupts for the entire time here,
2971  * we splx() while doing the actual copy to user space.
2972  * Although the sockbuf is locked, new data may still be appended,
2973  * and thus we must maintain consistency of the sockbuf during that time.
2974  *
2975  * The caller may receive the data as a single mbuf chain by supplying
2976  * an mbuf **mp0 for use in returning the chain.  The uio is then used
2977  * only for the count in uio_resid.
2978  *
2979  * Returns:     0                       Success
2980  *              ENOBUFS
2981  *              ENOTCONN
2982  *              EWOULDBLOCK
2983  *      uiomove:EFAULT
2984  *      sblock:EWOULDBLOCK
2985  *      sblock:EINTR
2986  *      sbwait:EBADF
2987  *      sbwait:EINTR
2988  *      sodelayed_copy:EFAULT
2989  *      <pru_rcvoob>:EINVAL[TCP]
2990  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
2991  *      <pru_rcvoob>:???
2992  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2993  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2994  *      <pr_domain->dom_externalize>:???
2995  *
2996  * Notes:       Additional return values from calls through <pru_rcvoob> and
2997  *              <pr_domain->dom_externalize> depend on protocols other than
2998  *              TCP or AF_UNIX, which are documented above.
2999  */
3000 int
3001 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3002     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3003 {
3004         struct mbuf *m, **mp, *ml = NULL;
3005         struct mbuf *nextrecord, *free_list;
3006         int flags, error, offset;
3007         user_ssize_t len;
3008         struct protosw *pr = so->so_proto;
3009         int moff, type = 0;
3010         user_ssize_t orig_resid = uio_resid(uio);
3011         user_ssize_t delayed_copy_len;
3012         int can_delay;
3013         int need_event;
3014         struct proc *p = current_proc();
3015         boolean_t en_tracing = FALSE;
3016
3017         /*
3018          * Sanity check on the length passed by caller as we are making 'int'
3019          * comparisons
3020          */
3021         if (orig_resid < 0 || orig_resid > INT_MAX)
3022                 return (EINVAL);
3023
3024         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3025             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3026             so->so_rcv.sb_hiwat);
3027
3028         socket_lock(so, 1);
3029         so_update_last_owner_locked(so, p);
3030         so_update_policy(so);
3031
3032 #ifdef MORE_LOCKING_DEBUG
3033         if (so->so_usecount == 1) {
3034                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3035                 /* NOTREACHED */
3036         }
3037 #endif
3038         mp = mp0;
3039         if (psa != NULL)
3040                 *psa = NULL;
3041         if (controlp != NULL)
3042                 *controlp = NULL;
3043         if (flagsp != NULL)
3044                 flags = *flagsp &~ MSG_EOR;
3045         else
3046                 flags = 0;
3047
3048         /*
3049          * If a recv attempt is made on a previously-accepted socket
3050          * that has been marked as inactive (disconnected), reject
3051          * the request.
3052          */
3053         if (so->so_flags & SOF_DEFUNCT) {
3054                 struct sockbuf *sb = &so->so_rcv;
3055
3056                 error = ENOTCONN;
3057                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3058                     __func__, proc_pid(p), proc_best_name(p),
3059                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3060                     SOCK_DOM(so), SOCK_TYPE(so), error);
3061                 /*
3062                  * This socket should have been disconnected and flushed
3063                  * prior to being returned from sodefunct(); there should
3064                  * be no data on its receive list, so panic otherwise.
3065                  */
3066                 if (so->so_state & SS_DEFUNCT)
3067                         sb_empty_assert(sb, __func__);
3068                 socket_unlock(so, 1);
3069                 return (error);
3070         }
3071
3072         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3073             pr->pr_usrreqs->pru_preconnect) {
3074                 /*
3075                  * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3076                  * call write() right after this. *If* the app calls a read
3077                  * we do not want to block this read indefinitely. Thus,
3078                  * we trigger a connect so that the session gets initiated.
3079                  */
3080                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3081
3082                 if (error) {
3083                         socket_unlock(so, 1);
3084                         return (error);
3085                 }
3086         }
3087
3088         if (ENTR_SHOULDTRACE &&
3089             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3090                 /*
3091                  * enable energy tracing for inet sockets that go over
3092                  * non-loopback interfaces only.
3093                  */
3094                 struct inpcb *inp = sotoinpcb(so);
3095                 if (inp->inp_last_outifp != NULL &&
3096                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3097                         en_tracing = TRUE;
3098                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3099                             VM_KERNEL_ADDRPERM(so),
3100                             ((so->so_state & SS_NBIO) ?
3101                             kEnTrFlagNonBlocking : 0),
3102                             (int64_t)orig_resid);
3103                 }
3104         }
3105
3106         /*
3107          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3108          * regardless of the flags argument. Here is the case where
3109          * out-of-band data is not inline.
3110          */
3111         if ((flags & MSG_OOB) ||
3112             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3113             (so->so_options & SO_OOBINLINE) == 0 &&
3114             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3115                 m = m_get(M_WAIT, MT_DATA);
3116                 if (m == NULL) {
3117                         socket_unlock(so, 1);
3118                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3119                             ENOBUFS, 0, 0, 0, 0);
3120                         return (ENOBUFS);
3121                 }
3122                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3123                 if (error)
3124                         goto bad;
3125                 socket_unlock(so, 0);
3126                 do {
3127                         error = uiomove(mtod(m, caddr_t),
3128                             imin(uio_resid(uio), m->m_len), uio);
3129                         m = m_free(m);
3130                 } while (uio_resid(uio) && error == 0 && m != NULL);
3131                 socket_lock(so, 0);
3132 bad:
3133                 if (m != NULL)
3134                         m_freem(m);
3135
3136                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3137                         if (error == EWOULDBLOCK || error == EINVAL) {
3138                                 /*
3139                                  * Let's try to get normal data:
3140                                  * EWOULDBLOCK: out-of-band data not
3141                                  * received yet. EINVAL: out-of-band data
3142                                  * already read.
3143                                  */
3144                                 error = 0;
3145                                 goto nooob;
3146                         } else if (error == 0 && flagsp != NULL) {
3147                                 *flagsp |= MSG_OOB;
3148                         }
3149                 }
3150                 socket_unlock(so, 1);
3151                 if (en_tracing) {
3152                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3153                             VM_KERNEL_ADDRPERM(so), 0,
3154                             (int64_t)(orig_resid - uio_resid(uio)));
3155                 }
3156                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3157                     0, 0, 0, 0);
3158
3159                 return (error);
3160         }
3161 nooob:
3162         if (mp != NULL)
3163                 *mp = NULL;
3164
3165         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3166                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3167         }
3168
3169         free_list = NULL;
3170         delayed_copy_len = 0;
3171 restart:
3172 #ifdef MORE_LOCKING_DEBUG
3173         if (so->so_usecount <= 1)
3174                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3175                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3176 #endif
3177         /*
3178          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3179          * and if so just return to the caller.  This could happen when
3180          * soreceive() is called by a socket upcall function during the
3181          * time the socket is freed.  The socket buffer would have been
3182          * locked across the upcall, therefore we cannot put this thread
3183          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3184          * we may livelock), because the lock on the socket buffer will
3185          * only be released when the upcall routine returns to its caller.
3186          * Because the socket has been officially closed, there can be
3187          * no further read on it.
3188          *
3189          * A multipath subflow socket would have its SS_NOFDREF set by
3190          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3191          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3192          */
3193         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3194             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3195                 socket_unlock(so, 1);
3196                 return (0);
3197         }
3198
3199         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3200         if (error) {
3201                 socket_unlock(so, 1);
3202                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3203                     0, 0, 0, 0);
3204                 if (en_tracing) {
3205                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3206                             VM_KERNEL_ADDRPERM(so), 0,
3207                             (int64_t)(orig_resid - uio_resid(uio)));
3208                 }
3209                 return (error);
3210         }
3211
3212         m = so->so_rcv.sb_mb;
3213         /*
3214          * If we have less data than requested, block awaiting more
3215          * (subject to any timeout) if:
3216          *   1. the current count is less than the low water mark, or
3217          *   2. MSG_WAITALL is set, and it is possible to do the entire
3218          *      receive operation at once if we block (resid <= hiwat).
3219          *   3. MSG_DONTWAIT is not set
3220          * If MSG_WAITALL is set but resid is larger than the receive buffer,
3221          * we have to do the receive in sections, and thus risk returning
3222          * a short count if a timeout or signal occurs after we start.
3223          */
3224         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3225             so->so_rcv.sb_cc < uio_resid(uio)) &&
3226             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3227             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3228             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3229                 /*
3230                  * Panic if we notice inconsistencies in the socket's
3231                  * receive list; both sb_mb and sb_cc should correctly
3232                  * reflect the contents of the list, otherwise we may
3233                  * end up with false positives during select() or poll()
3234                  * which could put the application in a bad state.
3235                  */
3236                 SB_MB_CHECK(&so->so_rcv);
3237
3238                 if (so->so_error) {
3239                         if (m != NULL)
3240                                 goto dontblock;
3241                         error = so->so_error;
3242                         if ((flags & MSG_PEEK) == 0)
3243                                 so->so_error = 0;
3244                         goto release;
3245                 }
3246                 if (so->so_state & SS_CANTRCVMORE) {
3247 #if CONTENT_FILTER
3248                         /*
3249                          * Deal with half closed connections
3250                          */
3251                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3252                                 cfil_sock_data_pending(&so->so_rcv) != 0)
3253                                 CFIL_LOG(LOG_INFO,
3254                                         "so %llx ignore SS_CANTRCVMORE",
3255                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3256                         else
3257 #endif /* CONTENT_FILTER */
3258                         if (m != NULL)
3259                                 goto dontblock;
3260                         else
3261                                 goto release;
3262                 }
3263                 for (; m != NULL; m = m->m_next)
3264                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3265                                 m = so->so_rcv.sb_mb;
3266                                 goto dontblock;
3267                         }
3268                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3269                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3270                         error = ENOTCONN;
3271                         goto release;
3272                 }
3273                 if (uio_resid(uio) == 0)
3274                         goto release;
3275
3276                 if ((so->so_state & SS_NBIO) ||
3277                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3278                         error = EWOULDBLOCK;
3279                         goto release;
3280                 }
3281                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3282                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3283                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3284 #if EVEN_MORE_LOCKING_DEBUG
3285                 if (socket_debug)
3286                         printf("Waiting for socket data\n");
3287 #endif
3288
3289                 error = sbwait(&so->so_rcv);
3290 #if EVEN_MORE_LOCKING_DEBUG
3291                 if (socket_debug)
3292                         printf("SORECEIVE - sbwait returned %d\n", error);
3293 #endif
3294                 if (so->so_usecount < 1) {
3295                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3296                             __func__, so, so->so_usecount);
3297                         /* NOTREACHED */
3298                 }
3299                 if (error) {
3300                         socket_unlock(so, 1);
3301                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3302                             0, 0, 0, 0);
3303                         if (en_tracing) {
3304                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3305                                     VM_KERNEL_ADDRPERM(so), 0,
3306                                     (int64_t)(orig_resid - uio_resid(uio)));
3307                         }
3308                         return (error);
3309                 }
3310                 goto restart;
3311         }
3312 dontblock:
3313         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3314         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3315         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3316         nextrecord = m->m_nextpkt;
3317
3318         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3319                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3320                     mp0 == NULL);
3321                 if (error == ERESTART)
3322                         goto restart;
3323                 else if (error != 0)
3324                         goto release;
3325                 orig_resid = 0;
3326         }
3327
3328         /*
3329          * Process one or more MT_CONTROL mbufs present before any data mbufs
3330          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3331          * just copy the data; if !MSG_PEEK, we call into the protocol to
3332          * perform externalization.
3333          */
3334         if (m != NULL && m->m_type == MT_CONTROL) {
3335                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3336                 if (error != 0)
3337                         goto release;
3338                 orig_resid = 0;
3339         }
3340
3341         /*
3342          * If the socket is a TCP socket with message delivery
3343          * enabled, then create a control msg to deliver the
3344          * relative TCP sequence number for this data. Waiting
3345          * until this point will protect against failures to
3346          * allocate an mbuf for control msgs.
3347          */
3348         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3349             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3350                 struct mbuf *seq_cm;
3351
3352                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3353                     sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3354                 if (seq_cm == NULL) {
3355                         /* unable to allocate a control mbuf */
3356                         error = ENOBUFS;
3357                         goto release;
3358                 }
3359                 *controlp = seq_cm;
3360                 controlp = &seq_cm->m_next;
3361         }
3362
3363         if (m != NULL) {
3364                 if (!(flags & MSG_PEEK)) {
3365                         /*
3366                          * We get here because m points to an mbuf following
3367                          * any MT_SONAME or MT_CONTROL mbufs which have been
3368                          * processed above.  In any case, m should be pointing
3369                          * to the head of the mbuf chain, and the nextrecord
3370                          * should be either NULL or equal to m->m_nextpkt.
3371                          * See comments above about SB_LOCK.
3372                          */
3373                         if (m != so->so_rcv.sb_mb ||
3374                             m->m_nextpkt != nextrecord) {
3375                                 panic("%s: post-control !sync so=%p m=%p "
3376                                     "nextrecord=%p\n", __func__, so, m,
3377                                     nextrecord);
3378                                 /* NOTREACHED */
3379                         }
3380                         if (nextrecord == NULL)
3381                                 so->so_rcv.sb_lastrecord = m;
3382                 }
3383                 type = m->m_type;
3384                 if (type == MT_OOBDATA)
3385                         flags |= MSG_OOB;
3386         } else {
3387                 if (!(flags & MSG_PEEK)) {
3388                         SB_EMPTY_FIXUP(&so->so_rcv);
3389                 }
3390         }
3391         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3392         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3393
3394         moff = 0;
3395         offset = 0;
3396
3397         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3398                 can_delay = 1;
3399         else
3400                 can_delay = 0;
3401
3402         need_event = 0;
3403
3404         while (m != NULL &&
3405             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3406                 if (m->m_type == MT_OOBDATA) {
3407                         if (type != MT_OOBDATA)
3408                                 break;
3409                 } else if (type == MT_OOBDATA) {
3410                         break;
3411                 }
3412                 /*
3413                  * Make sure to always set MSG_OOB event when getting
3414                  * out of band data inline.
3415                  */
3416                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3417                     (so->so_options & SO_OOBINLINE) != 0 &&
3418                     (so->so_state & SS_RCVATMARK) != 0) {
3419                         flags |= MSG_OOB;
3420                 }
3421                 so->so_state &= ~SS_RCVATMARK;
3422                 len = uio_resid(uio) - delayed_copy_len;
3423                 if (so->so_oobmark && len > so->so_oobmark - offset)
3424                         len = so->so_oobmark - offset;
3425                 if (len > m->m_len - moff)
3426                         len = m->m_len - moff;
3427                 /*
3428                  * If mp is set, just pass back the mbufs.
3429                  * Otherwise copy them out via the uio, then free.
3430                  * Sockbuf must be consistent here (points to current mbuf,
3431                  * it points to next record) when we drop priority;
3432                  * we must note any additions to the sockbuf when we
3433                  * block interrupts again.
3434                  */
3435                 if (mp == NULL) {
3436                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3437                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3438                         if (can_delay && len == m->m_len) {
3439                                 /*
3440                                  * only delay the copy if we're consuming the
3441                                  * mbuf and we're NOT in MSG_PEEK mode
3442                                  * and we have enough data to make it worthwhile
3443                                  * to drop and retake the lock... can_delay
3444                                  * reflects the state of the 2 latter
3445                                  * constraints; moff should always be zero
3446                                  * in these cases
3447                                  */
3448                                 delayed_copy_len += len;
3449                         } else {
3450                                 if (delayed_copy_len) {
3451                                         error = sodelayed_copy(so, uio,
3452                                             &free_list, &delayed_copy_len);
3453
3454                                         if (error) {
3455                                                 goto release;
3456                                         }
3457                                         /*
3458                                          * can only get here if MSG_PEEK is not
3459                                          * set therefore, m should point at the
3460                                          * head of the rcv queue; if it doesn't,
3461                                          * it means something drastically
3462                                          * changed while we were out from behind
3463                                          * the lock in sodelayed_copy. perhaps
3464                                          * a RST on the stream. in any event,
3465                                          * the stream has been interrupted. it's
3466                                          * probably best just to return whatever
3467                                          * data we've moved and let the caller
3468                                          * sort it out...
3469                                          */
3470                                         if (m != so->so_rcv.sb_mb) {
3471                                                 break;
3472                                         }
3473                                 }
3474                                 socket_unlock(so, 0);
3475                                 error = uiomove(mtod(m, caddr_t) + moff,
3476                                     (int)len, uio);
3477                                 socket_lock(so, 0);
3478
3479                                 if (error)
3480                                         goto release;
3481                         }
3482                 } else {
3483                         uio_setresid(uio, (uio_resid(uio) - len));
3484                 }
3485                 if (len == m->m_len - moff) {
3486                         if (m->m_flags & M_EOR)
3487                                 flags |= MSG_EOR;
3488                         if (flags & MSG_PEEK) {
3489                                 m = m->m_next;
3490                                 moff = 0;
3491                         } else {
3492                                 nextrecord = m->m_nextpkt;
3493                                 sbfree(&so->so_rcv, m);
3494                                 m->m_nextpkt = NULL;
3495
3496                                 /*
3497                                  * If this packet is an unordered packet
3498                                  * (indicated by M_UNORDERED_DATA flag), remove
3499                                  * the additional bytes added to the
3500                                  * receive socket buffer size.
3501                                  */
3502                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3503                                     m->m_len &&
3504                                     (m->m_flags & M_UNORDERED_DATA) &&
3505                                     sbreserve(&so->so_rcv,
3506                                     so->so_rcv.sb_hiwat - m->m_len)) {
3507                                         if (so->so_msg_state->msg_uno_bytes >
3508                                             m->m_len) {
3509                                                 so->so_msg_state->
3510                                                     msg_uno_bytes -= m->m_len;
3511                                         } else {
3512                                                 so->so_msg_state->
3513                                                     msg_uno_bytes = 0;
3514                                         }
3515                                         m->m_flags &= ~M_UNORDERED_DATA;
3516                                 }
3517
3518                                 if (mp != NULL) {
3519                                         *mp = m;
3520                                         mp = &m->m_next;
3521                                         so->so_rcv.sb_mb = m = m->m_next;
3522                                         *mp = NULL;
3523                                 } else {
3524                                         if (free_list == NULL)
3525                                                 free_list = m;
3526                                         else
3527                                                 ml->m_next = m;
3528                                         ml = m;
3529                                         so->so_rcv.sb_mb = m = m->m_next;
3530                                         ml->m_next = NULL;
3531                                 }
3532                                 if (m != NULL) {
3533                                         m->m_nextpkt = nextrecord;
3534                                         if (nextrecord == NULL)
3535                                                 so->so_rcv.sb_lastrecord = m;
3536                                 } else {
3537                                         so->so_rcv.sb_mb = nextrecord;
3538                                         SB_EMPTY_FIXUP(&so->so_rcv);
3539                                 }
3540                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3541                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3542                         }
3543                 } else {
3544                         if (flags & MSG_PEEK) {
3545                                 moff += len;
3546                         } else {
3547                                 if (mp != NULL) {
3548                                         int copy_flag;
3549
3550                                         if (flags & MSG_DONTWAIT)
3551                                                 copy_flag = M_DONTWAIT;
3552                                         else
3553                                                 copy_flag = M_WAIT;
3554                                         *mp = m_copym(m, 0, len, copy_flag);
3555                                         /*
3556                                          * Failed to allocate an mbuf?
3557                                          * Adjust uio_resid back, it was
3558                                          * adjusted down by len bytes which
3559                                          * we didn't copy over.
3560                                          */
3561                                         if (*mp == NULL) {
3562                                                 uio_setresid(uio,
3563                                                     (uio_resid(uio) + len));
3564                                                 break;
3565                                         }
3566                                 }
3567                                 m->m_data += len;
3568                                 m->m_len -= len;
3569                                 so->so_rcv.sb_cc -= len;
3570                         }
3571                 }
3572                 if (so->so_oobmark) {
3573                         if ((flags & MSG_PEEK) == 0) {
3574                                 so->so_oobmark -= len;
3575                                 if (so->so_oobmark == 0) {
3576                                         so->so_state |= SS_RCVATMARK;
3577                                         /*
3578                                          * delay posting the actual event until
3579                                          * after any delayed copy processing
3580                                          * has finished
3581                                          */
3582                                         need_event = 1;
3583                                         break;
3584                                 }
3585                         } else {
3586                                 offset += len;
3587                                 if (offset == so->so_oobmark)
3588                                         break;
3589                         }
3590                 }
3591                 if (flags & MSG_EOR)
3592                         break;
3593                 /*
3594                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3595                  * (for non-atomic socket), we must not quit until
3596                  * "uio->uio_resid == 0" or an error termination.
3597                  * If a signal/timeout occurs, return with a short
3598                  * count but without error.  Keep sockbuf locked
3599                  * against other readers.
3600                  */
3601                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3602                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3603                     !sosendallatonce(so) && !nextrecord) {
3604                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3605 #if CONTENT_FILTER
3606                             && cfil_sock_data_pending(&so->so_rcv) == 0
3607 #endif /* CONTENT_FILTER */
3608                             ))
3609                                 goto release;
3610
3611                         /*
3612                          * Depending on the protocol (e.g. TCP), the following
3613                          * might cause the socket lock to be dropped and later
3614                          * be reacquired, and more data could have arrived and
3615                          * have been appended to the receive socket buffer by
3616                          * the time it returns.  Therefore, we only sleep in
3617                          * sbwait() below if and only if the socket buffer is
3618                          * empty, in order to avoid a false sleep.
3619                          */
3620                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3621                             (((struct inpcb *)so->so_pcb)->inp_state !=
3622                             INPCB_STATE_DEAD))
3623                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3624
3625                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3626                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3627
3628                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3629                                 error = 0;
3630                                 goto release;
3631                         }
3632                         /*
3633                          * have to wait until after we get back from the sbwait
3634                          * to do the copy because we will drop the lock if we
3635                          * have enough data that has been delayed... by dropping
3636                          * the lock we open up a window allowing the netisr
3637                          * thread to process the incoming packets and to change
3638                          * the state of this socket... we're issuing the sbwait
3639                          * because the socket is empty and we're expecting the
3640                          * netisr thread to wake us up when more packets arrive;
3641                          * if we allow that processing to happen and then sbwait
3642                          * we could stall forever with packets sitting in the
3643                          * socket if no further packets arrive from the remote
3644                          * side.
3645                          *
3646                          * we want to copy before we've collected all the data
3647                          * to satisfy this request to allow the copy to overlap
3648                          * the incoming packet processing on an MP system
3649                          */
3650                         if (delayed_copy_len > sorecvmincopy &&
3651                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3652                                 error = sodelayed_copy(so, uio,
3653                                     &free_list, &delayed_copy_len);
3654
3655                                 if (error)
3656                                         goto release;
3657                         }
3658                         m = so->so_rcv.sb_mb;
3659                         if (m != NULL) {
3660                                 nextrecord = m->m_nextpkt;
3661                         }
3662                         SB_MB_CHECK(&so->so_rcv);
3663                 }
3664         }
3665 #ifdef MORE_LOCKING_DEBUG
3666         if (so->so_usecount <= 1) {
3667                 panic("%s: after big while so=%p ref=%d on socket\n",
3668                     __func__, so, so->so_usecount);
3669                 /* NOTREACHED */
3670         }
3671 #endif
3672
3673         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3674                 if (so->so_options & SO_DONTTRUNC) {
3675                         flags |= MSG_RCVMORE;
3676                 } else {
3677                         flags |= MSG_TRUNC;
3678                         if ((flags & MSG_PEEK) == 0)
3679                                 (void) sbdroprecord(&so->so_rcv);
3680                 }
3681         }
3682
3683         /*
3684          * pru_rcvd below (for TCP) may cause more data to be received
3685          * if the socket lock is dropped prior to sending the ACK; some
3686          * legacy OpenTransport applications don't handle this well
3687          * (if it receives less data than requested while MSG_HAVEMORE
3688          * is set), and so we set the flag now based on what we know
3689          * prior to calling pru_rcvd.
3690          */
3691         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3692                 flags |= MSG_HAVEMORE;
3693
3694         if ((flags & MSG_PEEK) == 0) {
3695                 if (m == NULL) {
3696                         so->so_rcv.sb_mb = nextrecord;
3697                         /*
3698                          * First part is an inline SB_EMPTY_FIXUP().  Second
3699                          * part makes sure sb_lastrecord is up-to-date if
3700                          * there is still data in the socket buffer.
3701                          */
3702                         if (so->so_rcv.sb_mb == NULL) {
3703                                 so->so_rcv.sb_mbtail = NULL;
3704                                 so->so_rcv.sb_lastrecord = NULL;
3705                         } else if (nextrecord->m_nextpkt == NULL) {
3706                                 so->so_rcv.sb_lastrecord = nextrecord;
3707                         }
3708                         SB_MB_CHECK(&so->so_rcv);
3709                 }
3710                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3711                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3712                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3713                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3714         }
3715
3716         if (delayed_copy_len) {
3717                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3718                 if (error)
3719                         goto release;
3720         }
3721         if (free_list != NULL) {
3722                 m_freem_list(free_list);
3723                 free_list = NULL;
3724         }
3725         if (need_event)
3726                 postevent(so, 0, EV_OOB);
3727
3728         if (orig_resid == uio_resid(uio) && orig_resid &&
3729             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3730                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3731                 goto restart;
3732         }
3733
3734         if (flagsp != NULL)
3735                 *flagsp |= flags;
3736 release:
3737 #ifdef MORE_LOCKING_DEBUG
3738         if (so->so_usecount <= 1) {
3739                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3740                     so, so->so_usecount);
3741                 /* NOTREACHED */
3742         }
3743 #endif
3744         if (delayed_copy_len)
3745                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3746
3747         if (free_list != NULL)
3748                 m_freem_list(free_list);
3749
3750         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
3751
3752         if (en_tracing) {
3753                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3754                     VM_KERNEL_ADDRPERM(so),
3755                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3756                     (int64_t)(orig_resid - uio_resid(uio)));
3757         }
3758         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3759             so->so_rcv.sb_cc, 0, error);
3760
3761         return (error);
3762 }
3763
3764 /*
3765  * Returns:     0                       Success
3766  *      uiomove:EFAULT
3767  */
3768 static int
3769 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3770     user_ssize_t *resid)
3771 {
3772         int error = 0;
3773         struct mbuf *m;
3774
3775         m = *free_list;
3776
3777         socket_unlock(so, 0);
3778
3779         while (m != NULL && error == 0) {
3780                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3781                 m = m->m_next;
3782         }
3783         m_freem_list(*free_list);
3784
3785         *free_list = NULL;
3786         *resid = 0;
3787
3788         socket_lock(so, 0);
3789
3790         return (error);
3791 }
3792
3793 static int
3794 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3795     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3796 {
3797 #pragma unused(so)
3798         int error = 0;
3799         struct mbuf *ml, *m;
3800         int i = 0;
3801         struct uio *auio;
3802
3803         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3804             ml = ml->m_nextpkt, i++) {
3805                 auio = msgarray[i].uio;
3806                 for (m = ml; m != NULL; m = m->m_next) {
3807                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3808                         if (error != 0)
3809                                 goto out;
3810                 }
3811         }
3812 out:
3813         m_freem_list(*free_list);
3814
3815         *free_list = NULL;
3816         *resid = 0;
3817
3818         return (error);
3819 }
3820
3821 int
3822 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3823     int *flagsp)
3824 {
3825         struct mbuf *m;
3826         struct mbuf *nextrecord;
3827         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3828         int error;
3829         user_ssize_t len, pktlen, delayed_copy_len = 0;
3830         struct protosw *pr = so->so_proto;
3831         user_ssize_t resid;
3832         struct proc *p = current_proc();
3833         struct uio *auio = NULL;
3834         int npkts = 0;
3835         int sblocked = 0;
3836         struct sockaddr **psa = NULL;
3837         struct mbuf **controlp = NULL;
3838         int can_delay;
3839         int flags;
3840         struct mbuf *free_others = NULL;
3841
3842         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3843             so, uiocnt,
3844             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3845
3846         /*
3847          * Sanity checks:
3848          * - Only supports don't wait flags
3849          * - Only support datagram sockets (could be extended to raw)
3850          * - Must be atomic
3851          * - Protocol must support packet chains
3852          * - The uio array is NULL (should we panic?)
3853          */
3854         if (flagsp != NULL)
3855                 flags = *flagsp;
3856         else
3857                 flags = 0;
3858         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3859             MSG_NBIO)) {
3860                 printf("%s invalid flags 0x%x\n", __func__, flags);
3861                 error = EINVAL;
3862                 goto out;
3863         }
3864         if (so->so_type != SOCK_DGRAM) {
3865                 error = EINVAL;
3866                 goto out;
3867         }
3868         if (sosendallatonce(so) == 0) {
3869                 error = EINVAL;
3870                 goto out;
3871         }
3872         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3873                 error = EPROTONOSUPPORT;
3874                 goto out;
3875         }
3876         if (msgarray == NULL) {
3877                 printf("%s uioarray is NULL\n", __func__);
3878                 error = EINVAL;
3879                 goto out;
3880         }
3881         if (uiocnt == 0) {
3882                 printf("%s uiocnt is 0\n", __func__);
3883                 error = EINVAL;
3884                 goto out;
3885         }
3886         /*
3887          * Sanity check on the length passed by caller as we are making 'int'
3888          * comparisons
3889          */
3890         resid = recv_msg_array_resid(msgarray, uiocnt);
3891         if (resid < 0 || resid > INT_MAX) {
3892                 error = EINVAL;
3893                 goto out;
3894         }
3895
3896         if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3897                 can_delay = 1;
3898         else
3899                 can_delay = 0;
3900
3901         socket_lock(so, 1);
3902         so_update_last_owner_locked(so, p);
3903         so_update_policy(so);
3904
3905 #if NECP
3906         so_update_necp_policy(so, NULL, NULL);
3907 #endif /* NECP */
3908
3909         /*
3910          * If a recv attempt is made on a previously-accepted socket
3911          * that has been marked as inactive (disconnected), reject
3912          * the request.
3913          */
3914         if (so->so_flags & SOF_DEFUNCT) {
3915                 struct sockbuf *sb = &so->so_rcv;
3916
3917                 error = ENOTCONN;
3918                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3919                     __func__, proc_pid(p), proc_best_name(p),
3920                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3921                     SOCK_DOM(so), SOCK_TYPE(so), error);
3922                 /*
3923                  * This socket should have been disconnected and flushed
3924                  * prior to being returned from sodefunct(); there should
3925                  * be no data on its receive list, so panic otherwise.
3926                  */
3927                 if (so->so_state & SS_DEFUNCT)
3928                         sb_empty_assert(sb, __func__);
3929                 goto release;
3930         }
3931
3932 next:
3933         /*
3934          * The uio may be empty
3935          */
3936         if (npkts >= uiocnt) {
3937                 error = 0;
3938                 goto release;
3939         }
3940 restart:
3941         /*
3942          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3943          * and if so just return to the caller.  This could happen when
3944          * soreceive() is called by a socket upcall function during the
3945          * time the socket is freed.  The socket buffer would have been
3946          * locked across the upcall, therefore we cannot put this thread
3947          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3948          * we may livelock), because the lock on the socket buffer will
3949          * only be released when the upcall routine returns to its caller.
3950          * Because the socket has been officially closed, there can be
3951          * no further read on it.
3952          */
3953         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3954             (SS_NOFDREF | SS_CANTRCVMORE)) {
3955                 error = 0;
3956                 goto release;
3957         }
3958
3959         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3960         if (error) {
3961                 goto release;
3962         }
3963         sblocked = 1;
3964
3965         m = so->so_rcv.sb_mb;
3966         /*
3967          * Block awaiting more datagram if needed
3968          */
3969         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3970             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3971             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3972                 /*
3973                  * Panic if we notice inconsistencies in the socket's
3974                  * receive list; both sb_mb and sb_cc should correctly
3975                  * reflect the contents of the list, otherwise we may
3976                  * end up with false positives during select() or poll()
3977                  * which could put the application in a bad state.
3978                  */
3979                 SB_MB_CHECK(&so->so_rcv);
3980
3981                 if (so->so_error) {
3982                         error = so->so_error;
3983                         if ((flags & MSG_PEEK) == 0)
3984                                 so->so_error = 0;
3985                         goto release;
3986                 }
3987                 if (so->so_state & SS_CANTRCVMORE) {
3988                         goto release;
3989                 }
3990                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3991                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3992                         error = ENOTCONN;
3993                         goto release;
3994                 }
3995                 if ((so->so_state & SS_NBIO) ||
3996                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3997                         error = EWOULDBLOCK;
3998                         goto release;
3999                 }
4000                 /*
4001                  * Do not block if we got some data
4002                  */
4003                 if (free_list != NULL) {
4004                         error = 0;
4005                         goto release;
4006                 }
4007
4008                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4009                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4010
4011                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4012                 sblocked = 0;
4013
4014                 error = sbwait(&so->so_rcv);
4015                 if (error) {
4016                         goto release;
4017                 }
4018                 goto restart;
4019         }
4020
4021         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4022         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4023         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4024
4025         /*
4026          * Consume the current uio index as we have a datagram
4027          */
4028         auio = msgarray[npkts].uio;
4029         resid = uio_resid(auio);
4030         msgarray[npkts].which |= SOCK_MSG_DATA;
4031         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4032             &msgarray[npkts].psa : NULL;
4033         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4034             &msgarray[npkts].controlp : NULL;
4035         npkts += 1;
4036         nextrecord = m->m_nextpkt;
4037
4038         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4039                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4040                 if (error == ERESTART)
4041                         goto restart;
4042                 else if (error != 0)
4043                         goto release;
4044         }
4045
4046         if (m != NULL && m->m_type == MT_CONTROL) {
4047                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4048                 if (error != 0)
4049                         goto release;
4050         }
4051
4052         if (m->m_pkthdr.len == 0) {
4053                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4054                     __func__, __LINE__,
4055                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4056                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4057                     m->m_type);
4058         }
4059
4060         /*
4061          * Loop to copy the mbufs of the current record
4062          * Support zero length packets
4063          */
4064         ml = NULL;
4065         pktlen = 0;
4066         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4067                 if (m->m_len == 0)
4068                         panic("%p m_len zero", m);
4069                 if (m->m_type == 0)
4070                         panic("%p m_type zero", m);
4071                 /*
4072                  * Clip to the residual length
4073                  */
4074                 if (len > m->m_len)
4075                         len = m->m_len;
4076                 pktlen += len;
4077                 /*
4078                  * Copy the mbufs via the uio or delay the copy
4079                  * Sockbuf must be consistent here (points to current mbuf,
4080                  * it points to next record) when we drop priority;
4081                  * we must note any additions to the sockbuf when we
4082                  * block interrupts again.
4083                  */
4084                 if (len > 0 && can_delay == 0) {
4085                         socket_unlock(so, 0);
4086                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4087                         socket_lock(so, 0);
4088                         if (error)
4089                                 goto release;
4090                 } else {
4091                         delayed_copy_len += len;
4092                 }
4093
4094                 if (len == m->m_len) {
4095                         /*
4096                          * m was entirely copied
4097                          */
4098                         sbfree(&so->so_rcv, m);
4099                         nextrecord = m->m_nextpkt;
4100                         m->m_nextpkt = NULL;
4101
4102                         /*
4103                          * Set the first packet to the head of the free list
4104                          */
4105                         if (free_list == NULL)
4106                                 free_list = m;
4107                         /*
4108                          * Link current packet to tail of free list
4109                          */
4110                         if (ml == NULL) {
4111                                 if (free_tail != NULL)
4112                                         free_tail->m_nextpkt = m;
4113                                 free_tail = m;
4114                         }
4115                         /*
4116                          * Link current mbuf to last mbuf of current packet
4117                          */
4118                         if (ml != NULL)
4119                                 ml->m_next = m;
4120                         ml = m;
4121
4122                         /*
4123                          * Move next buf to head of socket buffer
4124                          */
4125                         so->so_rcv.sb_mb = m = ml->m_next;
4126                         ml->m_next = NULL;
4127
4128                         if (m != NULL) {
4129                                 m->m_nextpkt = nextrecord;
4130                                 if (nextrecord == NULL)
4131                                         so->so_rcv.sb_lastrecord = m;
4132                         } else {
4133                                 so->so_rcv.sb_mb = nextrecord;
4134                                 SB_EMPTY_FIXUP(&so->so_rcv);
4135                         }
4136                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4137                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4138                 } else {
4139                         /*
4140                          * Stop the loop on partial copy
4141                          */
4142                         break;
4143                 }
4144         }
4145 #ifdef MORE_LOCKING_DEBUG
4146         if (so->so_usecount <= 1) {
4147                 panic("%s: after big while so=%llx ref=%d on socket\n",
4148                     __func__,
4149                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4150                 /* NOTREACHED */
4151         }
4152 #endif
4153         /*
4154          * Tell the caller we made a partial copy
4155          */
4156         if (m != NULL) {
4157                 if (so->so_options & SO_DONTTRUNC) {
4158                         /*
4159                          * Copyout first the freelist then the partial mbuf
4160                          */
4161                         socket_unlock(so, 0);
4162                         if (delayed_copy_len)
4163                                 error = sodelayed_copy_list(so, msgarray,
4164                                     uiocnt, &free_list, &delayed_copy_len);
4165
4166                         if (error == 0) {
4167                                 error = uiomove(mtod(m, caddr_t), (int)len,
4168                                     auio);
4169                         }
4170                         socket_lock(so, 0);
4171                         if (error)
4172                                 goto release;
4173
4174                         m->m_data += len;
4175                         m->m_len -= len;
4176                         so->so_rcv.sb_cc -= len;
4177                         flags |= MSG_RCVMORE;
4178                 } else {
4179                         (void) sbdroprecord(&so->so_rcv);
4180                         nextrecord = so->so_rcv.sb_mb;
4181                         m = NULL;
4182                         flags |= MSG_TRUNC;
4183                 }
4184         }
4185
4186         if (m == NULL) {
4187                 so->so_rcv.sb_mb = nextrecord;
4188                 /*
4189                  * First part is an inline SB_EMPTY_FIXUP().  Second
4190                  * part makes sure sb_lastrecord is up-to-date if
4191                  * there is still data in the socket buffer.
4192                  */
4193                 if (so->so_rcv.sb_mb == NULL) {
4194                         so->so_rcv.sb_mbtail = NULL;
4195                         so->so_rcv.sb_lastrecord = NULL;
4196                 } else if (nextrecord->m_nextpkt == NULL) {
4197                         so->so_rcv.sb_lastrecord = nextrecord;
4198                 }
4199                 SB_MB_CHECK(&so->so_rcv);
4200         }
4201         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4202         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4203
4204         /*
4205          * We can continue to the next packet as long as:
4206          * - We haven't exhausted the uio array
4207          * - There was no error
4208          * - A packet was not truncated
4209          * - We can still receive more data
4210          */
4211         if (npkts < uiocnt && error == 0 &&
4212             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4213             (so->so_state & SS_CANTRCVMORE) == 0) {
4214                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4215                 sblocked = 0;
4216
4217                 goto next;
4218         }
4219         if (flagsp != NULL)
4220                 *flagsp |= flags;
4221
4222 release:
4223         /*
4224          * pru_rcvd may cause more data to be received if the socket lock
4225          * is dropped so we set MSG_HAVEMORE now based on what we know.
4226          * That way the caller won't be surprised if it receives less data
4227          * than requested.
4228          */
4229         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4230                 flags |= MSG_HAVEMORE;
4231
4232         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4233                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4234
4235         if (sblocked)
4236                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4237         else
4238                 socket_unlock(so, 1);
4239
4240         if (delayed_copy_len)
4241                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4242                     &free_list, &delayed_copy_len);
4243 out:
4244         /*
4245          * Amortize the cost of freeing the mbufs
4246          */
4247         if (free_list != NULL)
4248                 m_freem_list(free_list);
4249         if (free_others != NULL)
4250                 m_freem_list(free_others);
4251
4252         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4253             0, 0, 0, 0);
4254         return (error);
4255 }
4256
4257 /*
4258  * Returns:     0                       Success
4259  *              EINVAL
4260  *              ENOTCONN
4261  *      <pru_shutdown>:EINVAL
4262  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4263  *      <pru_shutdown>:ENOBUFS[TCP]
4264  *      <pru_shutdown>:EMSGSIZE[TCP]
4265  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4266  *      <pru_shutdown>:ENETUNREACH[TCP]
4267  *      <pru_shutdown>:ENETDOWN[TCP]
4268  *      <pru_shutdown>:ENOMEM[TCP]
4269  *      <pru_shutdown>:EACCES[TCP]
4270  *      <pru_shutdown>:EMSGSIZE[TCP]
4271  *      <pru_shutdown>:ENOBUFS[TCP]
4272  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4273  *      <pru_shutdown>:???              [other protocol families]
4274  */
4275 int
4276 soshutdown(struct socket *so, int how)
4277 {
4278         int error;
4279
4280         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4281
4282         switch (how) {
4283         case SHUT_RD:
4284         case SHUT_WR:
4285         case SHUT_RDWR:
4286                 socket_lock(so, 1);
4287                 if ((so->so_state &
4288                     (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4289                         error = ENOTCONN;
4290                 } else {
4291                         error = soshutdownlock(so, how);
4292                 }
4293                 socket_unlock(so, 1);
4294                 break;
4295         default:
4296                 error = EINVAL;
4297                 break;
4298         }
4299
4300         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4301
4302         return (error);
4303 }
4304
4305 int
4306 soshutdownlock_final(struct socket *so, int how)
4307 {
4308         struct protosw *pr = so->so_proto;
4309         int error = 0;
4310
4311         sflt_notify(so, sock_evt_shutdown, &how);
4312
4313         if (how != SHUT_WR) {
4314                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4315                         /* read already shut down */
4316                         error = ENOTCONN;
4317                         goto done;
4318                 }
4319                 sorflush(so);
4320                 postevent(so, 0, EV_RCLOSED);
4321         }
4322         if (how != SHUT_RD) {
4323                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4324                         /* write already shut down */
4325                         error = ENOTCONN;
4326                         goto done;
4327                 }
4328                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4329                 postevent(so, 0, EV_WCLOSED);
4330         }
4331 done:
4332         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4333         return (error);
4334 }
4335
/*
 * soshutdownlock
 *
 * Shut down a socket, giving an attached content filter the first
 * chance to handle the request before soshutdownlock_final() runs.
 *
 * Returns:     0                       Success, or the content filter
 *                                      took over the request
 *      cfil_sock_shutdown:???
 *      soshutdownlock_final:???
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * A content filter may delay the actual shutdown until it
         * has processed the pending data
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int filter_rc = cfil_sock_shutdown(so, &how);

                /* EJUSTRETURN is not reported to the caller as an error */
                if (filter_rc == EJUSTRETURN)
                        return (0);
                if (filter_rc != 0)
                        return (filter_rc);
        }
#endif /* CONTENT_FILTER */

        return (soshutdownlock_final(so, how));
}
4362
4363 void
4364 sowflush(struct socket *so)
4365 {
4366         struct sockbuf *sb = &so->so_snd;
4367
4368         /*
4369          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4370          * to prevent the socket buffer from being unexpectedly altered
4371          * while it is used by another thread in socket send/receive.
4372          *
4373          * sblock() must not fail here, hence the assertion.
4374          */
4375         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4376         VERIFY(sb->sb_flags & SB_LOCK);
4377
4378         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4379         sb->sb_flags            |= SB_DROP;
4380         sb->sb_upcall           = NULL;
4381         sb->sb_upcallarg        = NULL;
4382
4383         sbunlock(sb, TRUE);     /* keep socket locked */
4384
4385         selthreadclear(&sb->sb_sel);
4386         sbrelease(sb);
4387 }
4388
4389 void
4390 sorflush(struct socket *so)
4391 {
4392         struct sockbuf *sb = &so->so_rcv;
4393         struct protosw *pr = so->so_proto;
4394         struct sockbuf asb;
4395 #ifdef notyet
4396         lck_mtx_t *mutex_held;
4397         /*
4398          * XXX: This code is currently commented out, because we may get here
4399          * as part of sofreelastref(), and at that time, pr_getlock() may no
4400          * longer be able to return us the lock; this will be fixed in future.
4401          */
4402         if (so->so_proto->pr_getlock != NULL)
4403                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4404         else
4405                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4406
4407         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4408 #endif /* notyet */
4409
4410         sflt_notify(so, sock_evt_flush_read, NULL);
4411
4412         socantrcvmore(so);
4413
4414         /*
4415          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4416          * to prevent the socket buffer from being unexpectedly altered
4417          * while it is used by another thread in socket send/receive.
4418          *
4419          * sblock() must not fail here, hence the assertion.
4420          */
4421         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4422         VERIFY(sb->sb_flags & SB_LOCK);
4423
4424         /*
4425          * Copy only the relevant fields from "sb" to "asb" which we
4426          * need for sbrelease() to function.  In particular, skip
4427          * sb_sel as it contains the wait queue linkage, which would
4428          * wreak havoc if we were to issue selthreadclear() on "asb".
4429          * Make sure to not carry over SB_LOCK in "asb", as we need
4430          * to acquire it later as part of sbrelease().
4431          */
4432         bzero(&asb, sizeof (asb));
4433         asb.sb_cc               = sb->sb_cc;
4434         asb.sb_hiwat            = sb->sb_hiwat;
4435         asb.sb_mbcnt            = sb->sb_mbcnt;
4436         asb.sb_mbmax            = sb->sb_mbmax;
4437         asb.sb_ctl              = sb->sb_ctl;
4438         asb.sb_lowat            = sb->sb_lowat;
4439         asb.sb_mb               = sb->sb_mb;
4440         asb.sb_mbtail           = sb->sb_mbtail;
4441         asb.sb_lastrecord       = sb->sb_lastrecord;
4442         asb.sb_so               = sb->sb_so;
4443         asb.sb_flags            = sb->sb_flags;
4444         asb.sb_flags            &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4445         asb.sb_flags            |= SB_DROP;
4446
4447         /*
4448          * Ideally we'd bzero() these and preserve the ones we need;
4449          * but to do that we'd need to shuffle things around in the
4450          * sockbuf, and we can't do it now because there are KEXTS
4451          * that are directly referring to the socket structure.
4452          *
4453          * Setting SB_DROP acts as a barrier to prevent further appends.
4454          * Clearing SB_SEL is done for selthreadclear() below.
4455          */
4456         sb->sb_cc               = 0;
4457         sb->sb_hiwat            = 0;
4458         sb->sb_mbcnt            = 0;
4459         sb->sb_mbmax            = 0;
4460         sb->sb_ctl              = 0;
4461         sb->sb_lowat            = 0;
4462         sb->sb_mb               = NULL;
4463         sb->sb_mbtail           = NULL;
4464         sb->sb_lastrecord       = NULL;
4465         sb->sb_timeo.tv_sec     = 0;
4466         sb->sb_timeo.tv_usec    = 0;
4467         sb->sb_upcall           = NULL;
4468         sb->sb_upcallarg        = NULL;
4469         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4470         sb->sb_flags            |= SB_DROP;
4471
4472         sbunlock(sb, TRUE);     /* keep socket locked */
4473
4474         /*
4475          * Note that selthreadclear() is called on the original "sb" and
4476          * not the local "asb" because of the way wait queue linkage is
4477          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4478          * should no longer be set (cleared above.)
4479          */
4480         selthreadclear(&sb->sb_sel);
4481
4482         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4483                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4484
4485         sbrelease(&asb);
4486 }
4487
4488 /*
4489  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4490  * an additional variant to handle the case where the option value needs
4491  * to be some kind of integer, but not a specific size.
4492  * In addition to their use here, these functions are also called by the
4493  * protocol-level pr_ctloutput() routines.
4494  *
4495  * Returns:     0                       Success
4496  *              EINVAL
4497  *      copyin:EFAULT
4498  */
4499 int
4500 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4501 {
4502         size_t  valsize;
4503
4504         /*
4505          * If the user gives us more than we wanted, we ignore it,
4506          * but if we don't get the minimum length the caller
4507          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4508          * is set to however much we actually retrieved.
4509          */
4510         if ((valsize = sopt->sopt_valsize) < minlen)
4511                 return (EINVAL);
4512         if (valsize > len)
4513                 sopt->sopt_valsize = valsize = len;
4514
4515         if (sopt->sopt_p != kernproc)
4516                 return (copyin(sopt->sopt_val, buf, valsize));
4517
4518         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4519         return (0);
4520 }
4521
4522 /*
4523  * sooptcopyin_timeval
4524  *   Copy in a timeval value into tv_p, and take into account whether the
4525  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4526  *   code here so that we can verify the 64-bit tv_sec value before we lose
4527  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4528  */
4529 static int
4530 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4531 {
4532         int                     error;
4533
4534         if (proc_is64bit(sopt->sopt_p)) {
4535                 struct user64_timeval   tv64;
4536
4537                 if (sopt->sopt_valsize < sizeof (tv64))
4538                         return (EINVAL);
4539
4540                 sopt->sopt_valsize = sizeof (tv64);
4541                 if (sopt->sopt_p != kernproc) {
4542                         error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4543                         if (error != 0)
4544                                 return (error);
4545                 } else {
4546                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4547                             sizeof (tv64));
4548                 }
4549                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4550                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4551                         return (EDOM);
4552
4553                 tv_p->tv_sec = tv64.tv_sec;
4554                 tv_p->tv_usec = tv64.tv_usec;
4555         } else {
4556                 struct user32_timeval   tv32;
4557
4558                 if (sopt->sopt_valsize < sizeof (tv32))
4559                         return (EINVAL);
4560
4561                 sopt->sopt_valsize = sizeof (tv32);
4562                 if (sopt->sopt_p != kernproc) {
4563                         error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4564                         if (error != 0) {
4565                                 return (error);
4566                         }
4567                 } else {
4568                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4569                             sizeof (tv32));
4570                 }
4571 #ifndef __LP64__
4572                 /*
4573                  * K64todo "comparison is always false due to
4574                  * limited range of data type"
4575                  */
4576                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4577                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4578                         return (EDOM);
4579 #endif
4580                 tv_p->tv_sec = tv32.tv_sec;
4581                 tv_p->tv_usec = tv32.tv_usec;
4582         }
4583         return (0);
4584 }
4585
4586 static int
4587 soopt_cred_check(struct socket *so, int priv)
4588 {
4589         kauth_cred_t cred =  NULL;
4590         proc_t ep = PROC_NULL;
4591         int error;
4592
4593         if (so->so_flags & SOF_DELEGATED) {
4594                 ep = proc_find(so->e_pid);
4595                 if (ep)
4596                         cred = kauth_cred_proc_ref(ep);
4597         }
4598         error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4599         if (cred)
4600                 kauth_cred_unref(&cred);
4601         if (ep != PROC_NULL)
4602                 proc_rele(ep);
4603
4604         return (error);
4605 }
4606
4607 /*
4608  * Returns:     0                       Success
4609  *              EINVAL
4610  *              ENOPROTOOPT
4611  *              ENOBUFS
4612  *              EDOM
4613  *      sooptcopyin:EINVAL
4614  *      sooptcopyin:EFAULT
4615  *      sooptcopyin_timeval:EINVAL
4616  *      sooptcopyin_timeval:EFAULT
4617  *      sooptcopyin_timeval:EDOM
4618  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4619  *      <pr_ctloutput>:???
4620  *      sflt_attach_private:???         [whatever a filter author chooses]
4621  *      <sf_setoption>:???              [whatever a filter author chooses]
4622  *
4623  * Notes:       Other <pr_ctloutput> returns depend on the protocol family;
4624  *              all <sf_setoption> returns depend on what the filter author
4625  *              causes their filter to return.
4626  */
4627 int
4628 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4629 {
4630         int     error, optval;
4631         struct  linger l;
4632         struct  timeval tv;
4633 #if CONFIG_MACF_SOCKET
4634         struct mac extmac;
4635 #endif /* MAC_SOCKET */
4636
4637         if (sopt->sopt_dir != SOPT_SET)
4638                 sopt->sopt_dir = SOPT_SET;
4639
4640         if (dolock)
4641                 socket_lock(so, 1);
4642
4643         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4644             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4645             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4646                 /* the socket has been shutdown, no more sockopt's */
4647                 error = EINVAL;
4648                 goto out;
4649         }
4650
4651         error = sflt_setsockopt(so, sopt);
4652         if (error != 0) {
4653                 if (error == EJUSTRETURN)
4654                         error = 0;
4655                 goto out;
4656         }
4657
4658         if (sopt->sopt_level != SOL_SOCKET) {
4659                 if (so->so_proto != NULL &&
4660                     so->so_proto->pr_ctloutput != NULL) {
4661                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4662                         goto out;
4663                 }
4664                 error = ENOPROTOOPT;
4665         } else {
4666                 /*
4667                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4668                  * the protocol layer, if needed.  A zero value returned from
4669                  * the handler means use default socket-level processing as
4670                  * done by the rest of this routine.  Otherwise, any other
4671                  * return value indicates that the option is unsupported.
4672                  */
4673                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4674                     pru_socheckopt(so, sopt)) != 0)
4675                         goto out;
4676
4677                 error = 0;
4678                 switch (sopt->sopt_name) {
4679                 case SO_LINGER:
4680                 case SO_LINGER_SEC:
4681                         error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4682                         if (error != 0)
4683                                 goto out;
4684
4685                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4686                             l.l_linger : l.l_linger * hz;
4687                         if (l.l_onoff != 0)
4688                                 so->so_options |= SO_LINGER;
4689                         else
4690                                 so->so_options &= ~SO_LINGER;
4691                         break;
4692
4693                 case SO_DEBUG:
4694                 case SO_KEEPALIVE:
4695                 case SO_DONTROUTE:
4696                 case SO_USELOOPBACK:
4697                 case SO_BROADCAST:
4698                 case SO_REUSEADDR:
4699                 case SO_REUSEPORT:
4700                 case SO_OOBINLINE:
4701                 case SO_TIMESTAMP:
4702                 case SO_TIMESTAMP_MONOTONIC:
4703                 case SO_DONTTRUNC:
4704                 case SO_WANTMORE:
4705                 case SO_WANTOOBFLAG:
4706                 case SO_NOWAKEFROMSLEEP:
4707                 case SO_NOAPNFALLBK:
4708                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4709                             sizeof (optval));
4710                         if (error != 0)
4711                                 goto out;
4712                         if (optval)
4713                                 so->so_options |= sopt->sopt_name;
4714                         else
4715                                 so->so_options &= ~sopt->sopt_name;
4716                         break;
4717
4718                 case SO_SNDBUF:
4719                 case SO_RCVBUF:
4720                 case SO_SNDLOWAT:
4721                 case SO_RCVLOWAT:
4722                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4723                             sizeof (optval));
4724                         if (error != 0)
4725                                 goto out;
4726
4727                         /*
4728                          * Values < 1 make no sense for any of these
4729                          * options, so disallow them.
4730                          */
4731                         if (optval < 1) {
4732                                 error = EINVAL;
4733                                 goto out;
4734                         }
4735
4736                         switch (sopt->sopt_name) {
4737                         case SO_SNDBUF:
4738                         case SO_RCVBUF: {
4739                                 struct sockbuf *sb =
4740                                     (sopt->sopt_name == SO_SNDBUF) ?
4741                                     &so->so_snd : &so->so_rcv;
4742                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4743                                         error = ENOBUFS;
4744                                         goto out;
4745                                 }
4746                                 sb->sb_flags |= SB_USRSIZE;
4747                                 sb->sb_flags &= ~SB_AUTOSIZE;
4748                                 sb->sb_idealsize = (u_int32_t)optval;
4749                                 break;
4750                         }
4751                         /*
4752                          * Make sure the low-water is never greater than
4753                          * the high-water.
4754                          */
4755                         case SO_SNDLOWAT: {
4756                                 int space = sbspace(&so->so_snd);
4757                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
4758
4759                                 if (so->so_snd.sb_flags & SB_UNIX) {
4760                                         struct unpcb *unp =
4761                                             (struct unpcb *)(so->so_pcb);
4762                                         if (unp != NULL &&
4763                                             unp->unp_conn != NULL) {
4764                                                 hiwat += unp->unp_conn->unp_cc;
4765                                         }
4766                                 }
4767
4768                                 so->so_snd.sb_lowat =
4769                                     (optval > hiwat) ?
4770                                     hiwat : optval;
4771
4772                                 if (space >= so->so_snd.sb_lowat) {
4773                                         sowwakeup(so);
4774                                 }
4775                                 break;
4776                         }
4777                         case SO_RCVLOWAT: {
4778                                 int64_t data_len;
4779                                 so->so_rcv.sb_lowat =
4780                                     (optval > so->so_rcv.sb_hiwat) ?
4781                                     so->so_rcv.sb_hiwat : optval;
4782                                 data_len = so->so_rcv.sb_cc
4783                                     - so->so_rcv.sb_ctl;
4784                                 if (data_len >= so->so_rcv.sb_lowat)
4785                                     sorwakeup(so);
4786                                 break;
4787                         }
4788                         }
4789                         break;
4790
4791                 case SO_SNDTIMEO:
4792                 case SO_RCVTIMEO:
4793                         error = sooptcopyin_timeval(sopt, &tv);
4794                         if (error != 0)
4795                                 goto out;
4796
4797                         switch (sopt->sopt_name) {
4798                         case SO_SNDTIMEO:
4799                                 so->so_snd.sb_timeo = tv;
4800                                 break;
4801                         case SO_RCVTIMEO:
4802                                 so->so_rcv.sb_timeo = tv;
4803                                 break;
4804                         }
4805                         break;
4806
4807                 case SO_NKE: {
4808                         struct so_nke nke;
4809
4810                         error = sooptcopyin(sopt, &nke, sizeof (nke),
4811                             sizeof (nke));
4812                         if (error != 0)
4813                                 goto out;
4814
4815                         error = sflt_attach_internal(so, nke.nke_handle);
4816                         break;
4817                 }
4818
4819                 case SO_NOSIGPIPE:
4820                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4821                             sizeof (optval));
4822                         if (error != 0)
4823                                 goto out;
4824                         if (optval != 0)
4825                                 so->so_flags |= SOF_NOSIGPIPE;
4826                         else
4827                                 so->so_flags &= ~SOF_NOSIGPIPE;
4828                         break;
4829
4830                 case SO_NOADDRERR:
4831                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4832                             sizeof (optval));
4833                         if (error != 0)
4834                                 goto out;
4835                         if (optval != 0)
4836                                 so->so_flags |= SOF_NOADDRAVAIL;
4837                         else
4838                                 so->so_flags &= ~SOF_NOADDRAVAIL;
4839                         break;
4840
4841                 case SO_REUSESHAREUID:
4842                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4843                             sizeof (optval));
4844                         if (error != 0)
4845                                 goto out;
4846                         if (optval != 0)
4847                                 so->so_flags |= SOF_REUSESHAREUID;
4848                         else
4849                                 so->so_flags &= ~SOF_REUSESHAREUID;
4850                         break;
4851
4852                 case SO_NOTIFYCONFLICT:
4853                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4854                                 error = EPERM;
4855                                 goto out;
4856                         }
4857                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4858                             sizeof (optval));
4859                         if (error != 0)
4860                                 goto out;
4861                         if (optval != 0)
4862                                 so->so_flags |= SOF_NOTIFYCONFLICT;
4863                         else
4864                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4865                         break;
4866
4867                 case SO_RESTRICTIONS:
4868                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4869                             sizeof (optval));
4870                         if (error != 0)
4871                                 goto out;
4872
4873                         error = so_set_restrictions(so, optval);
4874                         break;
4875
4876                 case SO_AWDL_UNRESTRICTED:
4877                         if (SOCK_DOM(so) != PF_INET &&
4878                             SOCK_DOM(so) != PF_INET6) {
4879                                 error = EOPNOTSUPP;
4880                                 goto out;
4881                         }
4882                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4883                             sizeof(optval));
4884                         if (error != 0)
4885                                 goto out;
4886                         if (optval != 0) {
4887                                 error = soopt_cred_check(so,
4888                                     PRIV_NET_RESTRICTED_AWDL);
4889                                 if (error == 0)
4890                                         inp_set_awdl_unrestricted(
4891                                             sotoinpcb(so));
4892                         } else
4893                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
4894                         break;
4895                 case SO_INTCOPROC_ALLOW:
4896                         if (SOCK_DOM(so) != PF_INET6) {
4897                                 error = EOPNOTSUPP;
4898                                 goto out;
4899                         }
4900                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4901                             sizeof(optval));
4902                         if (error != 0)
4903                                 goto out;
4904                         if (optval != 0 &&
4905                                         inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
4906                                 error = soopt_cred_check(so,
4907                                     PRIV_NET_RESTRICTED_INTCOPROC);
4908                                 if (error == 0)
4909                                         inp_set_intcoproc_allowed(
4910                                             sotoinpcb(so));
4911                         } else if (optval == 0)
4912                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
4913                         break;
4914
4915                 case SO_LABEL:
4916 #if CONFIG_MACF_SOCKET
4917                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4918                             sizeof (extmac))) != 0)
4919                                 goto out;
4920
4921                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4922                             so, &extmac);
4923 #else
4924                         error = EOPNOTSUPP;
4925 #endif /* MAC_SOCKET */
4926                         break;
4927
4928                 case SO_UPCALLCLOSEWAIT:
4929                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4930                             sizeof (optval));
4931                         if (error != 0)
4932                                 goto out;
4933                         if (optval != 0)
4934                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4935                         else
4936                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4937                         break;
4938
4939                 case SO_RANDOMPORT:
4940                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4941                             sizeof (optval));
4942                         if (error != 0)
4943                                 goto out;
4944                         if (optval != 0)
4945                                 so->so_flags |= SOF_BINDRANDOMPORT;
4946                         else
4947                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
4948                         break;
4949
4950                 case SO_NP_EXTENSIONS: {
4951                         struct so_np_extensions sonpx;
4952
4953                         error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4954                             sizeof (sonpx));
4955                         if (error != 0)
4956                                 goto out;
4957                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4958                                 error = EINVAL;
4959                                 goto out;
4960                         }
4961                         /*
4962                          * Only one bit defined for now
4963                          */
4964                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4965                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4966                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
4967                                 else
4968                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4969                         }
4970                         break;
4971                 }
4972
4973                 case SO_TRAFFIC_CLASS: {
4974                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4975                             sizeof (optval));
4976                         if (error != 0)
4977                                 goto out;
4978                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
4979                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
4980                                 error = so_set_net_service_type(so, netsvc);
4981                                 goto out;
4982                         }
4983                         error = so_set_traffic_class(so, optval);
4984                         if (error != 0)
4985                                 goto out;
4986                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
4987                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
4988                         break;
4989                 }
4990
4991                 case SO_RECV_TRAFFIC_CLASS: {
4992                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4993                             sizeof (optval));
4994                         if (error != 0)
4995                                 goto out;
4996                         if (optval == 0)
4997                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4998                         else
4999                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5000                         break;
5001                 }
5002
5003 #if (DEVELOPMENT || DEBUG)
5004                 case SO_TRAFFIC_CLASS_DBG: {
5005                         struct so_tcdbg so_tcdbg;
5006
5007                         error = sooptcopyin(sopt, &so_tcdbg,
5008                             sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5009                         if (error != 0)
5010                                 goto out;
5011                         error = so_set_tcdbg(so, &so_tcdbg);
5012                         if (error != 0)
5013                                 goto out;
5014                         break;
5015                 }
5016 #endif /* (DEVELOPMENT || DEBUG) */
5017
5018                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5019                         error = priv_check_cred(kauth_cred_get(),
5020                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5021                         if (error != 0)
5022                                 goto out;
5023                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5024                             sizeof (optval));
5025                         if (error != 0)
5026                                 goto out;
5027                         if (optval == 0)
5028                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5029                         else
5030                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5031                         break;
5032
5033                 case SO_DEFUNCTOK:
5034                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5035                             sizeof (optval));
5036                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5037                                 if (error == 0)
5038                                         error = EBADF;
5039                                 goto out;
5040                         }
5041                         /*
5042                          * Any process can set SO_DEFUNCTOK (clear
5043                          * SOF_NODEFUNCT), but only root can clear
5044                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5045                          */
5046                         if (optval == 0 &&
5047                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5048                                 error = EPERM;
5049                                 goto out;
5050                         }
5051                         if (optval)
5052                                 so->so_flags &= ~SOF_NODEFUNCT;
5053                         else
5054                                 so->so_flags |= SOF_NODEFUNCT;
5055
5056                         if (SOCK_DOM(so) == PF_INET ||
5057                             SOCK_DOM(so) == PF_INET6) {
5058                                 char s[MAX_IPv6_STR_LEN];
5059                                 char d[MAX_IPv6_STR_LEN];
5060                                 struct inpcb *inp = sotoinpcb(so);
5061
5062                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5063                                     "[%s %s:%d -> %s:%d] is now marked "
5064                                     "as %seligible for "
5065                                     "defunct\n", __func__, proc_selfpid(),
5066                                     proc_best_name(current_proc()),
5067                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5068                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5069                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5070                                     ((SOCK_DOM(so) == PF_INET) ?
5071                                     (void *)&inp->inp_laddr.s_addr :
5072                                     (void *)&inp->in6p_laddr), s, sizeof (s)),
5073                                     ntohs(inp->in6p_lport),
5074                                     inet_ntop(SOCK_DOM(so),
5075                                     (SOCK_DOM(so) == PF_INET) ?
5076                                     (void *)&inp->inp_faddr.s_addr :
5077                                     (void *)&inp->in6p_faddr, d, sizeof (d)),
5078                                     ntohs(inp->in6p_fport),
5079                                     (so->so_flags & SOF_NODEFUNCT) ?
5080                                     "not " : "");
5081                         } else {
5082                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5083                                     "is now marked as %seligible for "
5084                                     "defunct\n",
5085                                     __func__, proc_selfpid(),
5086                                     proc_best_name(current_proc()),
5087                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5088                                     SOCK_DOM(so), SOCK_TYPE(so),
5089                                     (so->so_flags & SOF_NODEFUNCT) ?
5090                                     "not " : "");
5091                         }
5092                         break;
5093
5094                 case SO_ISDEFUNCT:
5095                         /* This option is not settable */
5096                         error = EINVAL;
5097                         break;
5098
5099                 case SO_OPPORTUNISTIC:
5100                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5101                             sizeof (optval));
5102                         if (error == 0)
5103                                 error = so_set_opportunistic(so, optval);
5104                         break;
5105
5106                 case SO_FLUSH:
5107                         /* This option is handled by lower layer(s) */
5108                         error = 0;
5109                         break;
5110
5111                 case SO_RECV_ANYIF:
5112                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5113                             sizeof (optval));
5114                         if (error == 0)
5115                                 error = so_set_recv_anyif(so, optval);
5116                         break;
5117
5118                 case SO_TRAFFIC_MGT_BACKGROUND: {
5119                         /* This option is handled by lower layer(s) */
5120                         error = 0;
5121                         break;
5122                 }
5123
5124 #if FLOW_DIVERT
5125                 case SO_FLOW_DIVERT_TOKEN:
5126                         error = flow_divert_token_set(so, sopt);
5127                         break;
5128 #endif  /* FLOW_DIVERT */
5129
5130
5131                 case SO_DELEGATED:
5132                         if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5133                             sizeof (optval))) != 0)
5134                                 break;
5135
5136                         error = so_set_effective_pid(so, optval, sopt->sopt_p);
5137                         break;
5138
5139                 case SO_DELEGATED_UUID: {
5140                         uuid_t euuid;
5141
5142                         if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5143                             sizeof (euuid))) != 0)
5144                                 break;
5145
5146                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5147                         break;
5148                 }
5149
5150 #if NECP
5151                 case SO_NECP_ATTRIBUTES:
5152                         error = necp_set_socket_attributes(so, sopt);
5153                         break;
5154 #endif /* NECP */
5155
5156 #if MPTCP
5157                 case SO_MPTCP_FASTJOIN:
5158                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5159                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5160                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5161                                 error = ENOPROTOOPT;
5162                                 break;
5163                         }
5164
5165                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5166                             sizeof (optval));
5167                         if (error != 0)
5168                                 goto out;
5169                         if (optval == 0)
5170                                 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5171                         else
5172                                 so->so_flags |= SOF_MPTCP_FASTJOIN;
5173                         break;
5174 #endif /* MPTCP */
5175
5176                 case SO_EXTENDED_BK_IDLE:
5177                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5178                             sizeof (optval));
5179                         if (error == 0)
5180                                 error = so_set_extended_bk_idle(so, optval);
5181                         break;
5182
5183                 case SO_MARK_CELLFALLBACK:
5184                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5185                             sizeof(optval));
5186                         if (error != 0)
5187                                 goto out;
5188                         if (optval < 0) {
5189                                 error = EINVAL;
5190                                 goto out;
5191                         }
5192                         if (optval == 0)
5193                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5194                         else
5195                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5196                         break;
5197
5198                 case SO_NET_SERVICE_TYPE: {
5199                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5200                             sizeof(optval));
5201                         if (error != 0)
5202                                 goto out;
5203                         error = so_set_net_service_type(so, optval);
5204                         break;
5205                 }
5206
5207                 case SO_QOSMARKING_POLICY_OVERRIDE:
5208                         error = priv_check_cred(kauth_cred_get(),
5209                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5210                         if (error != 0)
5211                                 goto out;
5212                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5213                             sizeof(optval));
5214                         if (error != 0)
5215                                 goto out;
5216                         if (optval == 0)
5217                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5218                         else
5219                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5220                         break;
5221
5222                 default:
5223                         error = ENOPROTOOPT;
5224                         break;
5225                 }
5226                 if (error == 0 && so->so_proto != NULL &&
5227                     so->so_proto->pr_ctloutput != NULL) {
5228                         (void) so->so_proto->pr_ctloutput(so, sopt);
5229                 }
5230         }
5231 out:
5232         if (dolock)
5233                 socket_unlock(so, 1);
5234         return (error);
5235 }
5236
5237 /* Helper routines for getsockopt */
5238 int
5239 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5240 {
5241         int     error;
5242         size_t  valsize;
5243
5244         error = 0;
5245
5246         /*
5247          * Documented get behavior is that we always return a value,
5248          * possibly truncated to fit in the user's buffer.
5249          * Traditional behavior is that we always tell the user
5250          * precisely how much we copied, rather than something useful
5251          * like the total amount we had available for her.
5252          * Note that this interface is not idempotent; the entire answer must
5253          * generated ahead of time.
5254          */
5255         valsize = min(len, sopt->sopt_valsize);
5256         sopt->sopt_valsize = valsize;
5257         if (sopt->sopt_val != USER_ADDR_NULL) {
5258                 if (sopt->sopt_p != kernproc)
5259                         error = copyout(buf, sopt->sopt_val, valsize);
5260                 else
5261                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5262         }
5263         return (error);
5264 }
5265
5266 static int
5267 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5268 {
5269         int                     error;
5270         size_t                  len;
5271         struct user64_timeval   tv64;
5272         struct user32_timeval   tv32;
5273         const void *            val;
5274         size_t                  valsize;
5275
5276         error = 0;
5277         if (proc_is64bit(sopt->sopt_p)) {
5278                 len = sizeof (tv64);
5279                 tv64.tv_sec = tv_p->tv_sec;
5280                 tv64.tv_usec = tv_p->tv_usec;
5281                 val = &tv64;
5282         } else {
5283                 len = sizeof (tv32);
5284                 tv32.tv_sec = tv_p->tv_sec;
5285                 tv32.tv_usec = tv_p->tv_usec;
5286                 val = &tv32;
5287         }
5288         valsize = min(len, sopt->sopt_valsize);
5289         sopt->sopt_valsize = valsize;
5290         if (sopt->sopt_val != USER_ADDR_NULL) {
5291                 if (sopt->sopt_p != kernproc)
5292                         error = copyout(val, sopt->sopt_val, valsize);
5293                 else
5294                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5295         }
5296         return (error);
5297 }
5298
/*
 * sogetoptlock
 *
 * Service a "get socket option" request on `so'.
 *
 * Parameters:  so                      Socket being queried
 *              sopt                    Option request; sopt_dir is forced
 *                                      to SOPT_GET, and the result is
 *                                      written back through sopt_val /
 *                                      sopt_valsize
 *              dolock                  Non-zero: take the socket lock on
 *                                      entry and drop it on exit
 *
 * Processing order:
 *      1. sflt_getsockopt() gets first look; any non-zero return ends
 *         processing (EJUSTRETURN is reported to the caller as success).
 *      2. Requests whose level is not SOL_SOCKET are passed to the
 *         protocol's pr_ctloutput, or fail with ENOPROTOOPT when the
 *         protocol has none.
 *      3. SOL_SOCKET requests are first offered to the protocol's
 *         pru_socheckopt; a non-zero return is handed back as the error.
 *         Otherwise the option is handled by the switch below.
 *
 * Return:      0                       Success
 *              ENOPROTOOPT
 *              EINVAL                  Option is not gettable, or not
 *                                      valid for this kind of socket
 *              EOPNOTSUPP              Option not supported for this
 *                                      socket's domain / configuration
 *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *      <pr_ctloutput>:???
 *      <sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
#if CONFIG_MACF_SOCKET
        struct mac extmac;
#endif /* CONFIG_MACF_SOCKET */

        /* Normalize the direction; this routine only ever performs a get. */
        if (sopt->sopt_dir != SOPT_GET)
                sopt->sopt_dir = SOPT_GET;

        if (dolock)
                socket_lock(so, 1);

        /*
         * Any non-zero return from the filter call stops processing here.
         * EJUSTRETURN is translated to success before returning.
         */
        error = sflt_getsockopt(so, sopt);
        if (error != 0) {
                if (error == EJUSTRETURN)
                        error = 0;
                goto out;
        }

        if (sopt->sopt_level != SOL_SOCKET) {
                /* Not a socket-level option: defer entirely to the protocol. */
                if (so->so_proto != NULL &&
                    so->so_proto->pr_ctloutput != NULL) {
                        error = (*so->so_proto->pr_ctloutput)(so, sopt);
                        goto out;
                }
                error = ENOPROTOOPT;
        } else {
                /*
                 * Allow socket-level (SOL_SOCKET) options to be filtered by
                 * the protocol layer, if needed.  A zero value returned from
                 * the handler means use default socket-level processing as
                 * done by the rest of this routine.  Otherwise, any other
                 * return value indicates that the option is unsupported.
                 */
                if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
                    pru_socheckopt(so, sopt)) != 0)
                        goto out;

                error = 0;
                switch (sopt->sopt_name) {
                case SO_LINGER:
                case SO_LINGER_SEC:
                        /*
                         * SO_LINGER reports so_linger as stored;
                         * SO_LINGER_SEC reports it divided by hz.
                         */
                        l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
                        l.l_linger = (sopt->sopt_name == SO_LINGER) ?
                            so->so_linger : so->so_linger / hz;
                        error = sooptcopyout(sopt, &l, sizeof (l));
                        break;

                /*
                 * For these options the option name is used directly as a
                 * mask on so_options; the result is the masked bit(s), i.e.
                 * zero or the option's own flag value.
                 */
                case SO_USELOOPBACK:
                case SO_DONTROUTE:
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_TIMESTAMP_MONOTONIC:
                case SO_DONTTRUNC:
                case SO_WANTMORE:
                case SO_WANTOOBFLAG:
                case SO_NOWAKEFROMSLEEP:
                case SO_NOAPNFALLBK:
                        optval = so->so_options & sopt->sopt_name;
/*
 * Shared tail for every option whose result is the single int held in
 * optval; other cases jump here after computing it.
 */
integer:
                        error = sooptcopyout(sopt, &optval, sizeof (optval));
                        break;

                case SO_TYPE:
                        optval = so->so_type;
                        goto integer;

                case SO_NREAD:
                        /*
                         * PR_ATOMIC protocols: sum m_len over the m_next
                         * chain starting at the head of the receive buffer,
                         * counting only MT_DATA, MT_HEADER and MT_OOBDATA
                         * mbufs.  Other protocols: sb_cc minus sb_ctl.
                         *
                         * NOTE(review): so->so_proto is dereferenced here
                         * without the NULL check used earlier in this
                         * routine -- confirm it cannot be NULL at this point.
                         */
                        if (so->so_proto->pr_flags & PR_ATOMIC) {
                                int pkt_total;
                                struct mbuf *m1;

                                pkt_total = 0;
                                m1 = so->so_rcv.sb_mb;
                                while (m1 != NULL) {
                                        if (m1->m_type == MT_DATA ||
                                            m1->m_type == MT_HEADER ||
                                            m1->m_type == MT_OOBDATA)
                                                pkt_total += m1->m_len;
                                        m1 = m1->m_next;
                                }
                                optval = pkt_total;
                        } else {
                                optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
                        }
                        goto integer;

                case SO_NUMRCVPKT:
                        /*
                         * Only answered for PR_ATOMIC protocols: walk the
                         * m_nextpkt list of the receive buffer and count the
                         * entries whose first mbuf is MT_DATA, MT_HEADER or
                         * MT_OOBDATA.  Any other protocol gets EINVAL.
                         */
                        if (so->so_proto->pr_flags & PR_ATOMIC) {
                                int cnt = 0;
                                struct mbuf *m1;

                                m1 = so->so_rcv.sb_mb;
                                while (m1 != NULL) {
                                        if (m1->m_type == MT_DATA ||
                                            m1->m_type == MT_HEADER ||
                                            m1->m_type == MT_OOBDATA)
                                                cnt += 1;
                                        m1 = m1->m_nextpkt;
                                }
                                optval = cnt;
                                goto integer;
                        } else {
                                error = EINVAL;
                                break;
                        }

                case SO_NWRITE:
                        optval = so->so_snd.sb_cc;
                        goto integer;

                case SO_ERROR:
                        /* Reading the pending error also clears it. */
                        optval = so->so_error;
                        so->so_error = 0;
                        goto integer;

                case SO_SNDBUF: {
                        u_int32_t hiwat = so->so_snd.sb_hiwat;

                        /*
                         * For SB_UNIX send buffers with a connected peer,
                         * the peer's unp_cc is added to the reported value.
                         */
                        if (so->so_snd.sb_flags & SB_UNIX) {
                                struct unpcb *unp =
                                    (struct unpcb *)(so->so_pcb);
                                if (unp != NULL && unp->unp_conn != NULL) {
                                        hiwat += unp->unp_conn->unp_cc;
                                }
                        }

                        optval = hiwat;
                        goto integer;
                }
                case SO_RCVBUF:
                        optval = so->so_rcv.sb_hiwat;
                        goto integer;

                case SO_SNDLOWAT:
                        optval = so->so_snd.sb_lowat;
                        goto integer;

                case SO_RCVLOWAT:
                        optval = so->so_rcv.sb_lowat;
                        goto integer;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        tv = (sopt->sopt_name == SO_SNDTIMEO ?
                            so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

                        /* Copied out in the requester's timeval layout. */
                        error = sooptcopyout_timeval(sopt, &tv);
                        break;

                /*
                 * The following flag queries return the raw masked flag
                 * value (zero or the flag bit), not a normalized 0/1.
                 */
                case SO_NOSIGPIPE:
                        optval = (so->so_flags & SOF_NOSIGPIPE);
                        goto integer;

                case SO_NOADDRERR:
                        optval = (so->so_flags & SOF_NOADDRAVAIL);
                        goto integer;

                case SO_REUSESHAREUID:
                        optval = (so->so_flags & SOF_REUSESHAREUID);
                        goto integer;


                case SO_NOTIFYCONFLICT:
                        optval = (so->so_flags & SOF_NOTIFYCONFLICT);
                        goto integer;

                case SO_RESTRICTIONS:
                        optval = so_get_restrictions(so);
                        goto integer;

                case SO_AWDL_UNRESTRICTED:
                        /* Only answered for PF_INET and PF_INET6 sockets. */
                        if (SOCK_DOM(so) == PF_INET ||
                            SOCK_DOM(so) == PF_INET6) {
                                optval = inp_get_awdl_unrestricted(
                                    sotoinpcb(so));
                                goto integer;
                        } else
                                error = EOPNOTSUPP;
                        break;

                case SO_INTCOPROC_ALLOW:
                        /* Only answered for PF_INET6 sockets. */
                        if (SOCK_DOM(so) == PF_INET6) {
                                optval = inp_get_intcoproc_allowed(
                                    sotoinpcb(so));
                                goto integer;
                        } else
                                error = EOPNOTSUPP;
                        break;

                case SO_LABEL:
#if CONFIG_MACF_SOCKET
                        /*
                         * The requester's struct mac is copied in first,
                         * handed to mac_socket_label_get(), and then copied
                         * back out.
                         */
                        if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
                            sizeof (extmac))) != 0 ||
                            (error = mac_socket_label_get(proc_ucred(
                            sopt->sopt_p), so, &extmac)) != 0)
                                break;

                        error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
                        error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
                        break;

                case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
                        /* Same copy-in / fill / copy-out sequence as SO_LABEL. */
                        if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
                            sizeof (extmac))) != 0 ||
                            (error = mac_socketpeer_label_get(proc_ucred(
                            sopt->sopt_p), so, &extmac)) != 0)
                                break;

                        error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
                        error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
                        break;

#ifdef __APPLE_API_PRIVATE
                case SO_UPCALLCLOSEWAIT:
                        optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
                        goto integer;
#endif
                case SO_RANDOMPORT:
                        optval = (so->so_flags & SOF_BINDRANDOMPORT);
                        goto integer;

                case SO_NP_EXTENSIONS: {
                        struct so_np_extensions sonpx;

                        /*
                         * npx_flags reflects SOF_NPX_SETOPTSHUT; npx_mask is
                         * always reported as SONPX_MASK_VALID.
                         */
                        sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
                            SONPX_SETOPTSHUT : 0;
                        sonpx.npx_mask = SONPX_MASK_VALID;

                        error = sooptcopyout(sopt, &sonpx,
                            sizeof (struct so_np_extensions));
                        break;
                }

                case SO_TRAFFIC_CLASS:
                        optval = so->so_traffic_class;
                        goto integer;

                case SO_RECV_TRAFFIC_CLASS:
                        optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
                        goto integer;

                case SO_TRAFFIC_CLASS_STATS:
                        /* The so_tc_stats member is copied out as-is. */
                        error = sooptcopyout(sopt, &so->so_tc_stats,
                            sizeof (so->so_tc_stats));
                        break;

#if (DEVELOPMENT || DEBUG)
                case SO_TRAFFIC_CLASS_DBG:
                        error = sogetopt_tcdbg(so, sopt);
                        break;
#endif /* (DEVELOPMENT || DEBUG) */

                case SO_PRIVILEGED_TRAFFIC_CLASS:
                        optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
                        goto integer;

                case SO_DEFUNCTOK:
                        /* Logical inverse of SOF_NODEFUNCT: 1 when clear. */
                        optval = !(so->so_flags & SOF_NODEFUNCT);
                        goto integer;

                case SO_ISDEFUNCT:
                        optval = (so->so_flags & SOF_DEFUNCT);
                        goto integer;

                case SO_OPPORTUNISTIC:
                        optval = so_get_opportunistic(so);
                        goto integer;

                case SO_FLUSH:
                        /* This option is not gettable */
                        error = EINVAL;
                        break;

                case SO_RECV_ANYIF:
                        optval = so_get_recv_anyif(so);
                        goto integer;

                case SO_TRAFFIC_MGT_BACKGROUND:
                        /*
                         * This option is handled by lower layer(s).  The
                         * protocol's return value is deliberately discarded,
                         * so this case always reports success.
                         */
                        if (so->so_proto != NULL &&
                            so->so_proto->pr_ctloutput != NULL) {
                                (void) so->so_proto->pr_ctloutput(so, sopt);
                        }
                        break;

#if FLOW_DIVERT
                case SO_FLOW_DIVERT_TOKEN:
                        error = flow_divert_token_get(so, sopt);
                        break;
#endif  /* FLOW_DIVERT */

#if NECP
                case SO_NECP_ATTRIBUTES:
                        error = necp_get_socket_attributes(so, sopt);
                        break;
#endif /* NECP */

#if CONTENT_FILTER
                case SO_CFIL_SOCK_ID: {
                        cfil_sock_id_t sock_id;

                        sock_id = cfil_sock_id_from_socket(so);

                        error = sooptcopyout(sopt, &sock_id,
                                sizeof(cfil_sock_id_t));
                        break;
                }
#endif  /* CONTENT_FILTER */

#if MPTCP
                case SO_MPTCP_FASTJOIN:
                        /*
                         * Valid only on an MPTCP subflow (SOF_MP_SUBFLOW) or
                         * on a PF_MULTIPATH / IPPROTO_TCP socket; anything
                         * else gets ENOPROTOOPT.
                         */
                        if (!((so->so_flags & SOF_MP_SUBFLOW) ||
                            ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
                            (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
                                error = ENOPROTOOPT;
                                break;
                        }
                        optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
                        /* Fixed along with rdar://19391339 */
                        goto integer;
#endif /* MPTCP */

                case SO_EXTENDED_BK_IDLE:
                        optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
                        goto integer;
                case SO_MARK_CELLFALLBACK:
                        /* Normalized to exactly 0 or 1. */
                        optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
                            ? 1 : 0;
                        goto integer;
                case SO_NET_SERVICE_TYPE: {
                        /*
                         * Report so_netsvctype only when
                         * SOF1_TC_NET_SERV_TYPE is set; otherwise report
                         * NET_SERVICE_TYPE_BE.
                         */
                        if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
                                optval = so->so_netsvctype;
                        else
                                optval = NET_SERVICE_TYPE_BE;
                        goto integer;
                }
                case SO_NETSVC_MARKING_LEVEL:
                        optval = so_get_netsvc_marking_level(so);
                        goto integer;

                default:
                        error = ENOPROTOOPT;
                        break;
                }
        }
out:
        /* Single exit: drop the lock only if this routine took it. */
        if (dolock)
                socket_unlock(so, 1);
        return (error);
}
5671
5672 /*
5673  * The size limits on our soopt_getm is different from that on FreeBSD.
5674  * We limit the size of options to MCLBYTES. This will have to change
5675  * if we need to define options that need more space than MCLBYTES.
5676  */
5677 int
5678 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5679 {
5680         struct mbuf *m, *m_prev;
5681         int sopt_size = sopt->sopt_valsize;
5682         int how;
5683
5684         if (sopt_size <= 0 || sopt_size > MCLBYTES)
5685                 return (EMSGSIZE);
5686
5687         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5688         MGET(m, how, MT_DATA);
5689         if (m == NULL)
5690                 return (ENOBUFS);
5691         if (sopt_size > MLEN) {
5692                 MCLGET(m, how);
5693                 if ((m->m_flags & M_EXT) == 0) {
5694                         m_free(m);
5695                         return (ENOBUFS);
5696                 }
5697                 m->m_len = min(MCLBYTES, sopt_size);
5698         } else {
5699                 m->m_len = min(MLEN, sopt_size);
5700         }
5701         sopt_size -= m->m_len;
5702         *mp = m;
5703         m_prev = m;
5704
5705         while (sopt_size > 0) {
5706                 MGET(m, how, MT_DATA);
5707                 if (m == NULL) {
5708                         m_freem(*mp);
5709                         return (ENOBUFS);
5710                 }
5711                 if (sopt_size > MLEN) {
5712                         MCLGET(m, how);
5713                         if ((m->m_flags & M_EXT) == 0) {
5714                                 m_freem(*mp);
5715                                 m_freem(m);
5716                                 return (ENOBUFS);
5717                         }
5718                         m->m_len = min(MCLBYTES, sopt_size);
5719                 } else {
5720                         m->m_len = min(MLEN, sopt_size);
5721                 }
5722                 sopt_size -= m->m_len;
5723                 m_prev->m_next = m;
5724                 m_prev = m;
5725         }
5726         return (0);
5727 }
5728
5729 /* copyin sopt data into mbuf chain */
5730 int
5731 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5732 {
5733         struct mbuf *m0 = m;
5734
5735         if (sopt->sopt_val == USER_ADDR_NULL)
5736                 return (0);
5737         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5738                 if (sopt->sopt_p != kernproc) {
5739                         int error;
5740
5741                         error = copyin(sopt->sopt_val, mtod(m, char *),
5742                             m->m_len);
5743                         if (error != 0) {
5744                                 m_freem(m0);
5745                                 return (error);
5746                         }
5747                 } else {
5748                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5749                             mtod(m, char *), m->m_len);
5750                 }
5751                 sopt->sopt_valsize -= m->m_len;
5752                 sopt->sopt_val += m->m_len;
5753                 m = m->m_next;
5754         }
5755         /* should be allocated enoughly at ip6_sooptmcopyin() */
5756         if (m != NULL) {
5757                 panic("soopt_mcopyin");
5758                 /* NOTREACHED */
5759         }
5760         return (0);
5761 }
5762
5763 /* copyout mbuf chain data into soopt */
5764 int
5765 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5766 {
5767         struct mbuf *m0 = m;
5768         size_t valsize = 0;
5769
5770         if (sopt->sopt_val == USER_ADDR_NULL)
5771                 return (0);
5772         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5773                 if (sopt->sopt_p != kernproc) {
5774                         int error;
5775
5776                         error = copyout(mtod(m, char *), sopt->sopt_val,
5777                             m->m_len);
5778                         if (error != 0) {
5779                                 m_freem(m0);
5780                                 return (error);
5781                         }
5782                 } else {
5783                         bcopy(mtod(m, char *),
5784                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5785                 }
5786                 sopt->sopt_valsize -= m->m_len;
5787                 sopt->sopt_val += m->m_len;
5788                 valsize += m->m_len;
5789                 m = m->m_next;
5790         }
5791         if (m != NULL) {
5792                 /* enough soopt buffer should be given from user-land */
5793                 m_freem(m0);
5794                 return (EINVAL);
5795         }
5796         sopt->sopt_valsize = valsize;
5797         return (0);
5798 }
5799
5800 void
5801 sohasoutofband(struct socket *so)
5802 {
5803         if (so->so_pgid < 0)
5804                 gsignal(-so->so_pgid, SIGURG);
5805         else if (so->so_pgid > 0)
5806                 proc_signal(so->so_pgid, SIGURG);
5807         selwakeup(&so->so_rcv.sb_sel);
5808         if (so->so_rcv.sb_flags & SB_KNOTE) {
5809                 KNOTE(&so->so_rcv.sb_sel.si_note,
5810                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
5811         }
5812 }
5813
5814 int
5815 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5816 {
5817 #pragma unused(cred)
5818         struct proc *p = current_proc();
5819         int revents = 0;
5820
5821         socket_lock(so, 1);
5822         so_update_last_owner_locked(so, PROC_NULL);
5823         so_update_policy(so);
5824
5825         if (events & (POLLIN | POLLRDNORM))
5826                 if (soreadable(so))
5827                         revents |= events & (POLLIN | POLLRDNORM);
5828
5829         if (events & (POLLOUT | POLLWRNORM))
5830                 if (sowriteable(so))
5831                         revents |= events & (POLLOUT | POLLWRNORM);
5832
5833         if (events & (POLLPRI | POLLRDBAND))
5834                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5835                         revents |= events & (POLLPRI | POLLRDBAND);
5836
5837         if (revents == 0) {
5838                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5839                         /*
5840                          * Darwin sets the flag first,
5841                          * BSD calls selrecord first
5842                          */
5843                         so->so_rcv.sb_flags |= SB_SEL;
5844                         selrecord(p, &so->so_rcv.sb_sel, wql);
5845                 }
5846
5847                 if (events & (POLLOUT | POLLWRNORM)) {
5848                         /*
5849                          * Darwin sets the flag first,
5850                          * BSD calls selrecord first
5851                          */
5852                         so->so_snd.sb_flags |= SB_SEL;
5853                         selrecord(p, &so->so_snd.sb_sel, wql);
5854                 }
5855         }
5856
5857         socket_unlock(so, 1);
5858         return (revents);
5859 }
5860
5861 int
5862 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5863 {
5864 #pragma unused(fp)
5865 #if !CONFIG_MACF_SOCKET
5866 #pragma unused(ctx)
5867 #endif /* MAC_SOCKET */
5868         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5869         int result;
5870
5871         socket_lock(so, 1);
5872         so_update_last_owner_locked(so, PROC_NULL);
5873         so_update_policy(so);
5874
5875 #if CONFIG_MACF_SOCKET
5876         if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5877             kn, so) != 0) {
5878                 socket_unlock(so, 1);
5879                 kn->kn_flags = EV_ERROR;
5880                 kn->kn_data = EPERM;
5881                 return 0;
5882         }
5883 #endif /* MAC_SOCKET */
5884
5885         switch (kn->kn_filter) {
5886         case EVFILT_READ:
5887                 kn->kn_filtid = EVFILTID_SOREAD;
5888                 break;
5889         case EVFILT_WRITE:
5890                 kn->kn_filtid = EVFILTID_SOWRITE;
5891                 break;
5892         case EVFILT_SOCK:
5893                 kn->kn_filtid = EVFILTID_SCK;
5894                 break;
5895         case EVFILT_EXCEPT:
5896                 kn->kn_filtid = EVFILTID_SOEXCEPT;
5897                 break;
5898         default:
5899                 socket_unlock(so, 1);
5900                 kn->kn_flags = EV_ERROR;
5901                 kn->kn_data = EINVAL;
5902                 return 0;
5903         }
5904
5905         /*
5906          * call the appropriate sub-filter attach
5907          * with the socket still locked
5908          */
5909         result = knote_fops(kn)->f_attach(kn);
5910
5911         socket_unlock(so, 1);
5912
5913         return result;
5914 }
5915
5916 static int
5917 filt_soread_common(struct knote *kn, struct socket *so)
5918 {
5919         if (so->so_options & SO_ACCEPTCONN) {
5920                 int is_not_empty;
5921
5922                 /*
5923                  * Radar 6615193 handle the listen case dynamically
5924                  * for kqueue read filter. This allows to call listen()
5925                  * after registering the kqueue EVFILT_READ.
5926                  */
5927
5928                 kn->kn_data = so->so_qlen;
5929                 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
5930
5931                 return (is_not_empty);
5932         }
5933
5934         /* socket isn't a listener */
5935         /*
5936          * NOTE_LOWAT specifies new low water mark in data, i.e.
5937          * the bytes of protocol data. We therefore exclude any
5938          * control bytes.
5939          */
5940         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5941
5942         if (kn->kn_sfflags & NOTE_OOB) {
5943                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
5944                         kn->kn_fflags |= NOTE_OOB;
5945                         kn->kn_data -= so->so_oobmark;
5946                         return (1);
5947                 }
5948         }
5949
5950         if ((so->so_state & SS_CANTRCVMORE)
5951 #if CONTENT_FILTER
5952             && cfil_sock_data_pending(&so->so_rcv) == 0
5953 #endif /* CONTENT_FILTER */
5954            ) {
5955                 kn->kn_flags |= EV_EOF;
5956                 kn->kn_fflags = so->so_error;
5957                 return (1);
5958         }
5959
5960         if (so->so_error) {     /* temporary udp error */
5961                 return (1);
5962         }
5963
5964         int64_t lowwat = so->so_rcv.sb_lowat;
5965         /*
5966          * Ensure that when NOTE_LOWAT is used, the derived
5967          * low water mark is bounded by socket's rcv buf's
5968          * high and low water mark values.
5969          */
5970         if (kn->kn_sfflags & NOTE_LOWAT) {
5971                 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5972                         lowwat = so->so_rcv.sb_hiwat;
5973                 else if (kn->kn_sdata > lowwat)
5974                         lowwat = kn->kn_sdata;
5975         }
5976
5977         /*
5978          * The order below is important. Since NOTE_LOWAT
5979          * overrides sb_lowat, check for NOTE_LOWAT case
5980          * first.
5981          */
5982         if (kn->kn_sfflags & NOTE_LOWAT)
5983                 return (kn->kn_data >= lowwat);
5984
5985         return (so->so_rcv.sb_cc >= lowwat);
5986 }
5987
5988 static int
5989 filt_sorattach(struct knote *kn)
5990 {
5991         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5992
5993         /* socket locked */
5994
5995         /*
5996          * If the caller explicitly asked for OOB results (e.g. poll())
5997          * from EVFILT_READ, then save that off in the hookid field
5998          * and reserve the kn_flags EV_OOBAND bit for output only.
5999          */
6000         if (kn->kn_filter == EVFILT_READ &&
6001             kn->kn_flags & EV_OOBAND) {
6002                 kn->kn_flags &= ~EV_OOBAND;
6003                 kn->kn_hookid = EV_OOBAND;
6004         } else {
6005                 kn->kn_hookid = 0;
6006         }
6007         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6008                 so->so_rcv.sb_flags |= SB_KNOTE;
6009
6010         /* indicate if event is already fired */
6011         return filt_soread_common(kn, so);
6012 }
6013
6014 static void
6015 filt_sordetach(struct knote *kn)
6016 {
6017         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6018
6019         socket_lock(so, 1);
6020         if (so->so_rcv.sb_flags & SB_KNOTE)
6021                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6022                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6023         socket_unlock(so, 1);
6024 }
6025
6026 /*ARGSUSED*/
6027 static int
6028 filt_soread(struct knote *kn, long hint)
6029 {
6030         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6031         int retval;
6032
6033         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6034                 socket_lock(so, 1);
6035
6036         retval = filt_soread_common(kn, so);
6037
6038         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6039                 socket_unlock(so, 1);
6040
6041         return retval;
6042 }
6043
6044 static int
6045 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6046 {
6047         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6048         int retval;
6049
6050         socket_lock(so, 1);
6051
6052         /* save off the new input fflags and data */
6053         kn->kn_sfflags = kev->fflags;
6054         kn->kn_sdata = kev->data;
6055         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6056                 kn->kn_udata = kev->udata;
6057
6058         /* determine if changes result in fired events */
6059         retval = filt_soread_common(kn, so);
6060
6061         socket_unlock(so, 1);
6062
6063         return retval;
6064 }
6065
6066 static int
6067 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6068 {
6069 #pragma unused(data)
6070         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6071         int retval;
6072
6073         socket_lock(so, 1);
6074         retval = filt_soread_common(kn, so);
6075         if (retval) {
6076                 *kev = kn->kn_kevent;
6077                 if (kn->kn_flags & EV_CLEAR) {
6078                         kn->kn_fflags = 0;
6079                         kn->kn_data = 0;
6080                 }
6081         }
6082         socket_unlock(so, 1);
6083
6084         return retval;
6085 }
6086
6087 int
6088 so_wait_for_if_feedback(struct socket *so)
6089 {
6090         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6091             (so->so_state & SS_ISCONNECTED)) {
6092                 struct inpcb *inp = sotoinpcb(so);
6093                 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6094                         return (1);
6095         }
6096         return (0);
6097 }
6098
6099 static int
6100 filt_sowrite_common(struct knote *kn, struct socket *so)
6101 {
6102         int ret = 0;
6103
6104         kn->kn_data = sbspace(&so->so_snd);
6105         if (so->so_state & SS_CANTSENDMORE) {
6106                 kn->kn_flags |= EV_EOF;
6107                 kn->kn_fflags = so->so_error;
6108                 return 1;
6109         }
6110         if (so->so_error) {     /* temporary udp error */
6111                 return 1;
6112         }
6113         if (!socanwrite(so)) {
6114                 return 0;
6115         }
6116         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6117                 return 1;
6118         }
6119         int64_t lowwat = so->so_snd.sb_lowat;
6120         if (kn->kn_sfflags & NOTE_LOWAT) {
6121                 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6122                         lowwat = so->so_snd.sb_hiwat;
6123                 else if (kn->kn_sdata > lowwat)
6124                         lowwat = kn->kn_sdata;
6125         }
6126         if (kn->kn_data >= lowwat) {
6127                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6128 #if (DEBUG || DEVELOPMENT)
6129                     && so_notsent_lowat_check == 1
6130 #endif /* DEBUG || DEVELOPMENT */
6131                     ) {
6132                         if ((SOCK_DOM(so) == PF_INET ||
6133                             SOCK_DOM(so) == PF_INET6) &&
6134                             so->so_type == SOCK_STREAM) {
6135                                 ret = tcp_notsent_lowat_check(so);
6136                         }
6137 #if MPTCP
6138                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6139                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6140                                 ret = mptcp_notsent_lowat_check(so);
6141                         }
6142 #endif
6143                         else {
6144                                 return 1;
6145                         }
6146                 } else {
6147                         ret = 1;
6148                 }
6149         }
6150         if (so_wait_for_if_feedback(so))
6151                 ret = 0;
6152         return (ret);
6153 }
6154
6155 static int
6156 filt_sowattach(struct knote *kn)
6157 {
6158         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6159
6160         /* socket locked */
6161         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6162                 so->so_snd.sb_flags |= SB_KNOTE;
6163
6164         /* determine if its already fired */
6165         return filt_sowrite_common(kn, so);
6166 }
6167
6168 static void
6169 filt_sowdetach(struct knote *kn)
6170 {
6171         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6172         socket_lock(so, 1);
6173
6174         if (so->so_snd.sb_flags & SB_KNOTE)
6175                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6176                         so->so_snd.sb_flags &= ~SB_KNOTE;
6177         socket_unlock(so, 1);
6178 }
6179
6180 /*ARGSUSED*/
6181 static int
6182 filt_sowrite(struct knote *kn, long hint)
6183 {
6184         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6185         int ret;
6186
6187         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6188                 socket_lock(so, 1);
6189
6190         ret = filt_sowrite_common(kn, so);
6191
6192         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6193                 socket_unlock(so, 1);
6194
6195         return ret;
6196 }
6197
6198 static int
6199 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6200 {
6201         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6202         int ret;
6203
6204         socket_lock(so, 1);
6205
6206         /*save off the new input fflags and data */
6207         kn->kn_sfflags = kev->fflags;
6208         kn->kn_sdata = kev->data;
6209         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6210                 kn->kn_udata = kev->udata;
6211
6212         /* determine if these changes result in a triggered event */
6213         ret = filt_sowrite_common(kn, so);
6214
6215         socket_unlock(so, 1);
6216
6217         return ret;
6218 }
6219
6220 static int
6221 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6222 {
6223 #pragma unused(data)
6224         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6225         int ret;
6226
6227         socket_lock(so, 1);
6228         ret = filt_sowrite_common(kn, so);
6229         if (ret) {
6230                 *kev = kn->kn_kevent;
6231                 if (kn->kn_flags & EV_CLEAR) {
6232                         kn->kn_fflags = 0;
6233                         kn->kn_data = 0;
6234                 }
6235         }
6236         socket_unlock(so, 1);
6237         return ret;
6238 }
6239
6240 static int
6241 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6242 {
6243         int ret = 0;
6244         uint32_t level_trigger = 0;
6245
6246         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6247                 kn->kn_fflags |= NOTE_CONNRESET;
6248         }
6249         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6250                 kn->kn_fflags |= NOTE_TIMEOUT;
6251         }
6252         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6253                 kn->kn_fflags |= NOTE_NOSRCADDR;
6254         }
6255         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6256                 kn->kn_fflags |= NOTE_IFDENIED;
6257         }
6258         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6259                 kn->kn_fflags |= NOTE_KEEPALIVE;
6260         }
6261         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6262                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6263         }
6264         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6265                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6266         }
6267         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6268             (so->so_state & SS_ISCONNECTED)) {
6269                 kn->kn_fflags |= NOTE_CONNECTED;
6270                 level_trigger |= NOTE_CONNECTED;
6271         }
6272         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6273             (so->so_state & SS_ISDISCONNECTED)) {
6274                 kn->kn_fflags |= NOTE_DISCONNECTED;
6275                 level_trigger |= NOTE_DISCONNECTED;
6276         }
6277         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6278                 if (so->so_proto != NULL &&
6279                     (so->so_proto->pr_flags & PR_EVCONNINFO))
6280                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6281         }
6282
6283         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6284             tcp_notify_ack_active(so)) {
6285                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6286         }
6287
6288         if ((so->so_state & SS_CANTRCVMORE)
6289 #if CONTENT_FILTER
6290             && cfil_sock_data_pending(&so->so_rcv) == 0
6291 #endif /* CONTENT_FILTER */
6292             ) {
6293                 kn->kn_fflags |= NOTE_READCLOSED;
6294                 level_trigger |= NOTE_READCLOSED;
6295         }
6296
6297         if (so->so_state & SS_CANTSENDMORE) {
6298                 kn->kn_fflags |= NOTE_WRITECLOSED;
6299                 level_trigger |= NOTE_WRITECLOSED;
6300         }
6301
6302         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6303             (so->so_flags & SOF_SUSPENDED)) {
6304                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6305
6306                 /* If resume event was delivered before, reset it */
6307                 kn->kn_hookid &= ~NOTE_RESUME;
6308
6309                 kn->kn_fflags |= NOTE_SUSPEND;
6310                 level_trigger |= NOTE_SUSPEND;
6311         }
6312
6313         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6314             (so->so_flags & SOF_SUSPENDED) == 0) {
6315                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6316
6317                 /* If suspend event was delivered before, reset it */
6318                 kn->kn_hookid &= ~NOTE_SUSPEND;
6319
6320                 kn->kn_fflags |= NOTE_RESUME;
6321                 level_trigger |= NOTE_RESUME;
6322         }
6323
6324         if (so->so_error != 0) {
6325                 ret = 1;
6326                 kn->kn_data = so->so_error;
6327                 kn->kn_flags |= EV_EOF;
6328         } else {
6329                 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6330         }
6331
6332         /* Reset any events that are not requested on this knote */
6333         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6334         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6335
6336         /* Find the level triggerred events that are already delivered */
6337         level_trigger &= kn->kn_hookid;
6338         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6339
6340         /* Do not deliver level triggerred events more than once */
6341         if ((kn->kn_fflags & ~level_trigger) != 0)
6342                 ret = 1;
6343
6344         return (ret);
6345 }
6346
6347 static int
6348 filt_sockattach(struct knote *kn)
6349 {
6350         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6351
6352         /* socket locked */
6353         kn->kn_hookid = 0;
6354         if (KNOTE_ATTACH(&so->so_klist, kn))
6355                 so->so_flags |= SOF_KNOTE;
6356
6357         /* determine if event already fired */
6358         return filt_sockev_common(kn, so, 0);
6359 }
6360
6361 static void
6362 filt_sockdetach(struct knote *kn)
6363 {
6364         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6365         socket_lock(so, 1);
6366
6367         if ((so->so_flags & SOF_KNOTE) != 0)
6368                 if (KNOTE_DETACH(&so->so_klist, kn))
6369                         so->so_flags &= ~SOF_KNOTE;
6370         socket_unlock(so, 1);
6371 }
6372
6373 static int
6374 filt_sockev(struct knote *kn, long hint)
6375 {
6376         int ret = 0, locked = 0;
6377         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6378         long ev_hint = (hint & SO_FILT_HINT_EV);
6379
6380         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6381                 socket_lock(so, 1);
6382                 locked = 1;
6383         }
6384
6385         ret = filt_sockev_common(kn, so, ev_hint);
6386
6387         if (locked)
6388                 socket_unlock(so, 1);
6389
6390         return ret;
6391 }
6392
6393
6394
6395 /*
6396  *      filt_socktouch - update event state
6397  */
6398 static int
6399 filt_socktouch(
6400         struct knote *kn,
6401         struct kevent_internal_s *kev)
6402 {
6403         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6404         uint32_t changed_flags;
6405         int ret;
6406
6407         socket_lock(so, 1);
6408
6409         /* save off the [result] data and fflags */
6410         changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6411
6412         /* save off the new input fflags and data */
6413         kn->kn_sfflags = kev->fflags;
6414         kn->kn_sdata = kev->data;
6415         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6416                 kn->kn_udata = kev->udata;
6417
6418         /* restrict the current results to the (smaller?) set of new interest */
6419         /*
6420          * For compatibility with previous implementations, we leave kn_fflags
6421          * as they were before.
6422          */
6423         //kn->kn_fflags &= kev->fflags;
6424
6425         /*
6426          * Since we keep track of events that are already
6427          * delivered, if any of those events are not requested
6428          * anymore the state related to them can be reset
6429          */
6430         kn->kn_hookid &=
6431             ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6432
6433         /* determine if we have events to deliver */
6434         ret = filt_sockev_common(kn, so, 0);
6435
6436         socket_unlock(so, 1);
6437
6438         return ret;
6439 }
6440
6441 /*
6442  *      filt_sockprocess - query event fired state and return data
6443  */
6444 static int
6445 filt_sockprocess(
6446         struct knote *kn,
6447         struct filt_process_s *data,
6448         struct kevent_internal_s *kev)
6449 {
6450 #pragma unused(data)
6451
6452         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6453         int ret = 0;
6454
6455         socket_lock(so, 1);
6456
6457         ret = filt_sockev_common(kn, so, 0);
6458         if (ret) {
6459                 *kev = kn->kn_kevent;
6460
6461                 /*
6462                  * Store the state of the events being delivered. This
6463                  * state can be used to deliver level triggered events
6464                  * ateast once and still avoid waking up the application
6465                  * multiple times as long as the event is active.
6466                  */
6467                 if (kn->kn_fflags != 0)
6468                         kn->kn_hookid |= (kn->kn_fflags &
6469                                           EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6470
6471                 /*
6472                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6473                  * only one of them and remember the last one that was
6474                  * delivered last
6475                  */
6476                 if (kn->kn_fflags & NOTE_SUSPEND)
6477                         kn->kn_hookid &= ~NOTE_RESUME;
6478                 if (kn->kn_fflags & NOTE_RESUME)
6479                         kn->kn_hookid &= ~NOTE_SUSPEND;
6480
6481                 if (kn->kn_flags & EV_CLEAR) {
6482                         kn->kn_data = 0;
6483                         kn->kn_fflags = 0;
6484                 }
6485         }
6486
6487         socket_unlock(so, 1);
6488
6489         return ret;
6490 }
6491
6492 void
6493 get_sockev_state(struct socket *so, u_int32_t *statep)
6494 {
6495         u_int32_t state = *(statep);
6496
6497         /*
6498          * If the state variable is already used by a previous event,
6499          * reset it.
6500          */
6501         if (state != 0)
6502                 return;
6503
6504         if (so->so_state & SS_ISCONNECTED)
6505                 state |= SOCKEV_CONNECTED;
6506         else
6507                 state &= ~(SOCKEV_CONNECTED);
6508         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6509         *(statep) = state;
6510 }
6511
6512 #define SO_LOCK_HISTORY_STR_LEN \
6513         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6514
6515 __private_extern__ const char *
6516 solockhistory_nr(struct socket *so)
6517 {
6518         size_t n = 0;
6519         int i;
6520         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6521
6522         bzero(lock_history_str, sizeof (lock_history_str));
6523         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6524                 n += snprintf(lock_history_str + n,
6525                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6526                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6527                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6528         }
6529         return (lock_history_str);
6530 }
6531
6532 int
6533 socket_lock(struct socket *so, int refcount)
6534 {
6535         int error = 0;
6536         void *lr_saved;
6537
6538         lr_saved = __builtin_return_address(0);
6539
6540         if (so->so_proto->pr_lock) {
6541                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6542         } else {
6543 #ifdef MORE_LOCKING_DEBUG
6544                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6545                     LCK_MTX_ASSERT_NOTOWNED);
6546 #endif
6547                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6548                 if (refcount)
6549                         so->so_usecount++;
6550                 so->lock_lr[so->next_lock_lr] = lr_saved;
6551                 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6552         }
6553
6554         return (error);
6555 }
6556
6557 int
6558 socket_unlock(struct socket *so, int refcount)
6559 {
6560         int error = 0;
6561         void *lr_saved;
6562         lck_mtx_t *mutex_held;
6563
6564         lr_saved = __builtin_return_address(0);
6565
6566         if (so->so_proto == NULL) {
6567                 panic("%s: null so_proto so=%p\n", __func__, so);
6568                 /* NOTREACHED */
6569         }
6570
6571         if (so && so->so_proto->pr_unlock) {
6572                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6573         } else {
6574                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6575 #ifdef MORE_LOCKING_DEBUG
6576                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6577 #endif
6578                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6579                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6580
6581                 if (refcount) {
6582                         if (so->so_usecount <= 0) {
6583                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6584                                     "lrh=%s", __func__, so->so_usecount, so,
6585                                     SOCK_DOM(so), so->so_type,
6586                                     SOCK_PROTO(so), solockhistory_nr(so));
6587                                 /* NOTREACHED */
6588                         }
6589
6590                         so->so_usecount--;
6591                         if (so->so_usecount == 0)
6592                                 sofreelastref(so, 1);
6593                 }
6594                 lck_mtx_unlock(mutex_held);
6595         }
6596
6597         return (error);
6598 }
6599
6600 /* Called with socket locked, will unlock socket */
6601 void
6602 sofree(struct socket *so)
6603 {
6604         lck_mtx_t *mutex_held;
6605
6606         if (so->so_proto->pr_getlock != NULL)
6607                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6608         else
6609                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6610         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6611
6612         sofreelastref(so, 0);
6613 }
6614
/*
 * Take one reference on the socket: lock it with the refcount argument
 * set, then unlock it with the refcount argument clear so that the
 * reference is kept.
 */
void
soreference(struct socket *so)
{
        /* lock and acquire the reference */
        (void) socket_lock(so, 1);
        /* release the lock only; the reference stays */
        (void) socket_unlock(so, 0);
}
6621
/*
 * Drop one reference on the socket: lock it with the refcount argument
 * clear, then unlock it with the refcount argument set so that the
 * unlock releases a reference.
 */
void
sodereference(struct socket *so)
{
        /* lock only; no new reference */
        (void) socket_lock(so, 0);
        /* unlock and release one reference */
        (void) socket_unlock(so, 1);
}
6628
6629 /*
6630  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6631  * possibility of using jumbo clusters.  Caller must ensure to hold
6632  * the socket lock.
6633  */
6634 void
6635 somultipages(struct socket *so, boolean_t set)
6636 {
6637         if (set)
6638                 so->so_flags |= SOF_MULTIPAGES;
6639         else
6640                 so->so_flags &= ~SOF_MULTIPAGES;
6641 }
6642
6643 void
6644 soif2kcl(struct socket *so, boolean_t set)
6645 {
6646         if (set)
6647                 so->so_flags1 |= SOF1_IF_2KCL;
6648         else
6649                 so->so_flags1 &= ~SOF1_IF_2KCL;
6650 }
6651
6652 int
6653 so_isdstlocal(struct socket *so) {
6654
6655         struct inpcb *inp = (struct inpcb *)so->so_pcb;
6656
6657         if (SOCK_DOM(so) == PF_INET)
6658                 return (inaddr_local(inp->inp_faddr));
6659         else if (SOCK_DOM(so) == PF_INET6)
6660                 return (in6addr_local(&inp->in6p_faddr));
6661
6662         return (0);
6663 }
6664
6665 int
6666 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6667 {
6668         struct sockbuf *rcv, *snd;
6669         int err = 0, defunct;
6670
6671         rcv = &so->so_rcv;
6672         snd = &so->so_snd;
6673
6674         defunct = (so->so_flags & SOF_DEFUNCT);
6675         if (defunct) {
6676                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6677                         panic("%s: SB_DROP not set", __func__);
6678                         /* NOTREACHED */
6679                 }
6680                 goto done;
6681         }
6682
6683         if (so->so_flags & SOF_NODEFUNCT) {
6684                 if (noforce) {
6685                         err = EOPNOTSUPP;
6686                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6687                             "name %s level %d) so 0x%llx [%d,%d] "
6688                             "is not eligible for defunct "
6689                             "(%d)\n", __func__, proc_selfpid(),
6690                             proc_best_name(current_proc()), proc_pid(p),
6691                             proc_best_name(p), level,
6692                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6693                             SOCK_DOM(so), SOCK_TYPE(so), err);
6694                         return (err);
6695                 }
6696                 so->so_flags &= ~SOF_NODEFUNCT;
6697                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6698                     "so 0x%llx [%d,%d] defunct by force\n", __func__,
6699                     proc_selfpid(), proc_best_name(current_proc()),
6700                     proc_pid(p), proc_best_name(p), level,
6701                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6702                     SOCK_DOM(so), SOCK_TYPE(so));
6703         } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6704                 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6705                 struct ifnet *ifp = inp->inp_last_outifp;
6706
6707                 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6708                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6709                 } else if (so->so_flags & SOF_DELEGATED) {
6710                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6711                 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6712                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6713                 } else if (noforce) {
6714                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6715
6716                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6717                         so->so_extended_bk_start = net_uptime();
6718                         OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6719
6720                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6721
6722                         err = EOPNOTSUPP;
6723                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6724                             "level %d) extend bk idle so 0x%llx rcv hw %d "
6725                             "cc %d\n",
6726                             __func__, proc_selfpid(),
6727                             proc_best_name(current_proc()), proc_pid(p),
6728                             proc_best_name(p), level,
6729                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6730                             so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6731                         return (err);
6732                 } else {
6733                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6734                 }
6735         }
6736
6737         so->so_flags |= SOF_DEFUNCT;
6738
6739         /* Prevent further data from being appended to the socket buffers */
6740         snd->sb_flags |= SB_DROP;
6741         rcv->sb_flags |= SB_DROP;
6742
6743         /* Flush any existing data in the socket buffers */
6744         if (rcv->sb_cc != 0) {
6745                 rcv->sb_flags &= ~SB_SEL;
6746                 selthreadclear(&rcv->sb_sel);
6747                 sbrelease(rcv);
6748         }
6749         if (snd->sb_cc != 0) {
6750                 snd->sb_flags &= ~SB_SEL;
6751                 selthreadclear(&snd->sb_sel);
6752                 sbrelease(snd);
6753         }
6754
6755 done:
6756         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6757             "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6758             proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6759             level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6760             SOCK_TYPE(so), defunct ? "is already" : "marked as",
6761             (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6762
6763         return (err);
6764 }
6765
6766 int
6767 sodefunct(struct proc *p, struct socket *so, int level)
6768 {
6769         struct sockbuf *rcv, *snd;
6770
6771         if (!(so->so_flags & SOF_DEFUNCT)) {
6772                 panic("%s improperly called", __func__);
6773                 /* NOTREACHED */
6774         }
6775         if (so->so_state & SS_DEFUNCT)
6776                 goto done;
6777
6778         rcv = &so->so_rcv;
6779         snd = &so->so_snd;
6780
6781         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6782                 char s[MAX_IPv6_STR_LEN];
6783                 char d[MAX_IPv6_STR_LEN];
6784                 struct inpcb *inp = sotoinpcb(so);
6785
6786                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6787                     "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6788                     "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6789                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6790                     proc_pid(p), proc_best_name(p), level,
6791                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6792                     (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6793                     inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6794                     (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6795                     s, sizeof (s)), ntohs(inp->in6p_lport),
6796                     inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6797                     (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6798                     d, sizeof (d)), ntohs(inp->in6p_fport),
6799                     (uint32_t)rcv->sb_sel.si_flags,
6800                     (uint32_t)snd->sb_sel.si_flags,
6801                     rcv->sb_flags, snd->sb_flags);
6802         } else {
6803                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6804                     "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6805                     "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6806                     proc_selfpid(), proc_best_name(current_proc()),
6807                     proc_pid(p), proc_best_name(p), level,
6808                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6809                     SOCK_DOM(so), SOCK_TYPE(so),
6810                     (uint32_t)rcv->sb_sel.si_flags,
6811                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6812                     snd->sb_flags);
6813         }
6814
6815         /*
6816          * Unwedge threads blocked on sbwait() and sb_lock().
6817          */
6818         sbwakeup(rcv);
6819         sbwakeup(snd);
6820
6821         so->so_flags1 |= SOF1_DEFUNCTINPROG;
6822         if (rcv->sb_flags & SB_LOCK)
6823                 sbunlock(rcv, TRUE);    /* keep socket locked */
6824         if (snd->sb_flags & SB_LOCK)
6825                 sbunlock(snd, TRUE);    /* keep socket locked */
6826
6827         /*
6828          * Flush the buffers and disconnect.  We explicitly call shutdown
6829          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6830          * states are set for the socket.  This would also flush out data
6831          * hanging off the receive list of this socket.
6832          */
6833         (void) soshutdownlock_final(so, SHUT_RD);
6834         (void) soshutdownlock_final(so, SHUT_WR);
6835         (void) sodisconnectlocked(so);
6836
6837         /*
6838          * Explicitly handle connectionless-protocol disconnection
6839          * and release any remaining data in the socket buffers.
6840          */
6841         if (!(so->so_flags & SS_ISDISCONNECTED))
6842                 (void) soisdisconnected(so);
6843
6844         if (so->so_error == 0)
6845                 so->so_error = EBADF;
6846
6847         if (rcv->sb_cc != 0) {
6848                 rcv->sb_flags &= ~SB_SEL;
6849                 selthreadclear(&rcv->sb_sel);
6850                 sbrelease(rcv);
6851         }
6852         if (snd->sb_cc != 0) {
6853                 snd->sb_flags &= ~SB_SEL;
6854                 selthreadclear(&snd->sb_sel);
6855                 sbrelease(snd);
6856         }
6857         so->so_state |= SS_DEFUNCT;
6858         OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6859
6860 done:
6861         return (0);
6862 }
6863
6864 int
6865 soresume(struct proc *p, struct socket *so, int locked)
6866 {
6867         if (locked == 0)
6868                 socket_lock(so, 1);
6869
6870         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6871                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6872                     "[%d,%d] resumed from bk idle\n",
6873                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6874                     proc_pid(p), proc_best_name(p),
6875                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6876                     SOCK_DOM(so), SOCK_TYPE(so));
6877
6878                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6879                 so->so_extended_bk_start = 0;
6880                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6881
6882                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6883                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6884                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6885         }
6886         if (locked == 0)
6887                 socket_unlock(so, 1);
6888
6889         return (0);
6890 }
6891
6892 /*
6893  * Does not attempt to account for sockets that are delegated from
6894  * the current process
6895  */
6896 int
6897 so_set_extended_bk_idle(struct socket *so, int optval)
6898 {
6899         int error = 0;
6900
6901         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6902             SOCK_PROTO(so) != IPPROTO_TCP) {
6903                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6904                 error = EOPNOTSUPP;
6905         } else if (optval == 0) {
6906                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6907
6908                 soresume(current_proc(), so, 1);
6909         } else {
6910                 struct proc *p = current_proc();
6911                 int i;
6912                 struct filedesc *fdp;
6913                 int count = 0;
6914
6915                 proc_fdlock(p);
6916
6917                 fdp = p->p_fd;
6918                 for (i = 0; i < fdp->fd_nfiles; i++) {
6919                         struct fileproc *fp = fdp->fd_ofiles[i];
6920                         struct socket *so2;
6921
6922                         if (fp == NULL ||
6923                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6924                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6925                                 continue;
6926
6927                         so2 = (struct socket *)fp->f_fglob->fg_data;
6928                         if (so != so2 &&
6929                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6930                                 count++;
6931                         if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6932                                 break;
6933                 }
6934                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6935                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6936                         error = EBUSY;
6937                 } else if (so->so_flags & SOF_DELEGATED) {
6938                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6939                         error = EBUSY;
6940                 } else {
6941                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6942                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6943                 }
6944                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
6945                     "%s marked for extended bk idle\n",
6946                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6947                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6948                     SOCK_DOM(so), SOCK_TYPE(so),
6949                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6950                     "is" : "not");
6951
6952                 proc_fdunlock(p);
6953         }
6954
6955         return (error);
6956 }
6957
6958 static void
6959 so_stop_extended_bk_idle(struct socket *so)
6960 {
6961         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6962         so->so_extended_bk_start = 0;
6963
6964         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6965         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6966         /*
6967          * Force defunct
6968          */
6969         sosetdefunct(current_proc(), so,
6970             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6971         if (so->so_flags & SOF_DEFUNCT) {
6972                 sodefunct(current_proc(), so,
6973                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6974         }
6975 }
6976
6977 void
6978 so_drain_extended_bk_idle(struct socket *so)
6979 {
6980         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6981                 /*
6982                  * Only penalize sockets that have outstanding data
6983                  */
6984                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6985                         so_stop_extended_bk_idle(so);
6986
6987                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6988                 }
6989         }
6990 }
6991
6992 /*
6993  * Return values tells if socket is still in extended background idle
6994  */
6995 int
6996 so_check_extended_bk_idle_time(struct socket *so)
6997 {
6998         int ret = 1;
6999
7000         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7001                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7002                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7003                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7004                     SOCK_DOM(so), SOCK_TYPE(so));
7005                 if (net_uptime() - so->so_extended_bk_start >
7006                     soextbkidlestat.so_xbkidle_time) {
7007                         so_stop_extended_bk_idle(so);
7008
7009                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7010
7011                         ret = 0;
7012                 } else {
7013                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7014
7015                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7016                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7017                 }
7018         }
7019
7020         return (ret);
7021 }
7022
7023 void
7024 resume_proc_sockets(proc_t p)
7025 {
7026         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7027                 struct filedesc *fdp;
7028                 int i;
7029
7030                 proc_fdlock(p);
7031                 fdp = p->p_fd;
7032                 for (i = 0; i < fdp->fd_nfiles; i++) {
7033                         struct fileproc *fp;
7034                         struct socket *so;
7035
7036                         fp = fdp->fd_ofiles[i];
7037                         if (fp == NULL ||
7038                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7039                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7040                                 continue;
7041
7042                         so = (struct socket *)fp->f_fglob->fg_data;
7043                         (void) soresume(p, so, 0);
7044                 }
7045                 proc_fdunlock(p);
7046
7047                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7048         }
7049 }
7050
7051 __private_extern__ int
7052 so_set_recv_anyif(struct socket *so, int optval)
7053 {
7054         int ret = 0;
7055
7056 #if INET6
7057         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7058 #else
7059         if (SOCK_DOM(so) == PF_INET) {
7060 #endif /* !INET6 */
7061                 if (optval)
7062                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7063                 else
7064                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7065         }
7066
7067         return (ret);
7068 }
7069
7070 __private_extern__ int
7071 so_get_recv_anyif(struct socket *so)
7072 {
7073         int ret = 0;
7074
7075 #if INET6
7076         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7077 #else
7078         if (SOCK_DOM(so) == PF_INET) {
7079 #endif /* !INET6 */
7080                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7081         }
7082
7083         return (ret);
7084 }
7085
7086 int
7087 so_set_restrictions(struct socket *so, uint32_t vals)
7088 {
7089         int nocell_old, nocell_new;
7090         int noexpensive_old, noexpensive_new;
7091
7092         /*
7093          * Deny-type restrictions are trapdoors; once set they cannot be
7094          * unset for the lifetime of the socket.  This allows them to be
7095          * issued by a framework on behalf of the application without
7096          * having to worry that they can be undone.
7097          *
7098          * Note here that socket-level restrictions overrides any protocol
7099          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7100          * socket restriction issued on the socket has a higher precendence
7101          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7102          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7103          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7104          */
7105         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7106         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7107         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7108             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7109             SO_RESTRICT_DENY_EXPENSIVE));
7110         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7111         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7112
7113         /* we can only set, not clear restrictions */
7114         if ((nocell_new - nocell_old) == 0 &&
7115             (noexpensive_new - noexpensive_old) == 0)
7116                 return (0);
7117 #if INET6
7118         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7119 #else
7120         if (SOCK_DOM(so) == PF_INET) {
7121 #endif /* !INET6 */
7122                 if (nocell_new - nocell_old != 0) {
7123                         /*
7124                          * if deny cellular is now set, do what's needed
7125                          * for INPCB
7126                          */
7127                         inp_set_nocellular(sotoinpcb(so));
7128                 }
7129                 if (noexpensive_new - noexpensive_old != 0) {
7130                         inp_set_noexpensive(sotoinpcb(so));
7131                 }
7132         }
7133
7134         return (0);
7135 }
7136
7137 uint32_t
7138 so_get_restrictions(struct socket *so)
7139 {
7140         return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7141             SO_RESTRICT_DENY_OUT |
7142             SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7143 }
7144
7145 struct sockaddr_entry *
7146 sockaddrentry_alloc(int how)
7147 {
7148         struct sockaddr_entry *se;
7149
7150         se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
7151         if (se != NULL)
7152                 bzero(se, se_zone_size);
7153
7154         return (se);
7155 }
7156
7157 void
7158 sockaddrentry_free(struct sockaddr_entry *se)
7159 {
7160         if (se->se_addr != NULL) {
7161                 FREE(se->se_addr, M_SONAME);
7162                 se->se_addr = NULL;
7163         }
7164         zfree(se_zone, se);
7165 }
7166
7167 struct sockaddr_entry *
7168 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
7169 {
7170         struct sockaddr_entry *dst_se;
7171
7172         dst_se = sockaddrentry_alloc(how);
7173         if (dst_se != NULL) {
7174                 int len = src_se->se_addr->sa_len;
7175
7176                 MALLOC(dst_se->se_addr, struct sockaddr *,
7177                     len, M_SONAME, how | M_ZERO);
7178                 if (dst_se->se_addr != NULL) {
7179                         bcopy(src_se->se_addr, dst_se->se_addr, len);
7180                 } else {
7181                         sockaddrentry_free(dst_se);
7182                         dst_se = NULL;
7183                 }
7184         }
7185
7186         return (dst_se);
7187 }
7188
7189 struct sockaddr_list *
7190 sockaddrlist_alloc(int how)
7191 {
7192         struct sockaddr_list *sl;
7193
7194         sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
7195         if (sl != NULL) {
7196                 bzero(sl, sl_zone_size);
7197                 TAILQ_INIT(&sl->sl_head);
7198         }
7199         return (sl);
7200 }
7201
7202 void
7203 sockaddrlist_free(struct sockaddr_list *sl)
7204 {
7205         struct sockaddr_entry *se, *tse;
7206
7207         TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
7208                 sockaddrlist_remove(sl, se);
7209                 sockaddrentry_free(se);
7210         }
7211         VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
7212         zfree(sl_zone, sl);
7213 }
7214
7215 void
7216 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
7217 {
7218         VERIFY(!(se->se_flags & SEF_ATTACHED));
7219         se->se_flags |= SEF_ATTACHED;
7220         TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
7221         sl->sl_cnt++;
7222         VERIFY(sl->sl_cnt != 0);
7223 }
7224
7225 void
7226 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
7227 {
7228         VERIFY(se->se_flags & SEF_ATTACHED);
7229         se->se_flags &= ~SEF_ATTACHED;
7230         VERIFY(sl->sl_cnt != 0);
7231         sl->sl_cnt--;
7232         TAILQ_REMOVE(&sl->sl_head, se, se_link);
7233 }
7234
7235 struct sockaddr_list *
7236 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
7237 {
7238         struct sockaddr_entry *src_se, *tse;
7239         struct sockaddr_list *dst_sl;
7240
7241         dst_sl = sockaddrlist_alloc(how);
7242         if (dst_sl == NULL)
7243                 return (NULL);
7244
7245         TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
7246                 struct sockaddr_entry *dst_se;
7247
7248                 if (src_se->se_addr == NULL)
7249                         continue;
7250
7251                 dst_se = sockaddrentry_dup(src_se, how);
7252                 if (dst_se == NULL) {
7253                         sockaddrlist_free(dst_sl);
7254                         return (NULL);
7255                 }
7256
7257                 sockaddrlist_insert(dst_sl, dst_se);
7258         }
7259         VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
7260
7261         return (dst_sl);
7262 }
7263
7264 int
7265 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7266 {
7267         struct proc *ep = PROC_NULL;
7268         int error = 0;
7269
7270         /* pid 0 is reserved for kernel */
7271         if (epid == 0) {
7272                 error = EINVAL;
7273                 goto done;
7274         }
7275
7276         /*
7277          * If this is an in-kernel socket, prevent its delegate
7278          * association from changing unless the socket option is
7279          * coming from within the kernel itself.
7280          */
7281         if (so->last_pid == 0 && p != kernproc) {
7282                 error = EACCES;
7283                 goto done;
7284         }
7285
7286         /*
7287          * If this is issued by a process that's recorded as the
7288          * real owner of the socket, or if the pid is the same as
7289          * the process's own pid, then proceed.  Otherwise ensure
7290          * that the issuing process has the necessary privileges.
7291          */
7292         if (epid != so->last_pid || epid != proc_pid(p)) {
7293                 if ((error = priv_check_cred(kauth_cred_get(),
7294                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7295                         error = EACCES;
7296                         goto done;
7297                 }
7298         }
7299
7300         /* Find the process that corresponds to the effective pid */
7301         if ((ep = proc_find(epid)) == PROC_NULL) {
7302                 error = ESRCH;
7303                 goto done;
7304         }
7305
7306         /*
7307          * If a process tries to delegate the socket to itself, then
7308          * there's really nothing to do; treat it as a way for the
7309          * delegate association to be cleared.  Note that we check
7310          * the passed-in proc rather than calling proc_selfpid(),
7311          * as we need to check the process issuing the socket option
7312          * which could be kernproc.  Given that we don't allow 0 for
7313          * effective pid, it means that a delegated in-kernel socket
7314          * stays delegated during its lifetime (which is probably OK.)
7315          */
7316         if (epid == proc_pid(p)) {
7317                 so->so_flags &= ~SOF_DELEGATED;
7318                 so->e_upid = 0;
7319                 so->e_pid = 0;
7320                 uuid_clear(so->e_uuid);
7321         } else {
7322                 so->so_flags |= SOF_DELEGATED;
7323                 so->e_upid = proc_uniqueid(ep);
7324                 so->e_pid = proc_pid(ep);
7325                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7326         }
7327 done:
7328         if (error == 0 && net_io_policy_log) {
7329                 uuid_string_t buf;
7330
7331                 uuid_unparse(so->e_uuid, buf);
7332                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7333                     "euuid %s%s\n", __func__, proc_name_address(p),
7334                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7335                     SOCK_DOM(so), SOCK_TYPE(so),
7336                     so->e_pid, proc_name_address(ep), buf,
7337                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7338         } else if (error != 0 && net_io_policy_log) {
7339                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7340                     "ERROR (%d)\n", __func__, proc_name_address(p),
7341                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7342                     SOCK_DOM(so), SOCK_TYPE(so),
7343                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7344                     proc_name_address(ep), error);
7345         }
7346
7347         /* Update this socket's policy upon success */
7348         if (error == 0) {
7349                 so->so_policy_gencnt *= -1;
7350                 so_update_policy(so);
7351 #if NECP
7352                 so_update_necp_policy(so, NULL, NULL);
7353 #endif /* NECP */
7354         }
7355
7356         if (ep != PROC_NULL)
7357                 proc_rele(ep);
7358
7359         return (error);
7360 }
7361
7362 int
7363 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7364 {
7365         uuid_string_t buf;
7366         uuid_t uuid;
7367         int error = 0;
7368
7369         /* UUID must not be all-zeroes (reserved for kernel) */
7370         if (uuid_is_null(euuid)) {
7371                 error = EINVAL;
7372                 goto done;
7373         }
7374
7375         /*
7376          * If this is an in-kernel socket, prevent its delegate
7377          * association from changing unless the socket option is
7378          * coming from within the kernel itself.
7379          */
7380         if (so->last_pid == 0 && p != kernproc) {
7381                 error = EACCES;
7382                 goto done;
7383         }
7384
7385         /* Get the UUID of the issuing process */
7386         proc_getexecutableuuid(p, uuid, sizeof (uuid));
7387
7388         /*
7389          * If this is issued by a process that's recorded as the
7390          * real owner of the socket, or if the uuid is the same as
7391          * the process's own uuid, then proceed.  Otherwise ensure
7392          * that the issuing process has the necessary privileges.
7393          */
7394         if (uuid_compare(euuid, so->last_uuid) != 0 ||
7395             uuid_compare(euuid, uuid) != 0) {
7396                 if ((error = priv_check_cred(kauth_cred_get(),
7397                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7398                         error = EACCES;
7399                         goto done;
7400                 }
7401         }
7402
7403         /*
7404          * If a process tries to delegate the socket to itself, then
7405          * there's really nothing to do; treat it as a way for the
7406          * delegate association to be cleared.  Note that we check
7407          * the uuid of the passed-in proc rather than that of the
7408          * current process, as we need to check the process issuing
7409          * the socket option which could be kernproc itself.  Given
7410          * that we don't allow 0 for effective uuid, it means that
7411          * a delegated in-kernel socket stays delegated during its
7412          * lifetime (which is okay.)
7413          */
7414         if (uuid_compare(euuid, uuid) == 0) {
7415                 so->so_flags &= ~SOF_DELEGATED;
7416                 so->e_upid = 0;
7417                 so->e_pid = 0;
7418                 uuid_clear(so->e_uuid);
7419         } else {
7420                 so->so_flags |= SOF_DELEGATED;
7421                 /*
7422                  * Unlike so_set_effective_pid(), we only have the UUID
7423                  * here and the process ID is not known.  Inherit the
7424                  * real {pid,upid} of the socket.
7425                  */
7426                 so->e_upid = so->last_upid;
7427                 so->e_pid = so->last_pid;
7428                 uuid_copy(so->e_uuid, euuid);
7429         }
7430
7431 done:
7432         if (error == 0 && net_io_policy_log) {
7433                 uuid_unparse(so->e_uuid, buf);
7434                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7435                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7436                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7437                     SOCK_TYPE(so), so->e_pid, buf,
7438                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7439         } else if (error != 0 && net_io_policy_log) {
7440                 uuid_unparse(euuid, buf);
7441                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7442                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7443                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7444                     SOCK_TYPE(so), buf, error);
7445         }
7446
7447         /* Update this socket's policy upon success */
7448         if (error == 0) {
7449                 so->so_policy_gencnt *= -1;
7450                 so_update_policy(so);
7451 #if NECP
7452                 so_update_necp_policy(so, NULL, NULL);
7453 #endif /* NECP */
7454         }
7455
7456         return (error);
7457 }
7458
7459 void
7460 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7461     uint32_t ev_datalen)
7462 {
7463         struct kev_msg ev_msg;
7464
7465         /*
7466          * A netpolicy event always starts with a netpolicy_event_data
7467          * structure, but the caller can provide for a longer event
7468          * structure to post, depending on the event code.
7469          */
7470         VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7471
7472         bzero(&ev_msg, sizeof (ev_msg));
7473         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7474         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7475         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7476         ev_msg.event_code       = ev_code;
7477
7478         ev_msg.dv[0].data_ptr   = ev_data;
7479         ev_msg.dv[0].data_length = ev_datalen;
7480
7481         kev_post_msg(&ev_msg);
7482 }
7483
7484 void
7485 socket_post_kev_msg(uint32_t ev_code,
7486     struct kev_socket_event_data *ev_data,
7487     uint32_t ev_datalen)
7488 {
7489         struct kev_msg ev_msg;
7490
7491         bzero(&ev_msg, sizeof(ev_msg));
7492         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7493         ev_msg.kev_class = KEV_NETWORK_CLASS;
7494         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7495         ev_msg.event_code = ev_code;
7496
7497         ev_msg.dv[0].data_ptr = ev_data;
7498         ev_msg.dv[0]. data_length = ev_datalen;
7499
7500         kev_post_msg(&ev_msg);
7501 }
7502
7503 void
7504 socket_post_kev_msg_closed(struct socket *so)
7505 {
7506         struct kev_socket_closed ev;
7507         struct sockaddr *socksa = NULL, *peersa = NULL;
7508         int err;
7509         bzero(&ev, sizeof(ev));
7510         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7511         if (err == 0) {
7512                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7513                     &peersa);
7514                 if (err == 0) {
7515                         memcpy(&ev.ev_data.kev_sockname, socksa,
7516                             min(socksa->sa_len,
7517                             sizeof (ev.ev_data.kev_sockname)));
7518                         memcpy(&ev.ev_data.kev_peername, peersa,
7519                             min(peersa->sa_len,
7520                             sizeof (ev.ev_data.kev_peername)));
7521                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
7522                             &ev.ev_data, sizeof (ev));
7523                 }
7524         }
7525         if (socksa != NULL)
7526                 FREE(socksa, M_SONAME);
7527         if (peersa != NULL)
7528                 FREE(peersa, M_SONAME);
7529 }