bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/net_api_stats.h>
 102 #include <net/ntstat.h>
 103 #include <net/content_filter.h>
 104 #include <netinet/in.h>
 105 #include <netinet/in_pcb.h>
 106 #include <netinet/in_tclass.h>
 107 #include <netinet/tcp_var.h>
 108 #include <netinet/ip6.h>
 109 #include <netinet6/ip6_var.h>
 110 #include <netinet/flow_divert.h>
 111 #include <kern/zalloc.h>
 112 #include <kern/locks.h>
 113 #include <machine/limits.h>
 114 #include <libkern/OSAtomic.h>
 115 #include <pexpert/pexpert.h>
 116 #include <kern/assert.h>
 117 #include <kern/task.h>
 118 #include <kern/policy_internal.h>
 119
 120 #include <sys/kpi_mbuf.h>
 121 #include <sys/mcache.h>
 122 #include <sys/unpcb.h>
 123 #include <libkern/section_keywords.h>
 124
 125 #if CONFIG_MACF
 126 #include <security/mac_framework.h>
 127 #endif /* MAC */
 128
 129 #if MULTIPATH
 130 #include <netinet/mp_pcb.h>
 131 #include <netinet/mptcp_var.h>
 132 #endif /* MULTIPATH */
 133
 134 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 135
 136 #if DEBUG || DEVELOPMENT
 137 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 138 #else
 139 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 140 #endif
 141
 142 /* TODO: this should be in a header file somewhere */
 143 extern char *proc_name_address(void *p);
 144
 145 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 146 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 147 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 148 static u_int32_t        cached_sock_count = 0;
 149 STAILQ_HEAD(, socket)   so_cache_head;
 150 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 151 static u_int32_t        so_cache_time;
 152 static int              socketinit_done;
 153 static struct zone      *so_cache_zone;
 154
 155 static lck_grp_t        *so_cache_mtx_grp;
 156 static lck_attr_t       *so_cache_mtx_attr;
 157 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 158 static lck_mtx_t        *so_cache_mtx;
 159
 160 #include <machine/limits.h>
 161
 162 static int      filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
 163 static void     filt_sordetach(struct knote *kn);
 164 static int      filt_soread(struct knote *kn, long hint);
 165 static int      filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
 166 static int      filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 167
 168 static int      filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
 169 static void     filt_sowdetach(struct knote *kn);
 170 static int      filt_sowrite(struct knote *kn, long hint);
 171 static int      filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
 172 static int      filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 173
 174 static int      filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
 175 static void     filt_sockdetach(struct knote *kn);
 176 static int      filt_sockev(struct knote *kn, long hint);
 177 static int      filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
 178 static int      filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 179
 180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 182
 183 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
 184         .f_isfd = 1,
 185         .f_attach = filt_sorattach,
 186         .f_detach = filt_sordetach,
 187         .f_event = filt_soread,
 188         .f_touch = filt_sortouch,
 189         .f_process = filt_sorprocess,
 190 };
 191
 192 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
 193         .f_isfd = 1,
 194         .f_attach = filt_sowattach,
 195         .f_detach = filt_sowdetach,
 196         .f_event = filt_sowrite,
 197         .f_touch = filt_sowtouch,
 198         .f_process = filt_sowprocess,
 199 };
 200
 201 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
 202         .f_isfd = 1,
 203         .f_attach = filt_sockattach,
 204         .f_detach = filt_sockdetach,
 205         .f_event = filt_sockev,
 206         .f_touch = filt_socktouch,
 207         .f_process = filt_sockprocess,
 208 };
 209
 210 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
 211         .f_isfd = 1,
 212         .f_attach = filt_sorattach,
 213         .f_detach = filt_sordetach,
 214         .f_event = filt_soread,
 215         .f_touch = filt_sortouch,
 216         .f_process = filt_sorprocess,
 217 };
 218
 219 SYSCTL_DECL(_kern_ipc);
 220
 221 #define EVEN_MORE_LOCKING_DEBUG 0
 222
 223 int socket_debug = 0;
 224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 225     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 226
 227 static unsigned long sodefunct_calls = 0;
 228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 229     &sodefunct_calls, "");
 230
 231 static int socket_zone = M_SOCKET;
 232 so_gen_t        so_gencnt;      /* generation count for sockets */
 233
 234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 236
 237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 246
 247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 248
 249 int somaxconn = SOMAXCONN;
 250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 251     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 252
 253 /* Should we get a maximum also ??? */
 254 static int sosendmaxchain = 65536;
 255 static int sosendminchain = 16384;
 256 static int sorecvmincopy  = 16384;
 257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 258     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 260     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 261
 262 /*
 263  * Set to enable jumbo clusters (if available) for large writes when
 264  * the socket is marked with SOF_MULTIPAGES; see below.
 265  */
 266 int sosendjcl = 1;
 267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 268     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 269
 270 /*
 271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 272  * writes on the socket for all protocols on any network interfaces,
 273  * depending upon sosendjcl above.  Be extra careful when setting this
 274  * to 1, because sending down packets that cross physical pages down to
 275  * broken drivers (those that falsely assume that the physical pages
 276  * are contiguous) might lead to system panics or silent data corruption.
 277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 279  * capable.  Set this to 1 only for testing/debugging purposes.
 280  */
 281 int sosendjcl_ignore_capab = 0;
 282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 283     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 284
 285 /*
 286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 287  * writes on the socket for all protocols on any network interfaces.
 288  * Be extra careful when setting this to 1, because sending down packets with
 289  * clusters larger that 2 KB might lead to system panics or data corruption.
 290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 291  * on the outgoing interface
 292  * Set this to 1  for testing/debugging purposes only.
 293  */
 294 int sosendbigcl_ignore_capab = 0;
 295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 296     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 297
 298 int sodefunctlog = 0;
 299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 300     &sodefunctlog, 0, "");
 301
 302 int sothrottlelog = 0;
 303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 304     &sothrottlelog, 0, "");
 305
 306 int sorestrictrecv = 1;
 307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 308     &sorestrictrecv, 0, "Enable inbound interface restrictions");
 309
 310 int sorestrictsend = 1;
 311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 312     &sorestrictsend, 0, "Enable outbound interface restrictions");
 313
 314 int soreserveheadroom = 1;
 315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 316     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 317
 318 #if (DEBUG || DEVELOPMENT)
 319 int so_notsent_lowat_check = 1;
 320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
 321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 322 #endif /* DEBUG || DEVELOPMENT */
 323
 324 int so_accept_list_waits = 0;
 325 #if (DEBUG || DEVELOPMENT)
 326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
 327     &so_accept_list_waits, 0, "number of waits for listener incomp list");
 328 #endif /* DEBUG || DEVELOPMENT */
 329
 330 extern struct inpcbinfo tcbinfo;
 331
 332 /* TODO: these should be in header file */
 333 extern int get_inpcb_str_size(void);
 334 extern int get_tcp_str_size(void);
 335
 336 vm_size_t       so_cache_zone_element_size;
 337
 338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 339     user_ssize_t *);
 340 static void cached_sock_alloc(struct socket **, int);
 341 static void cached_sock_free(struct socket *);
 342
 343 /*
 344  * Maximum of extended background idle sockets per process
 345  * Set to zero to disable further setting of the option
 346  */
 347
 348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 349 #define SO_IDLE_BK_IDLE_TIME            600
 350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 351
 352 struct soextbkidlestat soextbkidlestat;
 353
 354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 355     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 356     "Maximum of extended background idle sockets per process");
 357
 358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 359     &soextbkidlestat.so_xbkidle_time, 0,
 360     "Time in seconds to keep extended background idle sockets");
 361
 362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 363     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 364     "High water mark for extended background idle sockets");
 365
 366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 367     &soextbkidlestat, soextbkidlestat, "");
 368
 369 int so_set_extended_bk_idle(struct socket *, int);
 370
 371
 372 /*
 373  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 374  * setting the DSCP code on the packet based on the service class; see
 375  * <rdar://problem/11277343> for details.
 376  */
 377 __private_extern__ u_int32_t sotcdb = 0;
 378 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 379     &sotcdb, 0, "");
 380
 381 void
 382 socketinit(void)
 383 {
 384         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 385         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 386
 387 #ifdef __LP64__
 388         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 389         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 393         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 394 #else
 395         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 396         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 400         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 401 #endif
 402
 403         if (socketinit_done) {
 404                 printf("socketinit: already called...\n");
 405                 return;
 406         }
 407         socketinit_done = 1;
 408
 409         PE_parse_boot_argn("socket_debug", &socket_debug,
 410             sizeof(socket_debug));
 411
 412         /*
 413          * allocate lock group attribute and group for socket cache mutex
 414          */
 415         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 416         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 417             so_cache_mtx_grp_attr);
 418
 419         /*
 420          * allocate the lock attribute for socket cache mutex
 421          */
 422         so_cache_mtx_attr = lck_attr_alloc_init();
 423
 424         /* cached sockets mutex */
 425         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 426         if (so_cache_mtx == NULL) {
 427                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 428                 /* NOTREACHED */
 429         }
 430         STAILQ_INIT(&so_cache_head);
 431
 432         so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
 433             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 434
 435         so_cache_zone = zinit(so_cache_zone_element_size,
 436             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 437         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 438         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 439
 440         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 441         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 442         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 443         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 444
 445         in_pcbinit();
 446         sflt_init();
 447         socket_tclass_init();
 448 #if MULTIPATH
 449         mp_pcbinit();
 450 #endif /* MULTIPATH */
 451 }
 452
 453 static void
 454 cached_sock_alloc(struct socket **so, int waitok)
 455 {
 456         caddr_t temp;
 457         uintptr_t offset;
 458
 459         lck_mtx_lock(so_cache_mtx);
 460
 461         if (!STAILQ_EMPTY(&so_cache_head)) {
 462                 VERIFY(cached_sock_count > 0);
 463
 464                 *so = STAILQ_FIRST(&so_cache_head);
 465                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 466                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 467
 468                 cached_sock_count--;
 469                 lck_mtx_unlock(so_cache_mtx);
 470
 471                 temp = (*so)->so_saved_pcb;
 472                 bzero((caddr_t)*so, sizeof(struct socket));
 473
 474                 (*so)->so_saved_pcb = temp;
 475         } else {
 476                 lck_mtx_unlock(so_cache_mtx);
 477
 478                 if (waitok) {
 479                         *so = (struct socket *)zalloc(so_cache_zone);
 480                 } else {
 481                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 482                 }
 483
 484                 if (*so == NULL) {
 485                         return;
 486                 }
 487
 488                 bzero((caddr_t)*so, sizeof(struct socket));
 489
 490                 /*
 491                  * Define offsets for extra structures into our
 492                  * single block of memory. Align extra structures
 493                  * on longword boundaries.
 494                  */
 495
 496                 offset = (uintptr_t)*so;
 497                 offset += sizeof(struct socket);
 498
 499                 offset = ALIGN(offset);
 500
 501                 (*so)->so_saved_pcb = (caddr_t)offset;
 502                 offset += get_inpcb_str_size();
 503
 504                 offset = ALIGN(offset);
 505
 506                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 507                     (caddr_t)offset;
 508         }
 509
 510         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 511 }
 512
 513 static void
 514 cached_sock_free(struct socket *so)
 515 {
 516         lck_mtx_lock(so_cache_mtx);
 517
 518         so_cache_time = net_uptime();
 519         if (++cached_sock_count > max_cached_sock_count) {
 520                 --cached_sock_count;
 521                 lck_mtx_unlock(so_cache_mtx);
 522                 zfree(so_cache_zone, so);
 523         } else {
 524                 if (so_cache_hw < cached_sock_count) {
 525                         so_cache_hw = cached_sock_count;
 526                 }
 527
 528                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 529
 530                 so->cache_timestamp = so_cache_time;
 531                 lck_mtx_unlock(so_cache_mtx);
 532         }
 533 }
 534
 535 void
 536 so_update_last_owner_locked(struct socket *so, proc_t self)
 537 {
 538         if (so->last_pid != 0) {
 539                 /*
 540                  * last_pid and last_upid should remain zero for sockets
 541                  * created using sock_socket. The check above achieves that
 542                  */
 543                 if (self == PROC_NULL) {
 544                         self = current_proc();
 545                 }
 546
 547                 if (so->last_upid != proc_uniqueid(self) ||
 548                     so->last_pid != proc_pid(self)) {
 549                         so->last_upid = proc_uniqueid(self);
 550                         so->last_pid = proc_pid(self);
 551                         proc_getexecutableuuid(self, so->last_uuid,
 552                             sizeof(so->last_uuid));
 553                 }
 554                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 555         }
 556 }
 557
 558 void
 559 so_update_policy(struct socket *so)
 560 {
 561         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 562                 (void) inp_update_policy(sotoinpcb(so));
 563         }
 564 }
 565
 566 #if NECP
 567 static void
 568 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 569     struct sockaddr *override_remote_addr)
 570 {
 571         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 572                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 573                     override_remote_addr, 0);
 574         }
 575 }
 576 #endif /* NECP */
 577
 578 boolean_t
 579 so_cache_timer(void)
 580 {
 581         struct socket   *p;
 582         int             n_freed = 0;
 583         boolean_t rc = FALSE;
 584
 585         lck_mtx_lock(so_cache_mtx);
 586         so_cache_timeouts++;
 587         so_cache_time = net_uptime();
 588
 589         while (!STAILQ_EMPTY(&so_cache_head)) {
 590                 VERIFY(cached_sock_count > 0);
 591                 p = STAILQ_FIRST(&so_cache_head);
 592                 if ((so_cache_time - p->cache_timestamp) <
 593                     SO_CACHE_TIME_LIMIT) {
 594                         break;
 595                 }
 596
 597                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 598                 --cached_sock_count;
 599
 600                 zfree(so_cache_zone, p);
 601
 602                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 603                         so_cache_max_freed++;
 604                         break;
 605                 }
 606         }
 607
 608         /* Schedule again if there is more to cleanup */
 609         if (!STAILQ_EMPTY(&so_cache_head)) {
 610                 rc = TRUE;
 611         }
 612
 613         lck_mtx_unlock(so_cache_mtx);
 614         return rc;
 615 }
 616
 617 /*
 618  * Get a socket structure from our zone, and initialize it.
 619  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 620  * Note that it would probably be better to allocate socket
 621  * and PCB at the same time, but I'm not convinced that all
 622  * the protocols can be easily modified to do this.
 623  */
 624 struct socket *
 625 soalloc(int waitok, int dom, int type)
 626 {
 627         struct socket *so;
 628
 629         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 630                 cached_sock_alloc(&so, waitok);
 631         } else {
 632                 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
 633                     M_WAITOK);
 634                 if (so != NULL) {
 635                         bzero(so, sizeof(*so));
 636                 }
 637         }
 638         if (so != NULL) {
 639                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 640                 so->so_zone = socket_zone;
 641
 642                 /*
 643                  * Increment the socket allocation statistics
 644                  */
 645                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
 646
 647 #if CONFIG_MACF_SOCKET
 648                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 649                 if (mac_socket_label_init(so, !waitok) != 0) {
 650                         sodealloc(so);
 651                         return NULL;
 652                 }
 653 #endif /* MAC_SOCKET */
 654         }
 655
 656         return so;
 657 }
 658
 659 int
 660 socreate_internal(int dom, struct socket **aso, int type, int proto,
 661     struct proc *p, uint32_t flags, struct proc *ep)
 662 {
 663         struct protosw *prp;
 664         struct socket *so;
 665         int error = 0;
 666
 667 #if TCPDEBUG
 668         extern int tcpconsdebug;
 669 #endif
 670
 671         VERIFY(aso != NULL);
 672         *aso = NULL;
 673
 674         if (proto != 0) {
 675                 prp = pffindproto(dom, proto, type);
 676         } else {
 677                 prp = pffindtype(dom, type);
 678         }
 679
 680         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 681                 if (pffinddomain(dom) == NULL) {
 682                         return EAFNOSUPPORT;
 683                 }
 684                 if (proto != 0) {
 685                         if (pffindprotonotype(dom, proto) != NULL) {
 686                                 return EPROTOTYPE;
 687                         }
 688                 }
 689                 return EPROTONOSUPPORT;
 690         }
 691         if (prp->pr_type != type) {
 692                 return EPROTOTYPE;
 693         }
 694         so = soalloc(1, dom, type);
 695         if (so == NULL) {
 696                 return ENOBUFS;
 697         }
 698
 699         switch (dom) {
 700         case PF_LOCAL:
 701                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
 702                 break;
 703         case PF_INET:
 704                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
 705                 if (type == SOCK_STREAM) {
 706                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
 707                 } else {
 708                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
 709                 }
 710                 break;
 711         case PF_ROUTE:
 712                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
 713                 break;
 714         case PF_NDRV:
 715                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
 716                 break;
 717         case PF_KEY:
 718                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
 719                 break;
 720         case PF_INET6:
 721                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
 722                 if (type == SOCK_STREAM) {
 723                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
 724                 } else {
 725                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
 726                 }
 727                 break;
 728         case PF_SYSTEM:
 729                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
 730                 break;
 731         case PF_MULTIPATH:
 732                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
 733                 break;
 734         default:
 735                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
 736                 break;
 737         }
 738
 739         if (flags & SOCF_ASYNC) {
 740                 so->so_state |= SS_NBIO;
 741         }
 742
 743         TAILQ_INIT(&so->so_incomp);
 744         TAILQ_INIT(&so->so_comp);
 745         so->so_type = type;
 746         so->last_upid = proc_uniqueid(p);
 747         so->last_pid = proc_pid(p);
 748         proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
 749         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 750
 751         if (ep != PROC_NULL && ep != p) {
 752                 so->e_upid = proc_uniqueid(ep);
 753                 so->e_pid = proc_pid(ep);
 754                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
 755                 so->so_flags |= SOF_DELEGATED;
 756         }
 757
 758         so->so_cred = kauth_cred_proc_ref(p);
 759         if (!suser(kauth_cred_get(), NULL)) {
 760                 so->so_state |= SS_PRIV;
 761         }
 762
 763         so->so_proto = prp;
 764         so->so_rcv.sb_flags |= SB_RECV;
 765         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 766         so->next_lock_lr = 0;
 767         so->next_unlock_lr = 0;
 768
 769 #if CONFIG_MACF_SOCKET
 770         mac_socket_label_associate(kauth_cred_get(), so);
 771 #endif /* MAC_SOCKET */
 772
 773         /*
 774          * Attachment will create the per pcb lock if necessary and
 775          * increase refcount for creation, make sure it's done before
 776          * socket is inserted in lists.
 777          */
 778         so->so_usecount++;
 779
 780         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 781         if (error != 0) {
 782                 /*
 783                  * Warning:
 784                  * If so_pcb is not zero, the socket will be leaked,
 785                  * so protocol attachment handler must be coded carefuly
 786                  */
 787                 so->so_state |= SS_NOFDREF;
 788                 VERIFY(so->so_usecount > 0);
 789                 so->so_usecount--;
 790                 sofreelastref(so, 1);   /* will deallocate the socket */
 791                 return error;
 792         }
 793
 794         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 795         TAILQ_INIT(&so->so_evlist);
 796
 797         /* Attach socket filters for this protocol */
 798         sflt_initsock(so);
 799 #if TCPDEBUG
 800         if (tcpconsdebug == 2) {
 801                 so->so_options |= SO_DEBUG;
 802         }
 803 #endif
 804         so_set_default_traffic_class(so);
 805
 806         /*
 807          * If this thread or task is marked to create backgrounded sockets,
 808          * mark the socket as background.
 809          */
 810         if (proc_get_effective_thread_policy(current_thread(),
 811             TASK_POLICY_NEW_SOCKETS_BG)) {
 812                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 813                 so->so_background_thread = current_thread();
 814         }
 815
 816         switch (dom) {
 817         /*
 818          * Don't mark Unix domain, system or multipath sockets as
 819          * eligible for defunct by default.
 820          */
 821         case PF_LOCAL:
 822         case PF_SYSTEM:
 823         case PF_MULTIPATH:
 824                 so->so_flags |= SOF_NODEFUNCT;
 825                 break;
 826         default:
 827                 break;
 828         }
 829
 830         /*
 831          * Entitlements can't be checked at socket creation time except if the
 832          * application requested a feature guarded by a privilege (c.f., socket
 833          * delegation).
 834          * The priv(9) and the Sandboxing APIs are designed with the idea that
 835          * a privilege check should only be triggered by a userland request.
 836          * A privilege check at socket creation time is time consuming and
 837          * could trigger many authorisation error messages from the security
 838          * APIs.
 839          */
 840
 841         *aso = so;
 842
 843         return 0;
 844 }
 845
 846 /*
 847  * Returns:     0                       Success
 848  *              EAFNOSUPPORT
 849  *              EPROTOTYPE
 850  *              EPROTONOSUPPORT
 851  *              ENOBUFS
 852  *      <pru_attach>:ENOBUFS[AF_UNIX]
 853  *      <pru_attach>:ENOBUFS[TCP]
 854  *      <pru_attach>:ENOMEM[TCP]
 855  *      <pru_attach>:???                [other protocol families, IPSEC]
 856  */
 857 int
 858 socreate(int dom, struct socket **aso, int type, int proto)
 859 {
 860         return socreate_internal(dom, aso, type, proto, current_proc(), 0,
 861                    PROC_NULL);
 862 }
 863
 864 int
 865 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 866 {
 867         int error = 0;
 868         struct proc *ep = PROC_NULL;
 869
 870         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 871                 error = ESRCH;
 872                 goto done;
 873         }
 874
 875         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 876
 877         /*
 878          * It might not be wise to hold the proc reference when calling
 879          * socreate_internal since it calls soalloc with M_WAITOK
 880          */
 881 done:
 882         if (ep != PROC_NULL) {
 883                 proc_rele(ep);
 884         }
 885
 886         return error;
 887 }
 888
 889 /*
 890  * Returns:     0                       Success
 891  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 892  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 893  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 894  *      <pru_bind>:EINVAL               Invalid argument
 895  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 896  *      <pru_bind>:EACCES               Permission denied
 897  *      <pru_bind>:EADDRINUSE           Address in use
 898  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 899  *      <pru_bind>:EPERM                Operation not permitted
 900  *      <pru_bind>:???
 901  *      <sf_bind>:???
 902  *
 903  * Notes:       It's not possible to fully enumerate the return codes above,
 904  *              since socket filter authors and protocol family authors may
 905  *              not choose to limit their error returns to those listed, even
 906  *              though this may result in some software operating incorrectly.
 907  *
 908  *              The error codes which are enumerated above are those known to
 909  *              be returned by the tcp_usr_bind function supplied.
 910  */
 911 int
 912 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 913 {
 914         struct proc *p = current_proc();
 915         int error = 0;
 916
 917         if (dolock) {
 918                 socket_lock(so, 1);
 919         }
 920
 921         so_update_last_owner_locked(so, p);
 922         so_update_policy(so);
 923
 924 #if NECP
 925         so_update_necp_policy(so, nam, NULL);
 926 #endif /* NECP */
 927
 928         /*
 929          * If this is a bind request on a socket that has been marked
 930          * as inactive, reject it now before we go any further.
 931          */
 932         if (so->so_flags & SOF_DEFUNCT) {
 933                 error = EINVAL;
 934                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 935                     __func__, proc_pid(p), proc_best_name(p),
 936                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 937                     SOCK_DOM(so), SOCK_TYPE(so), error);
 938                 goto out;
 939         }
 940
 941         /* Socket filter */
 942         error = sflt_bind(so, nam);
 943
 944         if (error == 0) {
 945                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 946         }
 947 out:
 948         if (dolock) {
 949                 socket_unlock(so, 1);
 950         }
 951
 952         if (error == EJUSTRETURN) {
 953                 error = 0;
 954         }
 955
 956         return error;
 957 }
 958
 959 void
 960 sodealloc(struct socket *so)
 961 {
 962         kauth_cred_unref(&so->so_cred);
 963
 964         /* Remove any filters */
 965         sflt_termsock(so);
 966
 967 #if CONTENT_FILTER
 968         cfil_sock_detach(so);
 969 #endif /* CONTENT_FILTER */
 970
 971         /* Delete the state allocated for msg queues on a socket */
 972         if (so->so_flags & SOF_ENABLE_MSGS) {
 973                 FREE(so->so_msg_state, M_TEMP);
 974                 so->so_msg_state = NULL;
 975         }
 976         VERIFY(so->so_msg_state == NULL);
 977
 978         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 979
 980 #if CONFIG_MACF_SOCKET
 981         mac_socket_label_destroy(so);
 982 #endif /* MAC_SOCKET */
 983
 984         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 985                 cached_sock_free(so);
 986         } else {
 987                 FREE_ZONE(so, sizeof(*so), so->so_zone);
 988         }
 989 }
 990
 991 /*
 992  * Returns:     0                       Success
 993  *              EINVAL
 994  *              EOPNOTSUPP
 995  *      <pru_listen>:EINVAL[AF_UNIX]
 996  *      <pru_listen>:EINVAL[TCP]
 997  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 998  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 999  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
1000  *      <pru_listen>:EACCES[TCP]        Permission denied
1001  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
1002  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
1003  *      <pru_listen>:EPERM[TCP]         Operation not permitted
1004  *      <sf_listen>:???
1005  *
1006  * Notes:       Other <pru_listen> returns depend on the protocol family; all
1007  *              <sf_listen> returns depend on what the filter author causes
1008  *              their filter to return.
1009  */
1010 int
1011 solisten(struct socket *so, int backlog)
1012 {
1013         struct proc *p = current_proc();
1014         int error = 0;
1015
1016         socket_lock(so, 1);
1017
1018         so_update_last_owner_locked(so, p);
1019         so_update_policy(so);
1020
1021 #if NECP
1022         so_update_necp_policy(so, NULL, NULL);
1023 #endif /* NECP */
1024
1025         if (so->so_proto == NULL) {
1026                 error = EINVAL;
1027                 goto out;
1028         }
1029         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1030                 error = EOPNOTSUPP;
1031                 goto out;
1032         }
1033
1034         /*
1035          * If the listen request is made on a socket that is not fully
1036          * disconnected, or on a socket that has been marked as inactive,
1037          * reject the request now.
1038          */
1039         if ((so->so_state &
1040             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1041             (so->so_flags & SOF_DEFUNCT)) {
1042                 error = EINVAL;
1043                 if (so->so_flags & SOF_DEFUNCT) {
1044                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1045                             "(%d)\n", __func__, proc_pid(p),
1046                             proc_best_name(p),
1047                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1048                             SOCK_DOM(so), SOCK_TYPE(so), error);
1049                 }
1050                 goto out;
1051         }
1052
1053         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1054                 error = EPERM;
1055                 goto out;
1056         }
1057
1058         error = sflt_listen(so);
1059         if (error == 0) {
1060                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1061         }
1062
1063         if (error) {
1064                 if (error == EJUSTRETURN) {
1065                         error = 0;
1066                 }
1067                 goto out;
1068         }
1069
1070         if (TAILQ_EMPTY(&so->so_comp)) {
1071                 so->so_options |= SO_ACCEPTCONN;
1072         }
1073         /*
1074          * POSIX: The implementation may have an upper limit on the length of
1075          * the listen queue-either global or per accepting socket. If backlog
1076          * exceeds this limit, the length of the listen queue is set to the
1077          * limit.
1078          *
1079          * If listen() is called with a backlog argument value that is less
1080          * than 0, the function behaves as if it had been called with a backlog
1081          * argument value of 0.
1082          *
1083          * A backlog argument of 0 may allow the socket to accept connections,
1084          * in which case the length of the listen queue may be set to an
1085          * implementation-defined minimum value.
1086          */
1087         if (backlog <= 0 || backlog > somaxconn) {
1088                 backlog = somaxconn;
1089         }
1090
1091         so->so_qlimit = backlog;
1092 out:
1093         socket_unlock(so, 1);
1094         return error;
1095 }
1096
1097 /*
1098  * The "accept list lock" protects the fields related to the listener queues
1099  * because we can unlock a socket to respect the lock ordering between
1100  * the listener socket and its clients sockets. The lock ordering is first to
1101  * acquire the client socket before the listener socket.
1102  *
1103  * The accept list lock serializes access to the following fields:
1104  * - of the listener socket:
1105  *   - so_comp
1106  *   - so_incomp
1107  *   - so_qlen
1108  *   - so_inqlen
1109  * - of client sockets that are in so_comp or so_incomp:
1110  *   - so_head
1111  *   - so_list
1112  *
1113  * As one can see the accept list lock protects the consistent of the
1114  * linkage of the client sockets.
1115  *
1116  * Note that those fields may be read without holding the accept list lock
1117  * for a preflight provided the accept list lock is taken when committing
1118  * to take an action based on the result of the preflight. The preflight
1119  * saves the cost of doing the unlock/lock dance.
1120  */
1121 void
1122 so_acquire_accept_list(struct socket *head, struct socket *so)
1123 {
1124         lck_mtx_t *mutex_held;
1125
1126         if (head->so_proto->pr_getlock == NULL) {
1127                 return;
1128         }
1129         mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1130         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1131
1132         if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1133                 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1134                 return;
1135         }
1136         if (so != NULL) {
1137                 socket_unlock(so, 0);
1138         }
1139         while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1140                 so_accept_list_waits += 1;
1141                 msleep((caddr_t)&head->so_incomp, mutex_held,
1142                     PSOCK | PCATCH, __func__, NULL);
1143         }
1144         head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1145         if (so != NULL) {
1146                 socket_unlock(head, 0);
1147                 socket_lock(so, 0);
1148                 socket_lock(head, 0);
1149         }
1150 }
1151
1152 void
1153 so_release_accept_list(struct socket *head)
1154 {
1155         if (head->so_proto->pr_getlock != NULL) {
1156                 lck_mtx_t *mutex_held;
1157
1158                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1159                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1160
1161                 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1162                 wakeup((caddr_t)&head->so_incomp);
1163         }
1164 }
1165
1166 void
1167 sofreelastref(struct socket *so, int dealloc)
1168 {
1169         struct socket *head = so->so_head;
1170
1171         /* Assume socket is locked */
1172
1173         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1174                 selthreadclear(&so->so_snd.sb_sel);
1175                 selthreadclear(&so->so_rcv.sb_sel);
1176                 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1177                 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1178                 so->so_event = sonullevent;
1179                 return;
1180         }
1181         if (head != NULL) {
1182                 /*
1183                  * Need to lock the listener when the protocol has
1184                  * per socket locks
1185                  */
1186                 if (head->so_proto->pr_getlock != NULL) {
1187                         socket_lock(head, 1);
1188                         so_acquire_accept_list(head, so);
1189                 }
1190                 if (so->so_state & SS_INCOMP) {
1191                         so->so_state &= ~SS_INCOMP;
1192                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1193                         head->so_incqlen--;
1194                         head->so_qlen--;
1195                         so->so_head = NULL;
1196
1197                         if (head->so_proto->pr_getlock != NULL) {
1198                                 so_release_accept_list(head);
1199                                 socket_unlock(head, 1);
1200                         }
1201                 } else if (so->so_state & SS_COMP) {
1202                         if (head->so_proto->pr_getlock != NULL) {
1203                                 so_release_accept_list(head);
1204                                 socket_unlock(head, 1);
1205                         }
1206                         /*
1207                          * We must not decommission a socket that's
1208                          * on the accept(2) queue.  If we do, then
1209                          * accept(2) may hang after select(2) indicated
1210                          * that the listening socket was ready.
1211                          */
1212                         selthreadclear(&so->so_snd.sb_sel);
1213                         selthreadclear(&so->so_rcv.sb_sel);
1214                         so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1215                         so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1216                         so->so_event = sonullevent;
1217                         return;
1218                 } else {
1219                         if (head->so_proto->pr_getlock != NULL) {
1220                                 so_release_accept_list(head);
1221                                 socket_unlock(head, 1);
1222                         }
1223                         printf("sofree: not queued\n");
1224                 }
1225         }
1226         sowflush(so);
1227         sorflush(so);
1228
1229 #if FLOW_DIVERT
1230         if (so->so_flags & SOF_FLOW_DIVERT) {
1231                 flow_divert_detach(so);
1232         }
1233 #endif  /* FLOW_DIVERT */
1234
1235         /* 3932268: disable upcall */
1236         so->so_rcv.sb_flags &= ~SB_UPCALL;
1237         so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1238         so->so_event = sonullevent;
1239
1240         if (dealloc) {
1241                 sodealloc(so);
1242         }
1243 }
1244
1245 void
1246 soclose_wait_locked(struct socket *so)
1247 {
1248         lck_mtx_t *mutex_held;
1249
1250         if (so->so_proto->pr_getlock != NULL) {
1251                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1252         } else {
1253                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1254         }
1255         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1256
1257         /*
1258          * Double check here and return if there's no outstanding upcall;
1259          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1260          */
1261         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1262                 return;
1263         }
1264         so->so_rcv.sb_flags &= ~SB_UPCALL;
1265         so->so_snd.sb_flags &= ~SB_UPCALL;
1266         so->so_flags |= SOF_CLOSEWAIT;
1267
1268         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1269             "soclose_wait_locked", NULL);
1270         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1271         so->so_flags &= ~SOF_CLOSEWAIT;
1272 }
1273
1274 /*
1275  * Close a socket on last file table reference removal.
1276  * Initiate disconnect if connected.
1277  * Free socket when disconnect complete.
1278  */
1279 int
1280 soclose_locked(struct socket *so)
1281 {
1282         int error = 0;
1283         struct timespec ts;
1284
1285         if (so->so_usecount == 0) {
1286                 panic("soclose: so=%p refcount=0\n", so);
1287                 /* NOTREACHED */
1288         }
1289
1290         sflt_notify(so, sock_evt_closing, NULL);
1291
1292         if (so->so_upcallusecount) {
1293                 soclose_wait_locked(so);
1294         }
1295
1296 #if CONTENT_FILTER
1297         /*
1298          * We have to wait until the content filters are done
1299          */
1300         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1301                 cfil_sock_close_wait(so);
1302                 cfil_sock_is_closed(so);
1303                 cfil_sock_detach(so);
1304         }
1305 #endif /* CONTENT_FILTER */
1306
1307         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1308                 soresume(current_proc(), so, 1);
1309                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1310         }
1311
1312         if ((so->so_options & SO_ACCEPTCONN)) {
1313                 struct socket *sp, *sonext;
1314                 int persocklock = 0;
1315                 int incomp_overflow_only;
1316
1317                 /*
1318                  * We do not want new connection to be added
1319                  * to the connection queues
1320                  */
1321                 so->so_options &= ~SO_ACCEPTCONN;
1322
1323                 /*
1324                  * We can drop the lock on the listener once
1325                  * we've acquired the incoming list
1326                  */
1327                 if (so->so_proto->pr_getlock != NULL) {
1328                         persocklock = 1;
1329                         so_acquire_accept_list(so, NULL);
1330                         socket_unlock(so, 0);
1331                 }
1332 again:
1333                 incomp_overflow_only = 1;
1334
1335                 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1336                         /*
1337                          * Radar 5350314
1338                          * skip sockets thrown away by tcpdropdropblreq
1339                          * they will get cleanup by the garbage collection.
1340                          * otherwise, remove the incomp socket from the queue
1341                          * and let soabort trigger the appropriate cleanup.
1342                          */
1343                         if (sp->so_flags & SOF_OVERFLOW) {
1344                                 continue;
1345                         }
1346
1347                         if (persocklock != 0) {
1348                                 socket_lock(sp, 1);
1349                         }
1350
1351                         /*
1352                          * Radar 27945981
1353                          * The extra reference for the list insure the
1354                          * validity of the socket pointer when we perform the
1355                          * unlock of the head above
1356                          */
1357                         if (sp->so_state & SS_INCOMP) {
1358                                 sp->so_state &= ~SS_INCOMP;
1359                                 sp->so_head = NULL;
1360                                 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1361                                 so->so_incqlen--;
1362                                 so->so_qlen--;
1363
1364                                 (void) soabort(sp);
1365                         } else {
1366                                 panic("%s sp %p in so_incomp but !SS_INCOMP",
1367                                     __func__, sp);
1368                         }
1369
1370                         if (persocklock != 0) {
1371                                 socket_unlock(sp, 1);
1372                         }
1373                 }
1374
1375                 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1376                         /* Dequeue from so_comp since sofree() won't do it */
1377                         if (persocklock != 0) {
1378                                 socket_lock(sp, 1);
1379                         }
1380
1381                         if (sp->so_state & SS_COMP) {
1382                                 sp->so_state &= ~SS_COMP;
1383                                 sp->so_head = NULL;
1384                                 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1385                                 so->so_qlen--;
1386
1387                                 (void) soabort(sp);
1388                         } else {
1389                                 panic("%s sp %p in so_comp but !SS_COMP",
1390                                     __func__, sp);
1391                         }
1392
1393                         if (persocklock) {
1394                                 socket_unlock(sp, 1);
1395                         }
1396                 }
1397
1398                 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1399 #if (DEBUG | DEVELOPMENT)
1400                         panic("%s head %p so_comp not empty\n", __func__, so);
1401 #endif /* (DEVELOPMENT || DEBUG) */
1402
1403                         goto again;
1404                 }
1405
1406                 if (!TAILQ_EMPTY(&so->so_comp)) {
1407 #if (DEBUG | DEVELOPMENT)
1408                         panic("%s head %p so_comp not empty\n", __func__, so);
1409 #endif /* (DEVELOPMENT || DEBUG) */
1410
1411                         goto again;
1412                 }
1413
1414                 if (persocklock) {
1415                         socket_lock(so, 0);
1416                         so_release_accept_list(so);
1417                 }
1418         }
1419         if (so->so_pcb == NULL) {
1420                 /* 3915887: mark the socket as ready for dealloc */
1421                 so->so_flags |= SOF_PCBCLEARING;
1422                 goto discard;
1423         }
1424         if (so->so_state & SS_ISCONNECTED) {
1425                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1426                         error = sodisconnectlocked(so);
1427                         if (error) {
1428                                 goto drop;
1429                         }
1430                 }
1431                 if (so->so_options & SO_LINGER) {
1432                         lck_mtx_t *mutex_held;
1433
1434                         if ((so->so_state & SS_ISDISCONNECTING) &&
1435                             (so->so_state & SS_NBIO)) {
1436                                 goto drop;
1437                         }
1438                         if (so->so_proto->pr_getlock != NULL) {
1439                                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1440                         } else {
1441                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1442                         }
1443                         while (so->so_state & SS_ISCONNECTED) {
1444                                 ts.tv_sec = (so->so_linger / 100);
1445                                 ts.tv_nsec = (so->so_linger % 100) *
1446                                     NSEC_PER_USEC * 1000 * 10;
1447                                 error = msleep((caddr_t)&so->so_timeo,
1448                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1449                                 if (error) {
1450                                         /*
1451                                          * It's OK when the time fires,
1452                                          * don't report an error
1453                                          */
1454                                         if (error == EWOULDBLOCK) {
1455                                                 error = 0;
1456                                         }
1457                                         break;
1458                                 }
1459                         }
1460                 }
1461         }
1462 drop:
1463         if (so->so_usecount == 0) {
1464                 panic("soclose: usecount is zero so=%p\n", so);
1465                 /* NOTREACHED */
1466         }
1467         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1468                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1469                 if (error == 0) {
1470                         error = error2;
1471                 }
1472         }
1473         if (so->so_usecount <= 0) {
1474                 panic("soclose: usecount is zero so=%p\n", so);
1475                 /* NOTREACHED */
1476         }
1477 discard:
1478         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1479             (so->so_state & SS_NOFDREF)) {
1480                 panic("soclose: NOFDREF");
1481                 /* NOTREACHED */
1482         }
1483         so->so_state |= SS_NOFDREF;
1484
1485         if ((so->so_flags & SOF_KNOTE) != 0) {
1486                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1487         }
1488
1489         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1490         evsofree(so);
1491
1492         VERIFY(so->so_usecount > 0);
1493         so->so_usecount--;
1494         sofree(so);
1495         return error;
1496 }
1497
1498 int
1499 soclose(struct socket *so)
1500 {
1501         int error = 0;
1502         socket_lock(so, 1);
1503
1504         if (so->so_retaincnt == 0) {
1505                 error = soclose_locked(so);
1506         } else {
1507                 /*
1508                  * if the FD is going away, but socket is
1509                  * retained in kernel remove its reference
1510                  */
1511                 so->so_usecount--;
1512                 if (so->so_usecount < 2) {
1513                         panic("soclose: retaincnt non null and so=%p "
1514                             "usecount=%d\n", so, so->so_usecount);
1515                 }
1516         }
1517         socket_unlock(so, 1);
1518         return error;
1519 }
1520
1521 /*
1522  * Must be called at splnet...
1523  */
1524 /* Should already be locked */
1525 int
1526 soabort(struct socket *so)
1527 {
1528         int error;
1529
1530 #ifdef MORE_LOCKING_DEBUG
1531         lck_mtx_t *mutex_held;
1532
1533         if (so->so_proto->pr_getlock != NULL) {
1534                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1535         } else {
1536                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1537         }
1538         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1539 #endif
1540
1541         if ((so->so_flags & SOF_ABORTED) == 0) {
1542                 so->so_flags |= SOF_ABORTED;
1543                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1544                 if (error) {
1545                         sofree(so);
1546                         return error;
1547                 }
1548         }
1549         return 0;
1550 }
1551
1552 int
1553 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1554 {
1555         int error;
1556
1557         if (dolock) {
1558                 socket_lock(so, 1);
1559         }
1560
1561         so_update_last_owner_locked(so, PROC_NULL);
1562         so_update_policy(so);
1563 #if NECP
1564         so_update_necp_policy(so, NULL, NULL);
1565 #endif /* NECP */
1566
1567         if ((so->so_state & SS_NOFDREF) == 0) {
1568                 panic("soaccept: !NOFDREF");
1569         }
1570         so->so_state &= ~SS_NOFDREF;
1571         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1572
1573         if (dolock) {
1574                 socket_unlock(so, 1);
1575         }
1576         return error;
1577 }
1578
1579 int
1580 soaccept(struct socket *so, struct sockaddr **nam)
1581 {
1582         return soacceptlock(so, nam, 1);
1583 }
1584
1585 int
1586 soacceptfilter(struct socket *so, struct socket *head)
1587 {
1588         struct sockaddr *local = NULL, *remote = NULL;
1589         int error = 0;
1590
1591         /*
1592          * Hold the lock even if this socket has not been made visible
1593          * to the filter(s).  For sockets with global locks, this protects
1594          * against the head or peer going away
1595          */
1596         socket_lock(so, 1);
1597         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1598             sogetaddr_locked(so, &local, 0) != 0) {
1599                 so->so_state &= ~SS_NOFDREF;
1600                 socket_unlock(so, 1);
1601                 soclose(so);
1602                 /* Out of resources; try it again next time */
1603                 error = ECONNABORTED;
1604                 goto done;
1605         }
1606
1607         error = sflt_accept(head, so, local, remote);
1608
1609         /*
1610          * If we get EJUSTRETURN from one of the filters, mark this socket
1611          * as inactive and return it anyway.  This newly accepted socket
1612          * will be disconnected later before we hand it off to the caller.
1613          */
1614         if (error == EJUSTRETURN) {
1615                 error = 0;
1616                 (void) sosetdefunct(current_proc(), so,
1617                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1618         }
1619
1620         if (error != 0) {
1621                 /*
1622                  * This may seem like a duplication to the above error
1623                  * handling part when we return ECONNABORTED, except
1624                  * the following is done while holding the lock since
1625                  * the socket has been exposed to the filter(s) earlier.
1626                  */
1627                 so->so_state &= ~SS_NOFDREF;
1628                 socket_unlock(so, 1);
1629                 soclose(so);
1630                 /* Propagate socket filter's error code to the caller */
1631         } else {
1632                 socket_unlock(so, 1);
1633         }
1634 done:
1635         /* Callee checks for NULL pointer */
1636         sock_freeaddr(remote);
1637         sock_freeaddr(local);
1638         return error;
1639 }
1640
1641 /*
1642  * Returns:     0                       Success
1643  *              EOPNOTSUPP              Operation not supported on socket
1644  *              EISCONN                 Socket is connected
1645  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1646  *      <pru_connect>:EINVAL            Invalid argument
1647  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1648  *      <pru_connect>:EACCES            Permission denied
1649  *      <pru_connect>:EADDRINUSE        Address in use
1650  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1651  *      <pru_connect>:EPERM             Operation not permitted
1652  *      <sf_connect_out>:???            [anything a filter writer might set]
1653  */
1654 int
1655 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1656 {
1657         int error;
1658         struct proc *p = current_proc();
1659
1660         if (dolock) {
1661                 socket_lock(so, 1);
1662         }
1663
1664         so_update_last_owner_locked(so, p);
1665         so_update_policy(so);
1666
1667 #if NECP
1668         so_update_necp_policy(so, NULL, nam);
1669 #endif /* NECP */
1670
1671         /*
1672          * If this is a listening socket or if this is a previously-accepted
1673          * socket that has been marked as inactive, reject the connect request.
1674          */
1675         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1676                 error = EOPNOTSUPP;
1677                 if (so->so_flags & SOF_DEFUNCT) {
1678                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1679                             "(%d)\n", __func__, proc_pid(p),
1680                             proc_best_name(p),
1681                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1682                             SOCK_DOM(so), SOCK_TYPE(so), error);
1683                 }
1684                 if (dolock) {
1685                         socket_unlock(so, 1);
1686                 }
1687                 return error;
1688         }
1689
1690         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1691                 if (dolock) {
1692                         socket_unlock(so, 1);
1693                 }
1694                 return EPERM;
1695         }
1696
1697         /*
1698          * If protocol is connection-based, can only connect once.
1699          * Otherwise, if connected, try to disconnect first.
1700          * This allows user to disconnect by connecting to, e.g.,
1701          * a null address.
1702          */
1703         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1704             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1705             (error = sodisconnectlocked(so)))) {
1706                 error = EISCONN;
1707         } else {
1708                 /*
1709                  * Run connect filter before calling protocol:
1710                  *  - non-blocking connect returns before completion;
1711                  */
1712                 error = sflt_connectout(so, nam);
1713                 if (error != 0) {
1714                         if (error == EJUSTRETURN) {
1715                                 error = 0;
1716                         }
1717                 } else {
1718                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1719                             (so, nam, p);
1720                 }
1721         }
1722         if (dolock) {
1723                 socket_unlock(so, 1);
1724         }
1725         return error;
1726 }
1727
1728 int
1729 soconnect(struct socket *so, struct sockaddr *nam)
1730 {
1731         return soconnectlock(so, nam, 1);
1732 }
1733
1734 /*
1735  * Returns:     0                       Success
1736  *      <pru_connect2>:EINVAL[AF_UNIX]
1737  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1738  *      <pru_connect2>:???              [other protocol families]
1739  *
1740  * Notes:       <pru_connect2> is not supported by [TCP].
1741  */
1742 int
1743 soconnect2(struct socket *so1, struct socket *so2)
1744 {
1745         int error;
1746
1747         socket_lock(so1, 1);
1748         if (so2->so_proto->pr_lock) {
1749                 socket_lock(so2, 1);
1750         }
1751
1752         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1753
1754         socket_unlock(so1, 1);
1755         if (so2->so_proto->pr_lock) {
1756                 socket_unlock(so2, 1);
1757         }
1758         return error;
1759 }
1760
1761 int
1762 soconnectxlocked(struct socket *so, struct sockaddr *src,
1763     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1764     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1765     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1766 {
1767         int error;
1768
1769         so_update_last_owner_locked(so, p);
1770         so_update_policy(so);
1771
1772         /*
1773          * If this is a listening socket or if this is a previously-accepted
1774          * socket that has been marked as inactive, reject the connect request.
1775          */
1776         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1777                 error = EOPNOTSUPP;
1778                 if (so->so_flags & SOF_DEFUNCT) {
1779                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1780                             "(%d)\n", __func__, proc_pid(p),
1781                             proc_best_name(p),
1782                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1783                             SOCK_DOM(so), SOCK_TYPE(so), error);
1784                 }
1785                 return error;
1786         }
1787
1788         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1789                 return EPERM;
1790         }
1791
1792         /*
1793          * If protocol is connection-based, can only connect once
1794          * unless PR_MULTICONN is set.  Otherwise, if connected,
1795          * try to disconnect first.  This allows user to disconnect
1796          * by connecting to, e.g., a null address.
1797          */
1798         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1799             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1800             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1801             (error = sodisconnectlocked(so)) != 0)) {
1802                 error = EISCONN;
1803         } else {
1804                 /*
1805                  * Run connect filter before calling protocol:
1806                  *  - non-blocking connect returns before completion;
1807                  */
1808                 error = sflt_connectout(so, dst);
1809                 if (error != 0) {
1810                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1811                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1812                         if (error == EJUSTRETURN) {
1813                                 error = 0;
1814                         }
1815                 } else {
1816                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1817                             (so, src, dst, p, ifscope, aid, pcid,
1818                             flags, arg, arglen, auio, bytes_written);
1819                 }
1820         }
1821
1822         return error;
1823 }
1824
1825 int
1826 sodisconnectlocked(struct socket *so)
1827 {
1828         int error;
1829
1830         if ((so->so_state & SS_ISCONNECTED) == 0) {
1831                 error = ENOTCONN;
1832                 goto bad;
1833         }
1834         if (so->so_state & SS_ISDISCONNECTING) {
1835                 error = EALREADY;
1836                 goto bad;
1837         }
1838
1839         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1840         if (error == 0) {
1841                 sflt_notify(so, sock_evt_disconnected, NULL);
1842         }
1843
1844 bad:
1845         return error;
1846 }
1847
1848 /* Locking version */
1849 int
1850 sodisconnect(struct socket *so)
1851 {
1852         int error;
1853
1854         socket_lock(so, 1);
1855         error = sodisconnectlocked(so);
1856         socket_unlock(so, 1);
1857         return error;
1858 }
1859
1860 int
1861 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1862 {
1863         int error;
1864
1865         /*
1866          * Call the protocol disconnectx handler; let it handle all
1867          * matters related to the connection state of this session.
1868          */
1869         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1870         if (error == 0) {
1871                 /*
1872                  * The event applies only for the session, not for
1873                  * the disconnection of individual subflows.
1874                  */
1875                 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1876                         sflt_notify(so, sock_evt_disconnected, NULL);
1877                 }
1878         }
1879         return error;
1880 }
1881
1882 int
1883 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1884 {
1885         int error;
1886
1887         socket_lock(so, 1);
1888         error = sodisconnectxlocked(so, aid, cid);
1889         socket_unlock(so, 1);
1890         return error;
1891 }
1892
1893 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1894
1895 /*
1896  * sosendcheck will lock the socket buffer if it isn't locked and
1897  * verify that there is space for the data being inserted.
1898  *
1899  * Returns:     0                       Success
1900  *              EPIPE
1901  *      sblock:EWOULDBLOCK
1902  *      sblock:EINTR
1903  *      sbwait:EBADF
1904  *      sbwait:EINTR
1905  *      [so_error]:???
1906  */
1907 int
1908 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1909     int32_t clen, int32_t atomic, int flags, int *sblocked,
1910     struct mbuf *control)
1911 {
1912         int     error = 0;
1913         int32_t space;
1914         int     assumelock = 0;
1915
1916 restart:
1917         if (*sblocked == 0) {
1918                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1919                     so->so_send_filt_thread != 0 &&
1920                     so->so_send_filt_thread == current_thread()) {
1921                         /*
1922                          * We're being called recursively from a filter,
1923                          * allow this to continue. Radar 4150520.
1924                          * Don't set sblocked because we don't want
1925                          * to perform an unlock later.
1926                          */
1927                         assumelock = 1;
1928                 } else {
1929                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1930                         if (error) {
1931                                 if (so->so_flags & SOF_DEFUNCT) {
1932                                         goto defunct;
1933                                 }
1934                                 return error;
1935                         }
1936                         *sblocked = 1;
1937                 }
1938         }
1939
1940         /*
1941          * If a send attempt is made on a socket that has been marked
1942          * as inactive (disconnected), reject the request.
1943          */
1944         if (so->so_flags & SOF_DEFUNCT) {
1945 defunct:
1946                 error = EPIPE;
1947                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1948                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1949                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1950                     SOCK_DOM(so), SOCK_TYPE(so), error);
1951                 return error;
1952         }
1953
1954         if (so->so_state & SS_CANTSENDMORE) {
1955 #if CONTENT_FILTER
1956                 /*
1957                  * Can re-inject data of half closed connections
1958                  */
1959                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1960                     so->so_snd.sb_cfil_thread == current_thread() &&
1961                     cfil_sock_data_pending(&so->so_snd) != 0) {
1962                         CFIL_LOG(LOG_INFO,
1963                             "so %llx ignore SS_CANTSENDMORE",
1964                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1965                 } else
1966 #endif /* CONTENT_FILTER */
1967                 return EPIPE;
1968         }
1969         if (so->so_error) {
1970                 error = so->so_error;
1971                 so->so_error = 0;
1972                 return error;
1973         }
1974
1975         if ((so->so_state & SS_ISCONNECTED) == 0) {
1976                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1977                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1978                             (resid != 0 || clen == 0) &&
1979                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1980                                 return ENOTCONN;
1981                         }
1982                 } else if (addr == 0 && !(flags & MSG_HOLD)) {
1983                         return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1984                                ENOTCONN : EDESTADDRREQ;
1985                 }
1986         }
1987
1988         if (so->so_flags & SOF_ENABLE_MSGS) {
1989                 space = msgq_sbspace(so, control);
1990         } else {
1991                 space = sbspace(&so->so_snd);
1992         }
1993
1994         if (flags & MSG_OOB) {
1995                 space += 1024;
1996         }
1997         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1998             clen > so->so_snd.sb_hiwat) {
1999                 return EMSGSIZE;
2000         }
2001
2002         if ((space < resid + clen &&
2003             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2004             space < clen)) ||
2005             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2006                 /*
2007                  * don't block the connectx call when there's more data
2008                  * than can be copied.
2009                  */
2010                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2011                         if (space == 0) {
2012                                 return EWOULDBLOCK;
2013                         }
2014                         if (space < (int32_t)so->so_snd.sb_lowat) {
2015                                 return 0;
2016                         }
2017                 }
2018                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2019                     assumelock) {
2020                         return EWOULDBLOCK;
2021                 }
2022                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2023                 *sblocked = 0;
2024                 error = sbwait(&so->so_snd);
2025                 if (error) {
2026                         if (so->so_flags & SOF_DEFUNCT) {
2027                                 goto defunct;
2028                         }
2029                         return error;
2030                 }
2031                 goto restart;
2032         }
2033         return 0;
2034 }
2035
2036 /*
2037  * Send on a socket.
2038  * If send must go all at once and message is larger than
2039  * send buffering, then hard error.
2040  * Lock against other senders.
2041  * If must go all at once and not enough room now, then
2042  * inform user that this would block and do nothing.
2043  * Otherwise, if nonblocking, send as much as possible.
2044  * The data to be sent is described by "uio" if nonzero,
2045  * otherwise by the mbuf chain "top" (which must be null
2046  * if uio is not).  Data provided in mbuf chain must be small
2047  * enough to send all at once.
2048  *
2049  * Returns nonzero on error, timeout or signal; callers
2050  * must check for short counts if EINTR/ERESTART are returned.
2051  * Data and control buffers are freed on return.
2052  * Experiment:
2053  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
2054  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
2055  *  point at the mbuf chain being constructed and go from there.
2056  *
2057  * Returns:     0                       Success
2058  *              EOPNOTSUPP
2059  *              EINVAL
2060  *              ENOBUFS
2061  *      uiomove:EFAULT
2062  *      sosendcheck:EPIPE
2063  *      sosendcheck:EWOULDBLOCK
2064  *      sosendcheck:EINTR
2065  *      sosendcheck:EBADF
2066  *      sosendcheck:EINTR
2067  *      sosendcheck:???                 [value from so_error]
2068  *      <pru_send>:ECONNRESET[TCP]
2069  *      <pru_send>:EINVAL[TCP]
2070  *      <pru_send>:ENOBUFS[TCP]
2071  *      <pru_send>:EADDRINUSE[TCP]
2072  *      <pru_send>:EADDRNOTAVAIL[TCP]
2073  *      <pru_send>:EAFNOSUPPORT[TCP]
2074  *      <pru_send>:EACCES[TCP]
2075  *      <pru_send>:EAGAIN[TCP]
2076  *      <pru_send>:EPERM[TCP]
2077  *      <pru_send>:EMSGSIZE[TCP]
2078  *      <pru_send>:EHOSTUNREACH[TCP]
2079  *      <pru_send>:ENETUNREACH[TCP]
2080  *      <pru_send>:ENETDOWN[TCP]
2081  *      <pru_send>:ENOMEM[TCP]
2082  *      <pru_send>:ENOBUFS[TCP]
2083  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
2084  *      <pru_send>:EINVAL[AF_UNIX]
2085  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
2086  *      <pru_send>:EPIPE[AF_UNIX]
2087  *      <pru_send>:ENOTCONN[AF_UNIX]
2088  *      <pru_send>:EISCONN[AF_UNIX]
2089  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
2090  *      <sf_data_out>:???               [whatever a filter author chooses]
2091  *
2092  * Notes:       Other <pru_send> returns depend on the protocol family; all
2093  *              <sf_data_out> returns depend on what the filter author causes
2094  *              their filter to return.
2095  */
2096 int
2097 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2098     struct mbuf *top, struct mbuf *control, int flags)
2099 {
2100         struct mbuf **mp;
2101         struct mbuf *m, *freelist = NULL;
2102         user_ssize_t space, len, resid, orig_resid;
2103         int clen = 0, error, dontroute, mlen, sendflags;
2104         int atomic = sosendallatonce(so) || top;
2105         int sblocked = 0;
2106         struct proc *p = current_proc();
2107         struct mbuf *control_copy = NULL;
2108         uint16_t headroom = 0;
2109         boolean_t en_tracing = FALSE;
2110
2111         if (uio != NULL) {
2112                 resid = uio_resid(uio);
2113         } else {
2114                 resid = top->m_pkthdr.len;
2115         }
2116
2117         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2118             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2119
2120         socket_lock(so, 1);
2121
2122         /*
2123          * trace if tracing & network (vs. unix) sockets & and
2124          * non-loopback
2125          */
2126         if (ENTR_SHOULDTRACE &&
2127             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2128                 struct inpcb *inp = sotoinpcb(so);
2129                 if (inp->inp_last_outifp != NULL &&
2130                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2131                         en_tracing = TRUE;
2132                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2133                             VM_KERNEL_ADDRPERM(so),
2134                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2135                             (int64_t)resid);
2136                         orig_resid = resid;
2137                 }
2138         }
2139
2140         /*
2141          * Re-injection should not affect process accounting
2142          */
2143         if ((flags & MSG_SKIPCFIL) == 0) {
2144                 so_update_last_owner_locked(so, p);
2145                 so_update_policy(so);
2146
2147 #if NECP
2148                 so_update_necp_policy(so, NULL, addr);
2149 #endif /* NECP */
2150         }
2151
2152         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2153                 error = EOPNOTSUPP;
2154                 goto out_locked;
2155         }
2156
2157         /*
2158          * In theory resid should be unsigned.
2159          * However, space must be signed, as it might be less than 0
2160          * if we over-committed, and we must use a signed comparison
2161          * of space and resid.  On the other hand, a negative resid
2162          * causes us to loop sending 0-length segments to the protocol.
2163          *
2164          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2165          * But it will be used by sockets doing message delivery.
2166          *
2167          * Note: We limit resid to be a positive int value as we use
2168          * imin() to set bytes_to_copy -- radr://14558484
2169          */
2170         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2171             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2172                 error = EINVAL;
2173                 goto out_locked;
2174         }
2175
2176         dontroute = (flags & MSG_DONTROUTE) &&
2177             (so->so_options & SO_DONTROUTE) == 0 &&
2178             (so->so_proto->pr_flags & PR_ATOMIC);
2179         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2180
2181         if (control != NULL) {
2182                 clen = control->m_len;
2183         }
2184
2185         if (soreserveheadroom != 0) {
2186                 headroom = so->so_pktheadroom;
2187         }
2188
2189         do {
2190                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2191                     &sblocked, control);
2192                 if (error) {
2193                         goto out_locked;
2194                 }
2195
2196                 mp = &top;
2197                 if (so->so_flags & SOF_ENABLE_MSGS) {
2198                         space = msgq_sbspace(so, control);
2199                 } else {
2200                         space = sbspace(&so->so_snd) - clen;
2201                 }
2202                 space += ((flags & MSG_OOB) ? 1024 : 0);
2203
2204                 do {
2205                         if (uio == NULL) {
2206                                 /*
2207                                  * Data is prepackaged in "top".
2208                                  */
2209                                 resid = 0;
2210                                 if (flags & MSG_EOR) {
2211                                         top->m_flags |= M_EOR;
2212                                 }
2213                         } else {
2214                                 int chainlength;
2215                                 int bytes_to_copy;
2216                                 boolean_t jumbocl;
2217                                 boolean_t bigcl;
2218                                 int bytes_to_alloc;
2219
2220                                 bytes_to_copy = imin(resid, space);
2221
2222                                 bytes_to_alloc = bytes_to_copy;
2223                                 if (top == NULL) {
2224                                         bytes_to_alloc += headroom;
2225                                 }
2226
2227                                 if (sosendminchain > 0) {
2228                                         chainlength = 0;
2229                                 } else {
2230                                         chainlength = sosendmaxchain;
2231                                 }
2232
2233                                 /*
2234                                  * Use big 4 KB cluster when the outgoing interface
2235                                  * does not prefer 2 KB clusters
2236                                  */
2237                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2238                                     sosendbigcl_ignore_capab;
2239
2240                                 /*
2241                                  * Attempt to use larger than system page-size
2242                                  * clusters for large writes only if there is
2243                                  * a jumbo cluster pool and if the socket is
2244                                  * marked accordingly.
2245                                  */
2246                                 jumbocl = sosendjcl && njcl > 0 &&
2247                                     ((so->so_flags & SOF_MULTIPAGES) ||
2248                                     sosendjcl_ignore_capab) &&
2249                                     bigcl;
2250
2251                                 socket_unlock(so, 0);
2252
2253                                 do {
2254                                         int num_needed;
2255                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2256
2257                                         /*
2258                                          * try to maintain a local cache of mbuf
2259                                          * clusters needed to complete this
2260                                          * write the list is further limited to
2261                                          * the number that are currently needed
2262                                          * to fill the socket this mechanism
2263                                          * allows a large number of mbufs/
2264                                          * clusters to be grabbed under a single
2265                                          * mbuf lock... if we can't get any
2266                                          * clusters, than fall back to trying
2267                                          * for mbufs if we fail early (or
2268                                          * miscalcluate the number needed) make
2269                                          * sure to release any clusters we
2270                                          * haven't yet consumed.
2271                                          */
2272                                         if (freelist == NULL &&
2273                                             bytes_to_alloc > MBIGCLBYTES &&
2274                                             jumbocl) {
2275                                                 num_needed =
2276                                                     bytes_to_alloc / M16KCLBYTES;
2277
2278                                                 if ((bytes_to_alloc -
2279                                                     (num_needed * M16KCLBYTES))
2280                                                     >= MINCLSIZE) {
2281                                                         num_needed++;
2282                                                 }
2283
2284                                                 freelist =
2285                                                     m_getpackets_internal(
2286                                                         (unsigned int *)&num_needed,
2287                                                         hdrs_needed, M_WAIT, 0,
2288                                                         M16KCLBYTES);
2289                                                 /*
2290                                                  * Fall back to 4K cluster size
2291                                                  * if allocation failed
2292                                                  */
2293                                         }
2294
2295                                         if (freelist == NULL &&
2296                                             bytes_to_alloc > MCLBYTES &&
2297                                             bigcl) {
2298                                                 num_needed =
2299                                                     bytes_to_alloc / MBIGCLBYTES;
2300
2301                                                 if ((bytes_to_alloc -
2302                                                     (num_needed * MBIGCLBYTES)) >=
2303                                                     MINCLSIZE) {
2304                                                         num_needed++;
2305                                                 }
2306
2307                                                 freelist =
2308                                                     m_getpackets_internal(
2309                                                         (unsigned int *)&num_needed,
2310                                                         hdrs_needed, M_WAIT, 0,
2311                                                         MBIGCLBYTES);
2312                                                 /*
2313                                                  * Fall back to cluster size
2314                                                  * if allocation failed
2315                                                  */
2316                                         }
2317
2318                                         /*
2319                                          * Allocate a cluster as we want to
2320                                          * avoid to split the data in more
2321                                          * that one segment and using MINCLSIZE
2322                                          * would lead us to allocate two mbufs
2323                                          */
2324                                         if (soreserveheadroom != 0 &&
2325                                             freelist == NULL &&
2326                                             ((top == NULL &&
2327                                             bytes_to_alloc > _MHLEN) ||
2328                                             bytes_to_alloc > _MLEN)) {
2329                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2330                                                     MCLBYTES;
2331                                                 freelist =
2332                                                     m_getpackets_internal(
2333                                                         (unsigned int *)&num_needed,
2334                                                         hdrs_needed, M_WAIT, 0,
2335                                                         MCLBYTES);
2336                                                 /*
2337                                                  * Fall back to a single mbuf
2338                                                  * if allocation failed
2339                                                  */
2340                                         } else if (freelist == NULL &&
2341                                             bytes_to_alloc > MINCLSIZE) {
2342                                                 num_needed =
2343                                                     bytes_to_alloc / MCLBYTES;
2344
2345                                                 if ((bytes_to_alloc -
2346                                                     (num_needed * MCLBYTES)) >=
2347                                                     MINCLSIZE) {
2348                                                         num_needed++;
2349                                                 }
2350
2351                                                 freelist =
2352                                                     m_getpackets_internal(
2353                                                         (unsigned int *)&num_needed,
2354                                                         hdrs_needed, M_WAIT, 0,
2355                                                         MCLBYTES);
2356                                                 /*
2357                                                  * Fall back to a single mbuf
2358                                                  * if allocation failed
2359                                                  */
2360                                         }
2361                                         /*
2362                                          * For datagram protocols, leave
2363                                          * headroom for protocol headers
2364                                          * in the first cluster of the chain
2365                                          */
2366                                         if (freelist != NULL && atomic &&
2367                                             top == NULL && headroom > 0) {
2368                                                 freelist->m_data += headroom;
2369                                         }
2370
2371                                         /*
2372                                          * Fall back to regular mbufs without
2373                                          * reserving the socket headroom
2374                                          */
2375                                         if (freelist == NULL) {
2376                                                 if (top == NULL) {
2377                                                         MGETHDR(freelist,
2378                                                             M_WAIT, MT_DATA);
2379                                                 } else {
2380                                                         MGET(freelist,
2381                                                             M_WAIT, MT_DATA);
2382                                                 }
2383
2384                                                 if (freelist == NULL) {
2385                                                         error = ENOBUFS;
2386                                                         socket_lock(so, 0);
2387                                                         goto out_locked;
2388                                                 }
2389                                                 /*
2390                                                  * For datagram protocols,
2391                                                  * leave room for protocol
2392                                                  * headers in first mbuf.
2393                                                  */
2394                                                 if (atomic && top == NULL &&
2395                                                     bytes_to_copy < MHLEN) {
2396                                                         MH_ALIGN(freelist,
2397                                                             bytes_to_copy);
2398                                                 }
2399                                         }
2400                                         m = freelist;
2401                                         freelist = m->m_next;
2402                                         m->m_next = NULL;
2403
2404                                         if ((m->m_flags & M_EXT)) {
2405                                                 mlen = m->m_ext.ext_size -
2406                                                     M_LEADINGSPACE(m);
2407                                         } else if ((m->m_flags & M_PKTHDR)) {
2408                                                 mlen =
2409                                                     MHLEN - M_LEADINGSPACE(m);
2410                                         } else {
2411                                                 mlen = MLEN - M_LEADINGSPACE(m);
2412                                         }
2413                                         len = imin(mlen, bytes_to_copy);
2414
2415                                         chainlength += len;
2416
2417                                         space -= len;
2418
2419                                         error = uiomove(mtod(m, caddr_t),
2420                                             len, uio);
2421
2422                                         resid = uio_resid(uio);
2423
2424                                         m->m_len = len;
2425                                         *mp = m;
2426                                         top->m_pkthdr.len += len;
2427                                         if (error) {
2428                                                 break;
2429                                         }
2430                                         mp = &m->m_next;
2431                                         if (resid <= 0) {
2432                                                 if (flags & MSG_EOR) {
2433                                                         top->m_flags |= M_EOR;
2434                                                 }
2435                                                 break;
2436                                         }
2437                                         bytes_to_copy = min(resid, space);
2438                                 } while (space > 0 &&
2439                                     (chainlength < sosendmaxchain || atomic ||
2440                                     resid < MINCLSIZE));
2441
2442                                 socket_lock(so, 0);
2443
2444                                 if (error) {
2445                                         goto out_locked;
2446                                 }
2447                         }
2448
2449                         if (flags & (MSG_HOLD | MSG_SEND)) {
2450                                 /* Enqueue for later, go away if HOLD */
2451                                 struct mbuf *mb1;
2452                                 if (so->so_temp && (flags & MSG_FLUSH)) {
2453                                         m_freem(so->so_temp);
2454                                         so->so_temp = NULL;
2455                                 }
2456                                 if (so->so_temp) {
2457                                         so->so_tail->m_next = top;
2458                                 } else {
2459                                         so->so_temp = top;
2460                                 }
2461                                 mb1 = top;
2462                                 while (mb1->m_next) {
2463                                         mb1 = mb1->m_next;
2464                                 }
2465                                 so->so_tail = mb1;
2466                                 if (flags & MSG_HOLD) {
2467                                         top = NULL;
2468                                         goto out_locked;
2469                                 }
2470                                 top = so->so_temp;
2471                         }
2472                         if (dontroute) {
2473                                 so->so_options |= SO_DONTROUTE;
2474                         }
2475
2476                         /*
2477                          * Compute flags here, for pru_send and NKEs
2478                          *
2479                          * If the user set MSG_EOF, the protocol
2480                          * understands this flag and nothing left to
2481                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2482                          */
2483                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2484                             ((flags & MSG_EOF) &&
2485                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2486                             (resid <= 0)) ? PRUS_EOF :
2487                             /* If there is more to send set PRUS_MORETOCOME */
2488                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2489
2490                         if ((flags & MSG_SKIPCFIL) == 0) {
2491                                 /*
2492                                  * Socket filter processing
2493                                  */
2494                                 error = sflt_data_out(so, addr, &top,
2495                                     &control, (sendflags & MSG_OOB) ?
2496                                     sock_data_filt_flag_oob : 0);
2497                                 if (error) {
2498                                         if (error == EJUSTRETURN) {
2499                                                 error = 0;
2500                                                 clen = 0;
2501                                                 control = NULL;
2502                                                 top = NULL;
2503                                         }
2504                                         goto out_locked;
2505                                 }
2506 #if CONTENT_FILTER
2507                                 /*
2508                                  * Content filter processing
2509                                  */
2510                                 error = cfil_sock_data_out(so, addr, top,
2511                                     control, sendflags);
2512                                 if (error) {
2513                                         if (error == EJUSTRETURN) {
2514                                                 error = 0;
2515                                                 clen = 0;
2516                                                 control = NULL;
2517                                                 top = NULL;
2518                                         }
2519                                         goto out_locked;
2520                                 }
2521 #endif /* CONTENT_FILTER */
2522                         }
2523                         if (so->so_flags & SOF_ENABLE_MSGS) {
2524                                 /*
2525                                  * Make a copy of control mbuf,
2526                                  * so that msg priority can be
2527                                  * passed to subsequent mbufs.
2528                                  */
2529                                 control_copy = m_dup(control, M_NOWAIT);
2530                         }
2531                         error = (*so->so_proto->pr_usrreqs->pru_send)
2532                             (so, sendflags, top, addr, control, p);
2533
2534                         if (flags & MSG_SEND) {
2535                                 so->so_temp = NULL;
2536                         }
2537
2538                         if (dontroute) {
2539                                 so->so_options &= ~SO_DONTROUTE;
2540                         }
2541
2542                         clen = 0;
2543                         control = control_copy;
2544                         control_copy = NULL;
2545                         top = NULL;
2546                         mp = &top;
2547                         if (error) {
2548                                 goto out_locked;
2549                         }
2550                 } while (resid && space > 0);
2551         } while (resid);
2552
2553 out_locked:
2554         if (sblocked) {
2555                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2556         } else {
2557                 socket_unlock(so, 1);
2558         }
2559         if (top != NULL) {
2560                 m_freem(top);
2561         }
2562         if (control != NULL) {
2563                 m_freem(control);
2564         }
2565         if (freelist != NULL) {
2566                 m_freem_list(freelist);
2567         }
2568         if (control_copy != NULL) {
2569                 m_freem(control_copy);
2570         }
2571
2572         soclearfastopen(so);
2573
2574         if (en_tracing) {
2575                 /* resid passed here is the bytes left in uio */
2576                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2577                     VM_KERNEL_ADDRPERM(so),
2578                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2579                     (int64_t)(orig_resid - resid));
2580         }
2581         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2582             so->so_snd.sb_cc, space, error);
2583
2584         return error;
2585 }
2586
2587 int
2588 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2589 {
2590         struct mbuf *m0, *control_end;
2591
2592         socket_lock_assert_owned(so);
2593
2594         /*
2595          * top must points to mbuf chain to be sent.
2596          * If control is not NULL, top must be packet header
2597          */
2598         VERIFY(top != NULL &&
2599             (control == NULL || top->m_flags & M_PKTHDR));
2600
2601         /*
2602          * If control is not passed in, see if we can get it
2603          * from top.
2604          */
2605         if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2606                 // Locate start of control if present and start of data
2607                 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2608                         if (m0->m_flags & M_PKTHDR) {
2609                                 top = m0;
2610                                 break;
2611                         } else if (m0->m_type == MT_CONTROL) {
2612                                 if (control == NULL) {
2613                                         // Found start of control
2614                                         control = m0;
2615                                 }
2616                                 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2617                                         // Found end of control
2618                                         control_end = m0;
2619                                 }
2620                         }
2621                 }
2622                 if (control_end != NULL) {
2623                         control_end->m_next = NULL;
2624                 }
2625         }
2626
2627         int error = (*so->so_proto->pr_usrreqs->pru_send)
2628             (so, sendflags, top, addr, control, current_proc());
2629
2630         return error;
2631 }
2632
2633 /*
2634  * Supported only connected sockets (no address) without ancillary data
2635  * (control mbuf) for atomic protocols
2636  */
2637 int
2638 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2639 {
2640         struct mbuf *m, *freelist = NULL;
2641         user_ssize_t len, resid;
2642         int error, dontroute, mlen;
2643         int atomic = sosendallatonce(so);
2644         int sblocked = 0;
2645         struct proc *p = current_proc();
2646         u_int uiofirst = 0;
2647         u_int uiolast = 0;
2648         struct mbuf *top = NULL;
2649         uint16_t headroom = 0;
2650         boolean_t bigcl;
2651
2652         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2653             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2654
2655         if (so->so_type != SOCK_DGRAM) {
2656                 error = EINVAL;
2657                 goto out;
2658         }
2659         if (atomic == 0) {
2660                 error = EINVAL;
2661                 goto out;
2662         }
2663         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2664                 error = EPROTONOSUPPORT;
2665                 goto out;
2666         }
2667         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2668                 error = EINVAL;
2669                 goto out;
2670         }
2671         resid = uio_array_resid(uioarray, uiocnt);
2672
2673         /*
2674          * In theory resid should be unsigned.
2675          * However, space must be signed, as it might be less than 0
2676          * if we over-committed, and we must use a signed comparison
2677          * of space and resid.  On the other hand, a negative resid
2678          * causes us to loop sending 0-length segments to the protocol.
2679          *
2680          * Note: We limit resid to be a positive int value as we use
2681          * imin() to set bytes_to_copy -- radr://14558484
2682          */
2683         if (resid < 0 || resid > INT_MAX) {
2684                 error = EINVAL;
2685                 goto out;
2686         }
2687
2688         socket_lock(so, 1);
2689         so_update_last_owner_locked(so, p);
2690         so_update_policy(so);
2691
2692 #if NECP
2693         so_update_necp_policy(so, NULL, NULL);
2694 #endif /* NECP */
2695
2696         dontroute = (flags & MSG_DONTROUTE) &&
2697             (so->so_options & SO_DONTROUTE) == 0 &&
2698             (so->so_proto->pr_flags & PR_ATOMIC);
2699         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2700
2701         error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2702             &sblocked, NULL);
2703         if (error) {
2704                 goto release;
2705         }
2706
2707         /*
2708          * Use big 4 KB clusters when the outgoing interface does not prefer
2709          * 2 KB clusters
2710          */
2711         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2712
2713         if (soreserveheadroom != 0) {
2714                 headroom = so->so_pktheadroom;
2715         }
2716
2717         do {
2718                 int i;
2719                 int num_needed = 0;
2720                 int chainlength;
2721                 size_t maxpktlen = 0;
2722                 int bytes_to_alloc;
2723
2724                 if (sosendminchain > 0) {
2725                         chainlength = 0;
2726                 } else {
2727                         chainlength = sosendmaxchain;
2728                 }
2729
2730                 socket_unlock(so, 0);
2731
2732                 /*
2733                  * Find a set of uio that fit in a reasonable number
2734                  * of mbuf packets
2735                  */
2736                 for (i = uiofirst; i < uiocnt; i++) {
2737                         struct uio *auio = uioarray[i];
2738
2739                         len = uio_resid(auio);
2740
2741                         /* Do nothing for empty messages */
2742                         if (len == 0) {
2743                                 continue;
2744                         }
2745
2746                         num_needed += 1;
2747                         uiolast += 1;
2748
2749                         if (len > maxpktlen) {
2750                                 maxpktlen = len;
2751                         }
2752
2753                         chainlength += len;
2754                         if (chainlength > sosendmaxchain) {
2755                                 break;
2756                         }
2757                 }
2758                 /*
2759                  * Nothing left to send
2760                  */
2761                 if (num_needed == 0) {
2762                         socket_lock(so, 0);
2763                         break;
2764                 }
2765                 /*
2766                  * Allocate buffer large enough to include headroom space for
2767                  * network and link header
2768                  *
2769                  */
2770                 bytes_to_alloc = maxpktlen + headroom;
2771
2772                 /*
2773                  * Allocate a single contiguous buffer of the smallest available
2774                  * size when possible
2775                  */
2776                 if (bytes_to_alloc > MCLBYTES &&
2777                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2778                         freelist = m_getpackets_internal(
2779                                 (unsigned int *)&num_needed,
2780                                 num_needed, M_WAIT, 1,
2781                                 MBIGCLBYTES);
2782                 } else if (bytes_to_alloc > _MHLEN &&
2783                     bytes_to_alloc <= MCLBYTES) {
2784                         freelist = m_getpackets_internal(
2785                                 (unsigned int *)&num_needed,
2786                                 num_needed, M_WAIT, 1,
2787                                 MCLBYTES);
2788                 } else {
2789                         freelist = m_allocpacket_internal(
2790                                 (unsigned int *)&num_needed,
2791                                 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2792                 }
2793
2794                 if (freelist == NULL) {
2795                         socket_lock(so, 0);
2796                         error = ENOMEM;
2797                         goto release;
2798                 }
2799                 /*
2800                  * Copy each uio of the set into its own mbuf packet
2801                  */
2802                 for (i = uiofirst, m = freelist;
2803                     i < uiolast && m != NULL;
2804                     i++) {
2805                         int bytes_to_copy;
2806                         struct mbuf *n;
2807                         struct uio *auio = uioarray[i];
2808
2809                         bytes_to_copy = uio_resid(auio);
2810
2811                         /* Do nothing for empty messages */
2812                         if (bytes_to_copy == 0) {
2813                                 continue;
2814                         }
2815                         /*
2816                          * Leave headroom for protocol headers
2817                          * in the first mbuf of the chain
2818                          */
2819                         m->m_data += headroom;
2820
2821                         for (n = m; n != NULL; n = n->m_next) {
2822                                 if ((m->m_flags & M_EXT)) {
2823                                         mlen = m->m_ext.ext_size -
2824                                             M_LEADINGSPACE(m);
2825                                 } else if ((m->m_flags & M_PKTHDR)) {
2826                                         mlen =
2827                                             MHLEN - M_LEADINGSPACE(m);
2828                                 } else {
2829                                         mlen = MLEN - M_LEADINGSPACE(m);
2830                                 }
2831                                 len = imin(mlen, bytes_to_copy);
2832
2833                                 /*
2834                                  * Note: uiomove() decrements the iovec
2835                                  * length
2836                                  */
2837                                 error = uiomove(mtod(n, caddr_t),
2838                                     len, auio);
2839                                 if (error != 0) {
2840                                         break;
2841                                 }
2842                                 n->m_len = len;
2843                                 m->m_pkthdr.len += len;
2844
2845                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2846
2847                                 bytes_to_copy -= len;
2848                                 resid -= len;
2849                         }
2850                         if (m->m_pkthdr.len == 0) {
2851                                 printf(
2852                                         "%s:%d so %llx pkt %llx type %u len null\n",
2853                                         __func__, __LINE__,
2854                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2855                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2856                                         m->m_type);
2857                         }
2858                         if (error != 0) {
2859                                 break;
2860                         }
2861                         m = m->m_nextpkt;
2862                 }
2863
2864                 socket_lock(so, 0);
2865
2866                 if (error) {
2867                         goto release;
2868                 }
2869                 top = freelist;
2870                 freelist = NULL;
2871
2872                 if (dontroute) {
2873                         so->so_options |= SO_DONTROUTE;
2874                 }
2875
2876                 if ((flags & MSG_SKIPCFIL) == 0) {
2877                         struct mbuf **prevnextp = NULL;
2878
2879                         for (i = uiofirst, m = top;
2880                             i < uiolast && m != NULL;
2881                             i++) {
2882                                 struct mbuf *nextpkt = m->m_nextpkt;
2883
2884                                 /*
2885                                  * Socket filter processing
2886                                  */
2887                                 error = sflt_data_out(so, NULL, &m,
2888                                     NULL, 0);
2889                                 if (error != 0 && error != EJUSTRETURN) {
2890                                         goto release;
2891                                 }
2892
2893 #if CONTENT_FILTER
2894                                 if (error == 0) {
2895                                         /*
2896                                          * Content filter processing
2897                                          */
2898                                         error = cfil_sock_data_out(so, NULL, m,
2899                                             NULL, 0);
2900                                         if (error != 0 && error != EJUSTRETURN) {
2901                                                 goto release;
2902                                         }
2903                                 }
2904 #endif /* CONTENT_FILTER */
2905                                 /*
2906                                  * Remove packet from the list when
2907                                  * swallowed by a filter
2908                                  */
2909                                 if (error == EJUSTRETURN) {
2910                                         error = 0;
2911                                         if (prevnextp != NULL) {
2912                                                 *prevnextp = nextpkt;
2913                                         } else {
2914                                                 top = nextpkt;
2915                                         }
2916                                 }
2917
2918                                 m = nextpkt;
2919                                 if (m != NULL) {
2920                                         prevnextp = &m->m_nextpkt;
2921                                 }
2922                         }
2923                 }
2924                 if (top != NULL) {
2925                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2926                             (so, 0, top, NULL, NULL, p);
2927                 }
2928
2929                 if (dontroute) {
2930                         so->so_options &= ~SO_DONTROUTE;
2931                 }
2932
2933                 top = NULL;
2934                 uiofirst = uiolast;
2935         } while (resid > 0 && error == 0);
2936 release:
2937         if (sblocked) {
2938                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2939         } else {
2940                 socket_unlock(so, 1);
2941         }
2942 out:
2943         if (top != NULL) {
2944                 m_freem(top);
2945         }
2946         if (freelist != NULL) {
2947                 m_freem_list(freelist);
2948         }
2949
2950         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2951             so->so_snd.sb_cc, 0, error);
2952
2953         return error;
2954 }
2955
2956 /*
2957  * May return ERESTART when packet is dropped by MAC policy check
2958  */
2959 static int
2960 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2961     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2962 {
2963         int error = 0;
2964         struct mbuf *m = *mp;
2965         struct mbuf *nextrecord = *nextrecordp;
2966
2967         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2968 #if CONFIG_MACF_SOCKET_SUBSET
2969         /*
2970          * Call the MAC framework for policy checking if we're in
2971          * the user process context and the socket isn't connected.
2972          */
2973         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2974                 struct mbuf *m0 = m;
2975                 /*
2976                  * Dequeue this record (temporarily) from the receive
2977                  * list since we're about to drop the socket's lock
2978                  * where a new record may arrive and be appended to
2979                  * the list.  Upon MAC policy failure, the record
2980                  * will be freed.  Otherwise, we'll add it back to
2981                  * the head of the list.  We cannot rely on SB_LOCK
2982                  * because append operation uses the socket's lock.
2983                  */
2984                 do {
2985                         m->m_nextpkt = NULL;
2986                         sbfree(&so->so_rcv, m);
2987                         m = m->m_next;
2988                 } while (m != NULL);
2989                 m = m0;
2990                 so->so_rcv.sb_mb = nextrecord;
2991                 SB_EMPTY_FIXUP(&so->so_rcv);
2992                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2993                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2994                 socket_unlock(so, 0);
2995
2996                 if (mac_socket_check_received(proc_ucred(p), so,
2997                     mtod(m, struct sockaddr *)) != 0) {
2998                         /*
2999                          * MAC policy failure; free this record and
3000                          * process the next record (or block until
3001                          * one is available).  We have adjusted sb_cc
3002                          * and sb_mbcnt above so there is no need to
3003                          * call sbfree() again.
3004                          */
3005                         m_freem(m);
3006                         /*
3007                          * Clear SB_LOCK but don't unlock the socket.
3008                          * Process the next record or wait for one.
3009                          */
3010                         socket_lock(so, 0);
3011                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
3012                         error = ERESTART;
3013                         goto done;
3014                 }
3015                 socket_lock(so, 0);
3016                 /*
3017                  * If the socket has been defunct'd, drop it.
3018                  */
3019                 if (so->so_flags & SOF_DEFUNCT) {
3020                         m_freem(m);
3021                         error = ENOTCONN;
3022                         goto done;
3023                 }
3024                 /*
3025                  * Re-adjust the socket receive list and re-enqueue
3026                  * the record in front of any packets which may have
3027                  * been appended while we dropped the lock.
3028                  */
3029                 for (m = m0; m->m_next != NULL; m = m->m_next) {
3030                         sballoc(&so->so_rcv, m);
3031                 }
3032                 sballoc(&so->so_rcv, m);
3033                 if (so->so_rcv.sb_mb == NULL) {
3034                         so->so_rcv.sb_lastrecord = m0;
3035                         so->so_rcv.sb_mbtail = m;
3036                 }
3037                 m = m0;
3038                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3039                 so->so_rcv.sb_mb = m;
3040                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3041                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3042         }
3043 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3044         if (psa != NULL) {
3045                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3046                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3047                         error = EWOULDBLOCK;
3048                         goto done;
3049                 }
3050         }
3051         if (flags & MSG_PEEK) {
3052                 m = m->m_next;
3053         } else {
3054                 sbfree(&so->so_rcv, m);
3055                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3056                         panic("%s: about to create invalid socketbuf",
3057                             __func__);
3058                         /* NOTREACHED */
3059                 }
3060                 MFREE(m, so->so_rcv.sb_mb);
3061                 m = so->so_rcv.sb_mb;
3062                 if (m != NULL) {
3063                         m->m_nextpkt = nextrecord;
3064                 } else {
3065                         so->so_rcv.sb_mb = nextrecord;
3066                         SB_EMPTY_FIXUP(&so->so_rcv);
3067                 }
3068         }
3069 done:
3070         *mp = m;
3071         *nextrecordp = nextrecord;
3072
3073         return error;
3074 }
3075
3076 /*
3077  * Process one or more MT_CONTROL mbufs present before any data mbufs
3078  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3079  * just copy the data; if !MSG_PEEK, we call into the protocol to
3080  * perform externalization.
3081  */
3082 static int
3083 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3084     struct mbuf **mp, struct mbuf **nextrecordp)
3085 {
3086         int error = 0;
3087         struct mbuf *cm = NULL, *cmn;
3088         struct mbuf **cme = &cm;
3089         struct sockbuf *sb_rcv = &so->so_rcv;
3090         struct mbuf **msgpcm = NULL;
3091         struct mbuf *m = *mp;
3092         struct mbuf *nextrecord = *nextrecordp;
3093         struct protosw *pr = so->so_proto;
3094
3095         /*
3096          * Externalizing the control messages would require us to
3097          * drop the socket's lock below.  Once we re-acquire the
3098          * lock, the mbuf chain might change.  In order to preserve
3099          * consistency, we unlink all control messages from the
3100          * first mbuf chain in one shot and link them separately
3101          * onto a different chain.
3102          */
3103         do {
3104                 if (flags & MSG_PEEK) {
3105                         if (controlp != NULL) {
3106                                 if (*controlp == NULL) {
3107                                         msgpcm = controlp;
3108                                 }
3109                                 *controlp = m_copy(m, 0, m->m_len);
3110
3111                                 /*
3112                                  * If we failed to allocate an mbuf,
3113                                  * release any previously allocated
3114                                  * mbufs for control data. Return
3115                                  * an error. Keep the mbufs in the
3116                                  * socket as this is using
3117                                  * MSG_PEEK flag.
3118                                  */
3119                                 if (*controlp == NULL) {
3120                                         m_freem(*msgpcm);
3121                                         error = ENOBUFS;
3122                                         goto done;
3123                                 }
3124                                 controlp = &(*controlp)->m_next;
3125                         }
3126                         m = m->m_next;
3127                 } else {
3128                         m->m_nextpkt = NULL;
3129                         sbfree(sb_rcv, m);
3130                         sb_rcv->sb_mb = m->m_next;
3131                         m->m_next = NULL;
3132                         *cme = m;
3133                         cme = &(*cme)->m_next;
3134                         m = sb_rcv->sb_mb;
3135                 }
3136         } while (m != NULL && m->m_type == MT_CONTROL);
3137
3138         if (!(flags & MSG_PEEK)) {
3139                 if (sb_rcv->sb_mb != NULL) {
3140                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
3141                 } else {
3142                         sb_rcv->sb_mb = nextrecord;
3143                         SB_EMPTY_FIXUP(sb_rcv);
3144                 }
3145                 if (nextrecord == NULL) {
3146                         sb_rcv->sb_lastrecord = m;
3147                 }
3148         }
3149
3150         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3151         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3152
3153         while (cm != NULL) {
3154                 int cmsg_type;
3155
3156                 cmn = cm->m_next;
3157                 cm->m_next = NULL;
3158                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3159
3160                 /*
3161                  * Call the protocol to externalize SCM_RIGHTS message
3162                  * and return the modified message to the caller upon
3163                  * success.  Otherwise, all other control messages are
3164                  * returned unmodified to the caller.  Note that we
3165                  * only get into this loop if MSG_PEEK is not set.
3166                  */
3167                 if (pr->pr_domain->dom_externalize != NULL &&
3168                     cmsg_type == SCM_RIGHTS) {
3169                         /*
3170                          * Release socket lock: see 3903171.  This
3171                          * would also allow more records to be appended
3172                          * to the socket buffer.  We still have SB_LOCK
3173                          * set on it, so we can be sure that the head
3174                          * of the mbuf chain won't change.
3175                          */
3176                         socket_unlock(so, 0);
3177                         error = (*pr->pr_domain->dom_externalize)(cm);
3178                         socket_lock(so, 0);
3179                 } else {
3180                         error = 0;
3181                 }
3182
3183                 if (controlp != NULL && error == 0) {
3184                         *controlp = cm;
3185                         controlp = &(*controlp)->m_next;
3186                 } else {
3187                         (void) m_free(cm);
3188                 }
3189                 cm = cmn;
3190         }
3191         /*
3192          * Update the value of nextrecord in case we received new
3193          * records when the socket was unlocked above for
3194          * externalizing SCM_RIGHTS.
3195          */
3196         if (m != NULL) {
3197                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3198         } else {
3199                 nextrecord = sb_rcv->sb_mb;
3200         }
3201
3202 done:
3203         *mp = m;
3204         *nextrecordp = nextrecord;
3205
3206         return error;
3207 }
3208
3209 /*
3210  * Implement receive operations on a socket.
3211  * We depend on the way that records are added to the sockbuf
3212  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3213  * must begin with an address if the protocol so specifies,
3214  * followed by an optional mbuf or mbufs containing ancillary data,
3215  * and then zero or more mbufs of data.
3216  * In order to avoid blocking network interrupts for the entire time here,
3217  * we splx() while doing the actual copy to user space.
3218  * Although the sockbuf is locked, new data may still be appended,
3219  * and thus we must maintain consistency of the sockbuf during that time.
3220  *
3221  * The caller may receive the data as a single mbuf chain by supplying
3222  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3223  * only for the count in uio_resid.
3224  *
3225  * Returns:     0                       Success
3226  *              ENOBUFS
3227  *              ENOTCONN
3228  *              EWOULDBLOCK
3229  *      uiomove:EFAULT
3230  *      sblock:EWOULDBLOCK
3231  *      sblock:EINTR
3232  *      sbwait:EBADF
3233  *      sbwait:EINTR
3234  *      sodelayed_copy:EFAULT
3235  *      <pru_rcvoob>:EINVAL[TCP]
3236  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
3237  *      <pru_rcvoob>:???
3238  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3239  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3240  *      <pr_domain->dom_externalize>:???
3241  *
3242  * Notes:       Additional return values from calls through <pru_rcvoob> and
3243  *              <pr_domain->dom_externalize> depend on protocols other than
3244  *              TCP or AF_UNIX, which are documented above.
3245  */
3246 int
3247 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3248     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3249 {
3250         struct mbuf *m, **mp, *ml = NULL;
3251         struct mbuf *nextrecord, *free_list;
3252         int flags, error, offset;
3253         user_ssize_t len;
3254         struct protosw *pr = so->so_proto;
3255         int moff, type = 0;
3256         user_ssize_t orig_resid = uio_resid(uio);
3257         user_ssize_t delayed_copy_len;
3258         int can_delay;
3259         int need_event;
3260         struct proc *p = current_proc();
3261         boolean_t en_tracing = FALSE;
3262
3263         /*
3264          * Sanity check on the length passed by caller as we are making 'int'
3265          * comparisons
3266          */
3267         if (orig_resid < 0 || orig_resid > INT_MAX) {
3268                 return EINVAL;
3269         }
3270
3271         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3272             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3273             so->so_rcv.sb_hiwat);
3274
3275         socket_lock(so, 1);
3276         so_update_last_owner_locked(so, p);
3277         so_update_policy(so);
3278
3279 #ifdef MORE_LOCKING_DEBUG
3280         if (so->so_usecount == 1) {
3281                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3282                 /* NOTREACHED */
3283         }
3284 #endif
3285         mp = mp0;
3286         if (psa != NULL) {
3287                 *psa = NULL;
3288         }
3289         if (controlp != NULL) {
3290                 *controlp = NULL;
3291         }
3292         if (flagsp != NULL) {
3293                 flags = *flagsp & ~MSG_EOR;
3294         } else {
3295                 flags = 0;
3296         }
3297
3298         /*
3299          * If a recv attempt is made on a previously-accepted socket
3300          * that has been marked as inactive (disconnected), reject
3301          * the request.
3302          */
3303         if (so->so_flags & SOF_DEFUNCT) {
3304                 struct sockbuf *sb = &so->so_rcv;
3305
3306                 error = ENOTCONN;
3307                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3308                     __func__, proc_pid(p), proc_best_name(p),
3309                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3310                     SOCK_DOM(so), SOCK_TYPE(so), error);
3311                 /*
3312                  * This socket should have been disconnected and flushed
3313                  * prior to being returned from sodefunct(); there should
3314                  * be no data on its receive list, so panic otherwise.
3315                  */
3316                 if (so->so_state & SS_DEFUNCT) {
3317                         sb_empty_assert(sb, __func__);
3318                 }
3319                 socket_unlock(so, 1);
3320                 return error;
3321         }
3322
3323         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3324             pr->pr_usrreqs->pru_preconnect) {
3325                 /*
3326                  * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3327                  * calling write() right after this. *If* the app calls a read
3328                  * we do not want to block this read indefinetely. Thus,
3329                  * we trigger a connect so that the session gets initiated.
3330                  */
3331                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3332
3333                 if (error) {
3334                         socket_unlock(so, 1);
3335                         return error;
3336                 }
3337         }
3338
3339         if (ENTR_SHOULDTRACE &&
3340             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3341                 /*
3342                  * enable energy tracing for inet sockets that go over
3343                  * non-loopback interfaces only.
3344                  */
3345                 struct inpcb *inp = sotoinpcb(so);
3346                 if (inp->inp_last_outifp != NULL &&
3347                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3348                         en_tracing = TRUE;
3349                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3350                             VM_KERNEL_ADDRPERM(so),
3351                             ((so->so_state & SS_NBIO) ?
3352                             kEnTrFlagNonBlocking : 0),
3353                             (int64_t)orig_resid);
3354                 }
3355         }
3356
3357         /*
3358          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3359          * regardless of the flags argument. Here is the case were
3360          * out-of-band data is not inline.
3361          */
3362         if ((flags & MSG_OOB) ||
3363             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3364             (so->so_options & SO_OOBINLINE) == 0 &&
3365             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3366                 m = m_get(M_WAIT, MT_DATA);
3367                 if (m == NULL) {
3368                         socket_unlock(so, 1);
3369                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3370                             ENOBUFS, 0, 0, 0, 0);
3371                         return ENOBUFS;
3372                 }
3373                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3374                 if (error) {
3375                         goto bad;
3376                 }
3377                 socket_unlock(so, 0);
3378                 do {
3379                         error = uiomove(mtod(m, caddr_t),
3380                             imin(uio_resid(uio), m->m_len), uio);
3381                         m = m_free(m);
3382                 } while (uio_resid(uio) && error == 0 && m != NULL);
3383                 socket_lock(so, 0);
3384 bad:
3385                 if (m != NULL) {
3386                         m_freem(m);
3387                 }
3388
3389                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3390                         if (error == EWOULDBLOCK || error == EINVAL) {
3391                                 /*
3392                                  * Let's try to get normal data:
3393                                  * EWOULDBLOCK: out-of-band data not
3394                                  * receive yet. EINVAL: out-of-band data
3395                                  * already read.
3396                                  */
3397                                 error = 0;
3398                                 goto nooob;
3399                         } else if (error == 0 && flagsp != NULL) {
3400                                 *flagsp |= MSG_OOB;
3401                         }
3402                 }
3403                 socket_unlock(so, 1);
3404                 if (en_tracing) {
3405                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3406                             VM_KERNEL_ADDRPERM(so), 0,
3407                             (int64_t)(orig_resid - uio_resid(uio)));
3408                 }
3409                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3410                     0, 0, 0, 0);
3411
3412                 return error;
3413         }
3414 nooob:
3415         if (mp != NULL) {
3416                 *mp = NULL;
3417         }
3418
3419         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3420                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3421         }
3422
3423         free_list = NULL;
3424         delayed_copy_len = 0;
3425 restart:
3426 #ifdef MORE_LOCKING_DEBUG
3427         if (so->so_usecount <= 1) {
3428                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3429                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3430         }
3431 #endif
3432         /*
3433          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3434          * and if so just return to the caller.  This could happen when
3435          * soreceive() is called by a socket upcall function during the
3436          * time the socket is freed.  The socket buffer would have been
3437          * locked across the upcall, therefore we cannot put this thread
3438          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3439          * we may livelock), because the lock on the socket buffer will
3440          * only be released when the upcall routine returns to its caller.
3441          * Because the socket has been officially closed, there can be
3442          * no further read on it.
3443          *
3444          * A multipath subflow socket would have its SS_NOFDREF set by
3445          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3446          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3447          */
3448         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3449             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3450                 socket_unlock(so, 1);
3451                 return 0;
3452         }
3453
3454         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3455         if (error) {
3456                 socket_unlock(so, 1);
3457                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3458                     0, 0, 0, 0);
3459                 if (en_tracing) {
3460                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3461                             VM_KERNEL_ADDRPERM(so), 0,
3462                             (int64_t)(orig_resid - uio_resid(uio)));
3463                 }
3464                 return error;
3465         }
3466
3467         m = so->so_rcv.sb_mb;
3468         /*
3469          * If we have less data than requested, block awaiting more
3470          * (subject to any timeout) if:
3471          *   1. the current count is less than the low water mark, or
3472          *   2. MSG_WAITALL is set, and it is possible to do the entire
3473          *      receive operation at once if we block (resid <= hiwat).
3474          *   3. MSG_DONTWAIT is not set
3475          * If MSG_WAITALL is set but resid is larger than the receive buffer,
3476          * we have to do the receive in sections, and thus risk returning
3477          * a short count if a timeout or signal occurs after we start.
3478          */
3479         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3480             so->so_rcv.sb_cc < uio_resid(uio)) &&
3481             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3482             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3483             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3484                 /*
3485                  * Panic if we notice inconsistencies in the socket's
3486                  * receive list; both sb_mb and sb_cc should correctly
3487                  * reflect the contents of the list, otherwise we may
3488                  * end up with false positives during select() or poll()
3489                  * which could put the application in a bad state.
3490                  */
3491                 SB_MB_CHECK(&so->so_rcv);
3492
3493                 if (so->so_error) {
3494                         if (m != NULL) {
3495                                 goto dontblock;
3496                         }
3497                         error = so->so_error;
3498                         if ((flags & MSG_PEEK) == 0) {
3499                                 so->so_error = 0;
3500                         }
3501                         goto release;
3502                 }
3503                 if (so->so_state & SS_CANTRCVMORE) {
3504 #if CONTENT_FILTER
3505                         /*
3506                          * Deal with half closed connections
3507                          */
3508                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3509                             cfil_sock_data_pending(&so->so_rcv) != 0) {
3510                                 CFIL_LOG(LOG_INFO,
3511                                     "so %llx ignore SS_CANTRCVMORE",
3512                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3513                         } else
3514 #endif /* CONTENT_FILTER */
3515                         if (m != NULL) {
3516                                 goto dontblock;
3517                         } else {
3518                                 goto release;
3519                         }
3520                 }
3521                 for (; m != NULL; m = m->m_next) {
3522                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3523                                 m = so->so_rcv.sb_mb;
3524                                 goto dontblock;
3525                         }
3526                 }
3527                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3528                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3529                         error = ENOTCONN;
3530                         goto release;
3531                 }
3532                 if (uio_resid(uio) == 0) {
3533                         goto release;
3534                 }
3535
3536                 if ((so->so_state & SS_NBIO) ||
3537                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3538                         error = EWOULDBLOCK;
3539                         goto release;
3540                 }
3541                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3542                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3543                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3544 #if EVEN_MORE_LOCKING_DEBUG
3545                 if (socket_debug) {
3546                         printf("Waiting for socket data\n");
3547                 }
3548 #endif
3549
3550                 error = sbwait(&so->so_rcv);
3551 #if EVEN_MORE_LOCKING_DEBUG
3552                 if (socket_debug) {
3553                         printf("SORECEIVE - sbwait returned %d\n", error);
3554                 }
3555 #endif
3556                 if (so->so_usecount < 1) {
3557                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3558                             __func__, so, so->so_usecount);
3559                         /* NOTREACHED */
3560                 }
3561                 if (error) {
3562                         socket_unlock(so, 1);
3563                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3564                             0, 0, 0, 0);
3565                         if (en_tracing) {
3566                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3567                                     VM_KERNEL_ADDRPERM(so), 0,
3568                                     (int64_t)(orig_resid - uio_resid(uio)));
3569                         }
3570                         return error;
3571                 }
3572                 goto restart;
3573         }
3574 dontblock:
3575         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3576         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3577         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3578         nextrecord = m->m_nextpkt;
3579
3580         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3581                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3582                     mp0 == NULL);
3583                 if (error == ERESTART) {
3584                         goto restart;
3585                 } else if (error != 0) {
3586                         goto release;
3587                 }
3588                 orig_resid = 0;
3589         }
3590
3591         /*
3592          * Process one or more MT_CONTROL mbufs present before any data mbufs
3593          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3594          * just copy the data; if !MSG_PEEK, we call into the protocol to
3595          * perform externalization.
3596          */
3597         if (m != NULL && m->m_type == MT_CONTROL) {
3598                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3599                 if (error != 0) {
3600                         goto release;
3601                 }
3602                 orig_resid = 0;
3603         }
3604
3605         /*
3606          * If the socket is a TCP socket with message delivery
3607          * enabled, then create a control msg to deliver the
3608          * relative TCP sequence number for this data. Waiting
3609          * until this point will protect against failures to
3610          * allocate an mbuf for control msgs.
3611          */
3612         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3613             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3614                 struct mbuf *seq_cm;
3615
3616                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3617                     sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3618                 if (seq_cm == NULL) {
3619                         /* unable to allocate a control mbuf */
3620                         error = ENOBUFS;
3621                         goto release;
3622                 }
3623                 *controlp = seq_cm;
3624                 controlp = &seq_cm->m_next;
3625         }
3626
3627         if (m != NULL) {
3628                 if (!(flags & MSG_PEEK)) {
3629                         /*
3630                          * We get here because m points to an mbuf following
3631                          * any MT_SONAME or MT_CONTROL mbufs which have been
3632                          * processed above.  In any case, m should be pointing
3633                          * to the head of the mbuf chain, and the nextrecord
3634                          * should be either NULL or equal to m->m_nextpkt.
3635                          * See comments above about SB_LOCK.
3636                          */
3637                         if (m != so->so_rcv.sb_mb ||
3638                             m->m_nextpkt != nextrecord) {
3639                                 panic("%s: post-control !sync so=%p m=%p "
3640                                     "nextrecord=%p\n", __func__, so, m,
3641                                     nextrecord);
3642                                 /* NOTREACHED */
3643                         }
3644                         if (nextrecord == NULL) {
3645                                 so->so_rcv.sb_lastrecord = m;
3646                         }
3647                 }
3648                 type = m->m_type;
3649                 if (type == MT_OOBDATA) {
3650                         flags |= MSG_OOB;
3651                 }
3652         } else {
3653                 if (!(flags & MSG_PEEK)) {
3654                         SB_EMPTY_FIXUP(&so->so_rcv);
3655                 }
3656         }
3657         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3658         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3659
3660         moff = 0;
3661         offset = 0;
3662
3663         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3664                 can_delay = 1;
3665         } else {
3666                 can_delay = 0;
3667         }
3668
3669         need_event = 0;
3670
3671         while (m != NULL &&
3672             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3673                 if (m->m_type == MT_OOBDATA) {
3674                         if (type != MT_OOBDATA) {
3675                                 break;
3676                         }
3677                 } else if (type == MT_OOBDATA) {
3678                         break;
3679                 }
3680                 /*
3681                  * Make sure to allways set MSG_OOB event when getting
3682                  * out of band data inline.
3683                  */
3684                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3685                     (so->so_options & SO_OOBINLINE) != 0 &&
3686                     (so->so_state & SS_RCVATMARK) != 0) {
3687                         flags |= MSG_OOB;
3688                 }
3689                 so->so_state &= ~SS_RCVATMARK;
3690                 len = uio_resid(uio) - delayed_copy_len;
3691                 if (so->so_oobmark && len > so->so_oobmark - offset) {
3692                         len = so->so_oobmark - offset;
3693                 }
3694                 if (len > m->m_len - moff) {
3695                         len = m->m_len - moff;
3696                 }
3697                 /*
3698                  * If mp is set, just pass back the mbufs.
3699                  * Otherwise copy them out via the uio, then free.
3700                  * Sockbuf must be consistent here (points to current mbuf,
3701                  * it points to next record) when we drop priority;
3702                  * we must note any additions to the sockbuf when we
3703                  * block interrupts again.
3704                  */
3705                 if (mp == NULL) {
3706                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3707                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3708                         if (can_delay && len == m->m_len) {
3709                                 /*
3710                                  * only delay the copy if we're consuming the
3711                                  * mbuf and we're NOT in MSG_PEEK mode
3712                                  * and we have enough data to make it worthwile
3713                                  * to drop and retake the lock... can_delay
3714                                  * reflects the state of the 2 latter
3715                                  * constraints moff should always be zero
3716                                  * in these cases
3717                                  */
3718                                 delayed_copy_len += len;
3719                         } else {
3720                                 if (delayed_copy_len) {
3721                                         error = sodelayed_copy(so, uio,
3722                                             &free_list, &delayed_copy_len);
3723
3724                                         if (error) {
3725                                                 goto release;
3726                                         }
3727                                         /*
3728                                          * can only get here if MSG_PEEK is not
3729                                          * set therefore, m should point at the
3730                                          * head of the rcv queue; if it doesn't,
3731                                          * it means something drastically
3732                                          * changed while we were out from behind
3733                                          * the lock in sodelayed_copy. perhaps
3734                                          * a RST on the stream. in any event,
3735                                          * the stream has been interrupted. it's
3736                                          * probably best just to return whatever
3737                                          * data we've moved and let the caller
3738                                          * sort it out...
3739                                          */
3740                                         if (m != so->so_rcv.sb_mb) {
3741                                                 break;
3742                                         }
3743                                 }
3744                                 socket_unlock(so, 0);
3745                                 error = uiomove(mtod(m, caddr_t) + moff,
3746                                     (int)len, uio);
3747                                 socket_lock(so, 0);
3748
3749                                 if (error) {
3750                                         goto release;
3751                                 }
3752                         }
3753                 } else {
3754                         uio_setresid(uio, (uio_resid(uio) - len));
3755                 }
3756                 if (len == m->m_len - moff) {
3757                         if (m->m_flags & M_EOR) {
3758                                 flags |= MSG_EOR;
3759                         }
3760                         if (flags & MSG_PEEK) {
3761                                 m = m->m_next;
3762                                 moff = 0;
3763                         } else {
3764                                 nextrecord = m->m_nextpkt;
3765                                 sbfree(&so->so_rcv, m);
3766                                 m->m_nextpkt = NULL;
3767
3768                                 /*
3769                                  * If this packet is an unordered packet
3770                                  * (indicated by M_UNORDERED_DATA flag), remove
3771                                  * the additional bytes added to the
3772                                  * receive socket buffer size.
3773                                  */
3774                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3775                                     m->m_len &&
3776                                     (m->m_flags & M_UNORDERED_DATA) &&
3777                                     sbreserve(&so->so_rcv,
3778                                     so->so_rcv.sb_hiwat - m->m_len)) {
3779                                         if (so->so_msg_state->msg_uno_bytes >
3780                                             m->m_len) {
3781                                                 so->so_msg_state->
3782                                                 msg_uno_bytes -= m->m_len;
3783                                         } else {
3784                                                 so->so_msg_state->
3785                                                 msg_uno_bytes = 0;
3786                                         }
3787                                         m->m_flags &= ~M_UNORDERED_DATA;
3788                                 }
3789
3790                                 if (mp != NULL) {
3791                                         *mp = m;
3792                                         mp = &m->m_next;
3793                                         so->so_rcv.sb_mb = m = m->m_next;
3794                                         *mp = NULL;
3795                                 } else {
3796                                         if (free_list == NULL) {
3797                                                 free_list = m;
3798                                         } else {
3799                                                 ml->m_next = m;
3800                                         }
3801                                         ml = m;
3802                                         so->so_rcv.sb_mb = m = m->m_next;
3803                                         ml->m_next = NULL;
3804                                 }
3805                                 if (m != NULL) {
3806                                         m->m_nextpkt = nextrecord;
3807                                         if (nextrecord == NULL) {
3808                                                 so->so_rcv.sb_lastrecord = m;
3809                                         }
3810                                 } else {
3811                                         so->so_rcv.sb_mb = nextrecord;
3812                                         SB_EMPTY_FIXUP(&so->so_rcv);
3813                                 }
3814                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3815                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3816                         }
3817                 } else {
3818                         if (flags & MSG_PEEK) {
3819                                 moff += len;
3820                         } else {
3821                                 if (mp != NULL) {
3822                                         int copy_flag;
3823
3824                                         if (flags & MSG_DONTWAIT) {
3825                                                 copy_flag = M_DONTWAIT;
3826                                         } else {
3827                                                 copy_flag = M_WAIT;
3828                                         }
3829                                         *mp = m_copym(m, 0, len, copy_flag);
3830                                         /*
3831                                          * Failed to allocate an mbuf?
3832                                          * Adjust uio_resid back, it was
3833                                          * adjusted down by len bytes which
3834                                          * we didn't copy over.
3835                                          */
3836                                         if (*mp == NULL) {
3837                                                 uio_setresid(uio,
3838                                                     (uio_resid(uio) + len));
3839                                                 break;
3840                                         }
3841                                 }
3842                                 m->m_data += len;
3843                                 m->m_len -= len;
3844                                 so->so_rcv.sb_cc -= len;
3845                         }
3846                 }
3847                 if (so->so_oobmark) {
3848                         if ((flags & MSG_PEEK) == 0) {
3849                                 so->so_oobmark -= len;
3850                                 if (so->so_oobmark == 0) {
3851                                         so->so_state |= SS_RCVATMARK;
3852                                         /*
3853                                          * delay posting the actual event until
3854                                          * after any delayed copy processing
3855                                          * has finished
3856                                          */
3857                                         need_event = 1;
3858                                         break;
3859                                 }
3860                         } else {
3861                                 offset += len;
3862                                 if (offset == so->so_oobmark) {
3863                                         break;
3864                                 }
3865                         }
3866                 }
3867                 if (flags & MSG_EOR) {
3868                         break;
3869                 }
3870                 /*
3871                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3872                  * (for non-atomic socket), we must not quit until
3873                  * "uio->uio_resid == 0" or an error termination.
3874                  * If a signal/timeout occurs, return with a short
3875                  * count but without error.  Keep sockbuf locked
3876                  * against other readers.
3877                  */
3878                 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3879                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3880                     !sosendallatonce(so) && !nextrecord) {
3881                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3882 #if CONTENT_FILTER
3883                             && cfil_sock_data_pending(&so->so_rcv) == 0
3884 #endif /* CONTENT_FILTER */
3885                             )) {
3886                                 goto release;
3887                         }
3888
3889                         /*
3890                          * Depending on the protocol (e.g. TCP), the following
3891                          * might cause the socket lock to be dropped and later
3892                          * be reacquired, and more data could have arrived and
3893                          * have been appended to the receive socket buffer by
3894                          * the time it returns.  Therefore, we only sleep in
3895                          * sbwait() below if and only if the socket buffer is
3896                          * empty, in order to avoid a false sleep.
3897                          */
3898                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3899                             (((struct inpcb *)so->so_pcb)->inp_state !=
3900                             INPCB_STATE_DEAD)) {
3901                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3902                         }
3903
3904                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3905                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3906
3907                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3908                                 error = 0;
3909                                 goto release;
3910                         }
3911                         /*
3912                          * have to wait until after we get back from the sbwait
3913                          * to do the copy because we will drop the lock if we
3914                          * have enough data that has been delayed... by dropping
3915                          * the lock we open up a window allowing the netisr
3916                          * thread to process the incoming packets and to change
3917                          * the state of this socket... we're issuing the sbwait
3918                          * because the socket is empty and we're expecting the
3919                          * netisr thread to wake us up when more packets arrive;
3920                          * if we allow that processing to happen and then sbwait
3921                          * we could stall forever with packets sitting in the
3922                          * socket if no further packets arrive from the remote
3923                          * side.
3924                          *
3925                          * we want to copy before we've collected all the data
3926                          * to satisfy this request to allow the copy to overlap
3927                          * the incoming packet processing on an MP system
3928                          */
3929                         if (delayed_copy_len > sorecvmincopy &&
3930                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3931                                 error = sodelayed_copy(so, uio,
3932                                     &free_list, &delayed_copy_len);
3933
3934                                 if (error) {
3935                                         goto release;
3936                                 }
3937                         }
3938                         m = so->so_rcv.sb_mb;
3939                         if (m != NULL) {
3940                                 nextrecord = m->m_nextpkt;
3941                         }
3942                         SB_MB_CHECK(&so->so_rcv);
3943                 }
3944         }
3945 #ifdef MORE_LOCKING_DEBUG
3946         if (so->so_usecount <= 1) {
3947                 panic("%s: after big while so=%p ref=%d on socket\n",
3948                     __func__, so, so->so_usecount);
3949                 /* NOTREACHED */
3950         }
3951 #endif
3952
3953         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3954                 if (so->so_options & SO_DONTTRUNC) {
3955                         flags |= MSG_RCVMORE;
3956                 } else {
3957                         flags |= MSG_TRUNC;
3958                         if ((flags & MSG_PEEK) == 0) {
3959                                 (void) sbdroprecord(&so->so_rcv);
3960                         }
3961                 }
3962         }
3963
3964         /*
3965          * pru_rcvd below (for TCP) may cause more data to be received
3966          * if the socket lock is dropped prior to sending the ACK; some
3967          * legacy OpenTransport applications don't handle this well
3968          * (if it receives less data than requested while MSG_HAVEMORE
3969          * is set), and so we set the flag now based on what we know
3970          * prior to calling pru_rcvd.
3971          */
3972         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3973                 flags |= MSG_HAVEMORE;
3974         }
3975
3976         if ((flags & MSG_PEEK) == 0) {
3977                 if (m == NULL) {
3978                         so->so_rcv.sb_mb = nextrecord;
3979                         /*
3980                          * First part is an inline SB_EMPTY_FIXUP().  Second
3981                          * part makes sure sb_lastrecord is up-to-date if
3982                          * there is still data in the socket buffer.
3983                          */
3984                         if (so->so_rcv.sb_mb == NULL) {
3985                                 so->so_rcv.sb_mbtail = NULL;
3986                                 so->so_rcv.sb_lastrecord = NULL;
3987                         } else if (nextrecord->m_nextpkt == NULL) {
3988                                 so->so_rcv.sb_lastrecord = nextrecord;
3989                         }
3990                         SB_MB_CHECK(&so->so_rcv);
3991                 }
3992                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3993                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3994                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3995                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3996                 }
3997         }
3998
3999         if (delayed_copy_len) {
4000                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4001                 if (error) {
4002                         goto release;
4003                 }
4004         }
4005         if (free_list != NULL) {
4006                 m_freem_list(free_list);
4007                 free_list = NULL;
4008         }
4009         if (need_event) {
4010                 postevent(so, 0, EV_OOB);
4011         }
4012
4013         if (orig_resid == uio_resid(uio) && orig_resid &&
4014             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4015                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4016                 goto restart;
4017         }
4018
4019         if (flagsp != NULL) {
4020                 *flagsp |= flags;
4021         }
4022 release:
4023 #ifdef MORE_LOCKING_DEBUG
4024         if (so->so_usecount <= 1) {
4025                 panic("%s: release so=%p ref=%d on socket\n", __func__,
4026                     so, so->so_usecount);
4027                 /* NOTREACHED */
4028         }
4029 #endif
4030         if (delayed_copy_len) {
4031                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4032         }
4033
4034         if (free_list != NULL) {
4035                 m_freem_list(free_list);
4036         }
4037
4038         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4039
4040         if (en_tracing) {
4041                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4042                     VM_KERNEL_ADDRPERM(so),
4043                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4044                     (int64_t)(orig_resid - uio_resid(uio)));
4045         }
4046         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4047             so->so_rcv.sb_cc, 0, error);
4048
4049         return error;
4050 }
4051
4052 /*
4053  * Returns:     0                       Success
4054  *      uiomove:EFAULT
4055  */
4056 static int
4057 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4058     user_ssize_t *resid)
4059 {
4060         int error = 0;
4061         struct mbuf *m;
4062
4063         m = *free_list;
4064
4065         socket_unlock(so, 0);
4066
4067         while (m != NULL && error == 0) {
4068                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4069                 m = m->m_next;
4070         }
4071         m_freem_list(*free_list);
4072
4073         *free_list = NULL;
4074         *resid = 0;
4075
4076         socket_lock(so, 0);
4077
4078         return error;
4079 }
4080
4081 static int
4082 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4083     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4084 {
4085 #pragma unused(so)
4086         int error = 0;
4087         struct mbuf *ml, *m;
4088         int i = 0;
4089         struct uio *auio;
4090
4091         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4092             ml = ml->m_nextpkt, i++) {
4093                 auio = msgarray[i].uio;
4094                 for (m = ml; m != NULL; m = m->m_next) {
4095                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4096                         if (error != 0) {
4097                                 goto out;
4098                         }
4099                 }
4100         }
4101 out:
4102         m_freem_list(*free_list);
4103
4104         *free_list = NULL;
4105         *resid = 0;
4106
4107         return error;
4108 }
4109
4110 int
4111 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4112     int *flagsp)
4113 {
4114         struct mbuf *m;
4115         struct mbuf *nextrecord;
4116         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4117         int error;
4118         user_ssize_t len, pktlen, delayed_copy_len = 0;
4119         struct protosw *pr = so->so_proto;
4120         user_ssize_t resid;
4121         struct proc *p = current_proc();
4122         struct uio *auio = NULL;
4123         int npkts = 0;
4124         int sblocked = 0;
4125         struct sockaddr **psa = NULL;
4126         struct mbuf **controlp = NULL;
4127         int can_delay;
4128         int flags;
4129         struct mbuf *free_others = NULL;
4130
4131         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4132             so, uiocnt,
4133             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4134
4135         /*
4136          * Sanity checks:
4137          * - Only supports don't wait flags
4138          * - Only support datagram sockets (could be extended to raw)
4139          * - Must be atomic
4140          * - Protocol must support packet chains
4141          * - The uio array is NULL (should we panic?)
4142          */
4143         if (flagsp != NULL) {
4144                 flags = *flagsp;
4145         } else {
4146                 flags = 0;
4147         }
4148         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4149             MSG_NBIO)) {
4150                 printf("%s invalid flags 0x%x\n", __func__, flags);
4151                 error = EINVAL;
4152                 goto out;
4153         }
4154         if (so->so_type != SOCK_DGRAM) {
4155                 error = EINVAL;
4156                 goto out;
4157         }
4158         if (sosendallatonce(so) == 0) {
4159                 error = EINVAL;
4160                 goto out;
4161         }
4162         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4163                 error = EPROTONOSUPPORT;
4164                 goto out;
4165         }
4166         if (msgarray == NULL) {
4167                 printf("%s uioarray is NULL\n", __func__);
4168                 error = EINVAL;
4169                 goto out;
4170         }
4171         if (uiocnt == 0) {
4172                 printf("%s uiocnt is 0\n", __func__);
4173                 error = EINVAL;
4174                 goto out;
4175         }
4176         /*
4177          * Sanity check on the length passed by caller as we are making 'int'
4178          * comparisons
4179          */
4180         resid = recv_msg_array_resid(msgarray, uiocnt);
4181         if (resid < 0 || resid > INT_MAX) {
4182                 error = EINVAL;
4183                 goto out;
4184         }
4185
4186         if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4187                 can_delay = 1;
4188         } else {
4189                 can_delay = 0;
4190         }
4191
4192         socket_lock(so, 1);
4193         so_update_last_owner_locked(so, p);
4194         so_update_policy(so);
4195
4196 #if NECP
4197         so_update_necp_policy(so, NULL, NULL);
4198 #endif /* NECP */
4199
4200         /*
4201          * If a recv attempt is made on a previously-accepted socket
4202          * that has been marked as inactive (disconnected), reject
4203          * the request.
4204          */
4205         if (so->so_flags & SOF_DEFUNCT) {
4206                 struct sockbuf *sb = &so->so_rcv;
4207
4208                 error = ENOTCONN;
4209                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4210                     __func__, proc_pid(p), proc_best_name(p),
4211                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4212                     SOCK_DOM(so), SOCK_TYPE(so), error);
4213                 /*
4214                  * This socket should have been disconnected and flushed
4215                  * prior to being returned from sodefunct(); there should
4216                  * be no data on its receive list, so panic otherwise.
4217                  */
4218                 if (so->so_state & SS_DEFUNCT) {
4219                         sb_empty_assert(sb, __func__);
4220                 }
4221                 goto release;
4222         }
4223
4224 next:
4225         /*
4226          * The uio may be empty
4227          */
4228         if (npkts >= uiocnt) {
4229                 error = 0;
4230                 goto release;
4231         }
4232 restart:
4233         /*
4234          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4235          * and if so just return to the caller.  This could happen when
4236          * soreceive() is called by a socket upcall function during the
4237          * time the socket is freed.  The socket buffer would have been
4238          * locked across the upcall, therefore we cannot put this thread
4239          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4240          * we may livelock), because the lock on the socket buffer will
4241          * only be released when the upcall routine returns to its caller.
4242          * Because the socket has been officially closed, there can be
4243          * no further read on it.
4244          */
4245         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4246             (SS_NOFDREF | SS_CANTRCVMORE)) {
4247                 error = 0;
4248                 goto release;
4249         }
4250
4251         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4252         if (error) {
4253                 goto release;
4254         }
4255         sblocked = 1;
4256
4257         m = so->so_rcv.sb_mb;
4258         /*
4259          * Block awaiting more datagram if needed
4260          */
4261         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4262             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4263             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4264                 /*
4265                  * Panic if we notice inconsistencies in the socket's
4266                  * receive list; both sb_mb and sb_cc should correctly
4267                  * reflect the contents of the list, otherwise we may
4268                  * end up with false positives during select() or poll()
4269                  * which could put the application in a bad state.
4270                  */
4271                 SB_MB_CHECK(&so->so_rcv);
4272
4273                 if (so->so_error) {
4274                         error = so->so_error;
4275                         if ((flags & MSG_PEEK) == 0) {
4276                                 so->so_error = 0;
4277                         }
4278                         goto release;
4279                 }
4280                 if (so->so_state & SS_CANTRCVMORE) {
4281                         goto release;
4282                 }
4283                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4284                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4285                         error = ENOTCONN;
4286                         goto release;
4287                 }
4288                 if ((so->so_state & SS_NBIO) ||
4289                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4290                         error = EWOULDBLOCK;
4291                         goto release;
4292                 }
4293                 /*
4294                  * Do not block if we got some data
4295                  */
4296                 if (free_list != NULL) {
4297                         error = 0;
4298                         goto release;
4299                 }
4300
4301                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4302                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4303
4304                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4305                 sblocked = 0;
4306
4307                 error = sbwait(&so->so_rcv);
4308                 if (error) {
4309                         goto release;
4310                 }
4311                 goto restart;
4312         }
4313
4314         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4315         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4316         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4317
4318         /*
4319          * Consume the current uio index as we have a datagram
4320          */
4321         auio = msgarray[npkts].uio;
4322         resid = uio_resid(auio);
4323         msgarray[npkts].which |= SOCK_MSG_DATA;
4324         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4325             &msgarray[npkts].psa : NULL;
4326         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4327             &msgarray[npkts].controlp : NULL;
4328         npkts += 1;
4329         nextrecord = m->m_nextpkt;
4330
4331         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4332                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4333                 if (error == ERESTART) {
4334                         goto restart;
4335                 } else if (error != 0) {
4336                         goto release;
4337                 }
4338         }
4339
4340         if (m != NULL && m->m_type == MT_CONTROL) {
4341                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4342                 if (error != 0) {
4343                         goto release;
4344                 }
4345         }
4346
4347         if (m->m_pkthdr.len == 0) {
4348                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4349                     __func__, __LINE__,
4350                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4351                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4352                     m->m_type);
4353         }
4354
4355         /*
4356          * Loop to copy the mbufs of the current record
4357          * Support zero length packets
4358          */
4359         ml = NULL;
4360         pktlen = 0;
4361         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4362                 if (m->m_len == 0) {
4363                         panic("%p m_len zero", m);
4364                 }
4365                 if (m->m_type == 0) {
4366                         panic("%p m_type zero", m);
4367                 }
4368                 /*
4369                  * Clip to the residual length
4370                  */
4371                 if (len > m->m_len) {
4372                         len = m->m_len;
4373                 }
4374                 pktlen += len;
4375                 /*
4376                  * Copy the mbufs via the uio or delay the copy
4377                  * Sockbuf must be consistent here (points to current mbuf,
4378                  * it points to next record) when we drop priority;
4379                  * we must note any additions to the sockbuf when we
4380                  * block interrupts again.
4381                  */
4382                 if (len > 0 && can_delay == 0) {
4383                         socket_unlock(so, 0);
4384                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4385                         socket_lock(so, 0);
4386                         if (error) {
4387                                 goto release;
4388                         }
4389                 } else {
4390                         delayed_copy_len += len;
4391                 }
4392
4393                 if (len == m->m_len) {
4394                         /*
4395                          * m was entirely copied
4396                          */
4397                         sbfree(&so->so_rcv, m);
4398                         nextrecord = m->m_nextpkt;
4399                         m->m_nextpkt = NULL;
4400
4401                         /*
4402                          * Set the first packet to the head of the free list
4403                          */
4404                         if (free_list == NULL) {
4405                                 free_list = m;
4406                         }
4407                         /*
4408                          * Link current packet to tail of free list
4409                          */
4410                         if (ml == NULL) {
4411                                 if (free_tail != NULL) {
4412                                         free_tail->m_nextpkt = m;
4413                                 }
4414                                 free_tail = m;
4415                         }
4416                         /*
4417                          * Link current mbuf to last mbuf of current packet
4418                          */
4419                         if (ml != NULL) {
4420                                 ml->m_next = m;
4421                         }
4422                         ml = m;
4423
4424                         /*
4425                          * Move next buf to head of socket buffer
4426                          */
4427                         so->so_rcv.sb_mb = m = ml->m_next;
4428                         ml->m_next = NULL;
4429
4430                         if (m != NULL) {
4431                                 m->m_nextpkt = nextrecord;
4432                                 if (nextrecord == NULL) {
4433                                         so->so_rcv.sb_lastrecord = m;
4434                                 }
4435                         } else {
4436                                 so->so_rcv.sb_mb = nextrecord;
4437                                 SB_EMPTY_FIXUP(&so->so_rcv);
4438                         }
4439                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4440                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4441                 } else {
4442                         /*
4443                          * Stop the loop on partial copy
4444                          */
4445                         break;
4446                 }
4447         }
4448 #ifdef MORE_LOCKING_DEBUG
4449         if (so->so_usecount <= 1) {
4450                 panic("%s: after big while so=%llx ref=%d on socket\n",
4451                     __func__,
4452                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4453                 /* NOTREACHED */
4454         }
4455 #endif
4456         /*
4457          * Tell the caller we made a partial copy
4458          */
4459         if (m != NULL) {
4460                 if (so->so_options & SO_DONTTRUNC) {
4461                         /*
4462                          * Copyout first the freelist then the partial mbuf
4463                          */
4464                         socket_unlock(so, 0);
4465                         if (delayed_copy_len) {
4466                                 error = sodelayed_copy_list(so, msgarray,
4467                                     uiocnt, &free_list, &delayed_copy_len);
4468                         }
4469
4470                         if (error == 0) {
4471                                 error = uiomove(mtod(m, caddr_t), (int)len,
4472                                     auio);
4473                         }
4474                         socket_lock(so, 0);
4475                         if (error) {
4476                                 goto release;
4477                         }
4478
4479                         m->m_data += len;
4480                         m->m_len -= len;
4481                         so->so_rcv.sb_cc -= len;
4482                         flags |= MSG_RCVMORE;
4483                 } else {
4484                         (void) sbdroprecord(&so->so_rcv);
4485                         nextrecord = so->so_rcv.sb_mb;
4486                         m = NULL;
4487                         flags |= MSG_TRUNC;
4488                 }
4489         }
4490
4491         if (m == NULL) {
4492                 so->so_rcv.sb_mb = nextrecord;
4493                 /*
4494                  * First part is an inline SB_EMPTY_FIXUP().  Second
4495                  * part makes sure sb_lastrecord is up-to-date if
4496                  * there is still data in the socket buffer.
4497                  */
4498                 if (so->so_rcv.sb_mb == NULL) {
4499                         so->so_rcv.sb_mbtail = NULL;
4500                         so->so_rcv.sb_lastrecord = NULL;
4501                 } else if (nextrecord->m_nextpkt == NULL) {
4502                         so->so_rcv.sb_lastrecord = nextrecord;
4503                 }
4504                 SB_MB_CHECK(&so->so_rcv);
4505         }
4506         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4507         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4508
4509         /*
4510          * We can continue to the next packet as long as:
4511          * - We haven't exhausted the uio array
4512          * - There was no error
4513          * - A packet was not truncated
4514          * - We can still receive more data
4515          */
4516         if (npkts < uiocnt && error == 0 &&
4517             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4518             (so->so_state & SS_CANTRCVMORE) == 0) {
4519                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4520                 sblocked = 0;
4521
4522                 goto next;
4523         }
4524         if (flagsp != NULL) {
4525                 *flagsp |= flags;
4526         }
4527
4528 release:
4529         /*
4530          * pru_rcvd may cause more data to be received if the socket lock
4531          * is dropped so we set MSG_HAVEMORE now based on what we know.
4532          * That way the caller won't be surprised if it receives less data
4533          * than requested.
4534          */
4535         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4536                 flags |= MSG_HAVEMORE;
4537         }
4538
4539         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4540                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4541         }
4542
4543         if (sblocked) {
4544                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4545         } else {
4546                 socket_unlock(so, 1);
4547         }
4548
4549         if (delayed_copy_len) {
4550                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4551                     &free_list, &delayed_copy_len);
4552         }
4553 out:
4554         /*
4555          * Amortize the cost of freeing the mbufs
4556          */
4557         if (free_list != NULL) {
4558                 m_freem_list(free_list);
4559         }
4560         if (free_others != NULL) {
4561                 m_freem_list(free_others);
4562         }
4563
4564         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4565             0, 0, 0, 0);
4566         return error;
4567 }
4568
4569 /*
4570  * Returns:     0                       Success
4571  *              EINVAL
4572  *              ENOTCONN
4573  *      <pru_shutdown>:EINVAL
4574  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4575  *      <pru_shutdown>:ENOBUFS[TCP]
4576  *      <pru_shutdown>:EMSGSIZE[TCP]
4577  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4578  *      <pru_shutdown>:ENETUNREACH[TCP]
4579  *      <pru_shutdown>:ENETDOWN[TCP]
4580  *      <pru_shutdown>:ENOMEM[TCP]
4581  *      <pru_shutdown>:EACCES[TCP]
4582  *      <pru_shutdown>:EMSGSIZE[TCP]
4583  *      <pru_shutdown>:ENOBUFS[TCP]
4584  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4585  *      <pru_shutdown>:???              [other protocol families]
4586  */
4587 int
4588 soshutdown(struct socket *so, int how)
4589 {
4590         int error;
4591
4592         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4593
4594         switch (how) {
4595         case SHUT_RD:
4596         case SHUT_WR:
4597         case SHUT_RDWR:
4598                 socket_lock(so, 1);
4599                 if ((so->so_state &
4600                     (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4601                         error = ENOTCONN;
4602                 } else {
4603                         error = soshutdownlock(so, how);
4604                 }
4605                 socket_unlock(so, 1);
4606                 break;
4607         default:
4608                 error = EINVAL;
4609                 break;
4610         }
4611
4612         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4613
4614         return error;
4615 }
4616
4617 int
4618 soshutdownlock_final(struct socket *so, int how)
4619 {
4620         struct protosw *pr = so->so_proto;
4621         int error = 0;
4622
4623         sflt_notify(so, sock_evt_shutdown, &how);
4624
4625         if (how != SHUT_WR) {
4626                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4627                         /* read already shut down */
4628                         error = ENOTCONN;
4629                         goto done;
4630                 }
4631                 sorflush(so);
4632                 postevent(so, 0, EV_RCLOSED);
4633         }
4634         if (how != SHUT_RD) {
4635                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4636                         /* write already shut down */
4637                         error = ENOTCONN;
4638                         goto done;
4639                 }
4640                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4641                 postevent(so, 0, EV_WCLOSED);
4642         }
4643 done:
4644         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4645         return error;
4646 }
4647
4648 int
4649 soshutdownlock(struct socket *so, int how)
4650 {
4651         int error = 0;
4652
4653 #if CONTENT_FILTER
4654         /*
4655          * A content filter may delay the actual shutdown until it
4656          * has processed the pending data
4657          */
4658         if (so->so_flags & SOF_CONTENT_FILTER) {
4659                 error = cfil_sock_shutdown(so, &how);
4660                 if (error == EJUSTRETURN) {
4661                         error = 0;
4662                         goto done;
4663                 } else if (error != 0) {
4664                         goto done;
4665                 }
4666         }
4667 #endif /* CONTENT_FILTER */
4668
4669         error = soshutdownlock_final(so, how);
4670
4671 done:
4672         return error;
4673 }
4674
4675 void
4676 sowflush(struct socket *so)
4677 {
4678         struct sockbuf *sb = &so->so_snd;
4679
4680         /*
4681          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4682          * to prevent the socket buffer from being unexpectedly altered
4683          * while it is used by another thread in socket send/receive.
4684          *
4685          * sblock() must not fail here, hence the assertion.
4686          */
4687         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4688         VERIFY(sb->sb_flags & SB_LOCK);
4689
4690         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4691         sb->sb_flags            |= SB_DROP;
4692         sb->sb_upcall           = NULL;
4693         sb->sb_upcallarg        = NULL;
4694
4695         sbunlock(sb, TRUE);     /* keep socket locked */
4696
4697         selthreadclear(&sb->sb_sel);
4698         sbrelease(sb);
4699 }
4700
4701 void
4702 sorflush(struct socket *so)
4703 {
4704         struct sockbuf *sb = &so->so_rcv;
4705         struct protosw *pr = so->so_proto;
4706         struct sockbuf asb;
4707 #ifdef notyet
4708         lck_mtx_t *mutex_held;
4709         /*
4710          * XXX: This code is currently commented out, because we may get here
4711          * as part of sofreelastref(), and at that time, pr_getlock() may no
4712          * longer be able to return us the lock; this will be fixed in future.
4713          */
4714         if (so->so_proto->pr_getlock != NULL) {
4715                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4716         } else {
4717                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4718         }
4719
4720         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4721 #endif /* notyet */
4722
4723         sflt_notify(so, sock_evt_flush_read, NULL);
4724
4725         socantrcvmore(so);
4726
4727         /*
4728          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4729          * to prevent the socket buffer from being unexpectedly altered
4730          * while it is used by another thread in socket send/receive.
4731          *
4732          * sblock() must not fail here, hence the assertion.
4733          */
4734         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4735         VERIFY(sb->sb_flags & SB_LOCK);
4736
4737         /*
4738          * Copy only the relevant fields from "sb" to "asb" which we
4739          * need for sbrelease() to function.  In particular, skip
4740          * sb_sel as it contains the wait queue linkage, which would
4741          * wreak havoc if we were to issue selthreadclear() on "asb".
4742          * Make sure to not carry over SB_LOCK in "asb", as we need
4743          * to acquire it later as part of sbrelease().
4744          */
4745         bzero(&asb, sizeof(asb));
4746         asb.sb_cc               = sb->sb_cc;
4747         asb.sb_hiwat            = sb->sb_hiwat;
4748         asb.sb_mbcnt            = sb->sb_mbcnt;
4749         asb.sb_mbmax            = sb->sb_mbmax;
4750         asb.sb_ctl              = sb->sb_ctl;
4751         asb.sb_lowat            = sb->sb_lowat;
4752         asb.sb_mb               = sb->sb_mb;
4753         asb.sb_mbtail           = sb->sb_mbtail;
4754         asb.sb_lastrecord       = sb->sb_lastrecord;
4755         asb.sb_so               = sb->sb_so;
4756         asb.sb_flags            = sb->sb_flags;
4757         asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4758         asb.sb_flags            |= SB_DROP;
4759
4760         /*
4761          * Ideally we'd bzero() these and preserve the ones we need;
4762          * but to do that we'd need to shuffle things around in the
4763          * sockbuf, and we can't do it now because there are KEXTS
4764          * that are directly referring to the socket structure.
4765          *
4766          * Setting SB_DROP acts as a barrier to prevent further appends.
4767          * Clearing SB_SEL is done for selthreadclear() below.
4768          */
4769         sb->sb_cc               = 0;
4770         sb->sb_hiwat            = 0;
4771         sb->sb_mbcnt            = 0;
4772         sb->sb_mbmax            = 0;
4773         sb->sb_ctl              = 0;
4774         sb->sb_lowat            = 0;
4775         sb->sb_mb               = NULL;
4776         sb->sb_mbtail           = NULL;
4777         sb->sb_lastrecord       = NULL;
4778         sb->sb_timeo.tv_sec     = 0;
4779         sb->sb_timeo.tv_usec    = 0;
4780         sb->sb_upcall           = NULL;
4781         sb->sb_upcallarg        = NULL;
4782         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4783         sb->sb_flags            |= SB_DROP;
4784
4785         sbunlock(sb, TRUE);     /* keep socket locked */
4786
4787         /*
4788          * Note that selthreadclear() is called on the original "sb" and
4789          * not the local "asb" because of the way wait queue linkage is
4790          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4791          * should no longer be set (cleared above.)
4792          */
4793         selthreadclear(&sb->sb_sel);
4794
4795         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4796                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4797         }
4798
4799         sbrelease(&asb);
4800 }
4801
4802 /*
4803  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4804  * an additional variant to handle the case where the option value needs
4805  * to be some kind of integer, but not a specific size.
4806  * In addition to their use here, these functions are also called by the
4807  * protocol-level pr_ctloutput() routines.
4808  *
4809  * Returns:     0                       Success
4810  *              EINVAL
4811  *      copyin:EFAULT
4812  */
4813 int
4814 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4815 {
4816         size_t  valsize;
4817
4818         /*
4819          * If the user gives us more than we wanted, we ignore it,
4820          * but if we don't get the minimum length the caller
4821          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4822          * is set to however much we actually retrieved.
4823          */
4824         if ((valsize = sopt->sopt_valsize) < minlen) {
4825                 return EINVAL;
4826         }
4827         if (valsize > len) {
4828                 sopt->sopt_valsize = valsize = len;
4829         }
4830
4831         if (sopt->sopt_p != kernproc) {
4832                 return copyin(sopt->sopt_val, buf, valsize);
4833         }
4834
4835         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4836         return 0;
4837 }
4838
4839 /*
4840  * sooptcopyin_timeval
4841  *   Copy in a timeval value into tv_p, and take into account whether the
4842  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4843  *   code here so that we can verify the 64-bit tv_sec value before we lose
4844  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4845  */
4846 static int
4847 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4848 {
4849         int                     error;
4850
4851         if (proc_is64bit(sopt->sopt_p)) {
4852                 struct user64_timeval   tv64;
4853
4854                 if (sopt->sopt_valsize < sizeof(tv64)) {
4855                         return EINVAL;
4856                 }
4857
4858                 sopt->sopt_valsize = sizeof(tv64);
4859                 if (sopt->sopt_p != kernproc) {
4860                         error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4861                         if (error != 0) {
4862                                 return error;
4863                         }
4864                 } else {
4865                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4866                             sizeof(tv64));
4867                 }
4868                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4869                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4870                         return EDOM;
4871                 }
4872
4873                 tv_p->tv_sec = tv64.tv_sec;
4874                 tv_p->tv_usec = tv64.tv_usec;
4875         } else {
4876                 struct user32_timeval   tv32;
4877
4878                 if (sopt->sopt_valsize < sizeof(tv32)) {
4879                         return EINVAL;
4880                 }
4881
4882                 sopt->sopt_valsize = sizeof(tv32);
4883                 if (sopt->sopt_p != kernproc) {
4884                         error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4885                         if (error != 0) {
4886                                 return error;
4887                         }
4888                 } else {
4889                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4890                             sizeof(tv32));
4891                 }
4892 #ifndef __LP64__
4893                 /*
4894                  * K64todo "comparison is always false due to
4895                  * limited range of data type"
4896                  */
4897                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4898                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4899                         return EDOM;
4900                 }
4901 #endif
4902                 tv_p->tv_sec = tv32.tv_sec;
4903                 tv_p->tv_usec = tv32.tv_usec;
4904         }
4905         return 0;
4906 }
4907
4908 int
4909 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
4910 {
4911         kauth_cred_t cred =  NULL;
4912         proc_t ep = PROC_NULL;
4913         uid_t uid;
4914         int error = 0;
4915
4916         if (so->so_flags & SOF_DELEGATED) {
4917                 ep = proc_find(so->e_pid);
4918                 if (ep) {
4919                         cred = kauth_cred_proc_ref(ep);
4920                 }
4921         }
4922
4923         uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4924
4925         /* uid is 0 for root */
4926         if (uid != 0 || !allow_root) {
4927                 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4928         }
4929         if (cred) {
4930                 kauth_cred_unref(&cred);
4931         }
4932         if (ep != PROC_NULL) {
4933                 proc_rele(ep);
4934         }
4935
4936         return error;
4937 }
4938
4939 /*
4940  * Returns:     0                       Success
4941  *              EINVAL
4942  *              ENOPROTOOPT
4943  *              ENOBUFS
4944  *              EDOM
4945  *      sooptcopyin:EINVAL
4946  *      sooptcopyin:EFAULT
4947  *      sooptcopyin_timeval:EINVAL
4948  *      sooptcopyin_timeval:EFAULT
4949  *      sooptcopyin_timeval:EDOM
4950  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4951  *      <pr_ctloutput>:???w
4952  *      sflt_attach_private:???         [whatever a filter author chooses]
4953  *      <sf_setoption>:???              [whatever a filter author chooses]
4954  *
4955  * Notes:       Other <pru_listen> returns depend on the protocol family; all
4956  *              <sf_listen> returns depend on what the filter author causes
4957  *              their filter to return.
4958  */
4959 int
4960 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4961 {
4962         int     error, optval;
4963         struct  linger l;
4964         struct  timeval tv;
4965 #if CONFIG_MACF_SOCKET
4966         struct mac extmac;
4967 #endif /* MAC_SOCKET */
4968
4969         if (sopt->sopt_dir != SOPT_SET) {
4970                 sopt->sopt_dir = SOPT_SET;
4971         }
4972
4973         if (dolock) {
4974                 socket_lock(so, 1);
4975         }
4976
4977         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4978             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4979             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4980                 /* the socket has been shutdown, no more sockopt's */
4981                 error = EINVAL;
4982                 goto out;
4983         }
4984
4985         error = sflt_setsockopt(so, sopt);
4986         if (error != 0) {
4987                 if (error == EJUSTRETURN) {
4988                         error = 0;
4989                 }
4990                 goto out;
4991         }
4992
4993         if (sopt->sopt_level != SOL_SOCKET) {
4994                 if (so->so_proto != NULL &&
4995                     so->so_proto->pr_ctloutput != NULL) {
4996                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4997                         goto out;
4998                 }
4999                 error = ENOPROTOOPT;
5000         } else {
5001                 /*
5002                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5003                  * the protocol layer, if needed.  A zero value returned from
5004                  * the handler means use default socket-level processing as
5005                  * done by the rest of this routine.  Otherwise, any other
5006                  * return value indicates that the option is unsupported.
5007                  */
5008                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5009                     pru_socheckopt(so, sopt)) != 0) {
5010                         goto out;
5011                 }
5012
5013                 error = 0;
5014                 switch (sopt->sopt_name) {
5015                 case SO_LINGER:
5016                 case SO_LINGER_SEC:
5017                         error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5018                         if (error != 0) {
5019                                 goto out;
5020                         }
5021
5022                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5023                             l.l_linger : l.l_linger * hz;
5024                         if (l.l_onoff != 0) {
5025                                 so->so_options |= SO_LINGER;
5026                         } else {
5027                                 so->so_options &= ~SO_LINGER;
5028                         }
5029                         break;
5030
5031                 case SO_DEBUG:
5032                 case SO_KEEPALIVE:
5033                 case SO_DONTROUTE:
5034                 case SO_USELOOPBACK:
5035                 case SO_BROADCAST:
5036                 case SO_REUSEADDR:
5037                 case SO_REUSEPORT:
5038                 case SO_OOBINLINE:
5039                 case SO_TIMESTAMP:
5040                 case SO_TIMESTAMP_MONOTONIC:
5041                 case SO_TIMESTAMP_CONTINUOUS:
5042                 case SO_DONTTRUNC:
5043                 case SO_WANTMORE:
5044                 case SO_WANTOOBFLAG:
5045                 case SO_NOWAKEFROMSLEEP:
5046                 case SO_NOAPNFALLBK:
5047                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5048                             sizeof(optval));
5049                         if (error != 0) {
5050                                 goto out;
5051                         }
5052                         if (optval) {
5053                                 so->so_options |= sopt->sopt_name;
5054                         } else {
5055                                 so->so_options &= ~sopt->sopt_name;
5056                         }
5057                         break;
5058
5059                 case SO_SNDBUF:
5060                 case SO_RCVBUF:
5061                 case SO_SNDLOWAT:
5062                 case SO_RCVLOWAT:
5063                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5064                             sizeof(optval));
5065                         if (error != 0) {
5066                                 goto out;
5067                         }
5068
5069                         /*
5070                          * Values < 1 make no sense for any of these
5071                          * options, so disallow them.
5072                          */
5073                         if (optval < 1) {
5074                                 error = EINVAL;
5075                                 goto out;
5076                         }
5077
5078                         switch (sopt->sopt_name) {
5079                         case SO_SNDBUF:
5080                         case SO_RCVBUF: {
5081                                 struct sockbuf *sb =
5082                                     (sopt->sopt_name == SO_SNDBUF) ?
5083                                     &so->so_snd : &so->so_rcv;
5084                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5085                                         error = ENOBUFS;
5086                                         goto out;
5087                                 }
5088                                 sb->sb_flags |= SB_USRSIZE;
5089                                 sb->sb_flags &= ~SB_AUTOSIZE;
5090                                 sb->sb_idealsize = (u_int32_t)optval;
5091                                 break;
5092                         }
5093                         /*
5094                          * Make sure the low-water is never greater than
5095                          * the high-water.
5096                          */
5097                         case SO_SNDLOWAT: {
5098                                 int space = sbspace(&so->so_snd);
5099                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
5100
5101                                 if (so->so_snd.sb_flags & SB_UNIX) {
5102                                         struct unpcb *unp =
5103                                             (struct unpcb *)(so->so_pcb);
5104                                         if (unp != NULL &&
5105                                             unp->unp_conn != NULL) {
5106                                                 hiwat += unp->unp_conn->unp_cc;
5107                                         }
5108                                 }
5109
5110                                 so->so_snd.sb_lowat =
5111                                     (optval > hiwat) ?
5112                                     hiwat : optval;
5113
5114                                 if (space >= so->so_snd.sb_lowat) {
5115                                         sowwakeup(so);
5116                                 }
5117                                 break;
5118                         }
5119                         case SO_RCVLOWAT: {
5120                                 int64_t data_len;
5121                                 so->so_rcv.sb_lowat =
5122                                     (optval > so->so_rcv.sb_hiwat) ?
5123                                     so->so_rcv.sb_hiwat : optval;
5124                                 data_len = so->so_rcv.sb_cc
5125                                     - so->so_rcv.sb_ctl;
5126                                 if (data_len >= so->so_rcv.sb_lowat) {
5127                                         sorwakeup(so);
5128                                 }
5129                                 break;
5130                         }
5131                         }
5132                         break;
5133
5134                 case SO_SNDTIMEO:
5135                 case SO_RCVTIMEO:
5136                         error = sooptcopyin_timeval(sopt, &tv);
5137                         if (error != 0) {
5138                                 goto out;
5139                         }
5140
5141                         switch (sopt->sopt_name) {
5142                         case SO_SNDTIMEO:
5143                                 so->so_snd.sb_timeo = tv;
5144                                 break;
5145                         case SO_RCVTIMEO:
5146                                 so->so_rcv.sb_timeo = tv;
5147                                 break;
5148                         }
5149                         break;
5150
5151                 case SO_NKE: {
5152                         struct so_nke nke;
5153
5154                         error = sooptcopyin(sopt, &nke, sizeof(nke),
5155                             sizeof(nke));
5156                         if (error != 0) {
5157                                 goto out;
5158                         }
5159
5160                         error = sflt_attach_internal(so, nke.nke_handle);
5161                         break;
5162                 }
5163
5164                 case SO_NOSIGPIPE:
5165                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5166                             sizeof(optval));
5167                         if (error != 0) {
5168                                 goto out;
5169                         }
5170                         if (optval != 0) {
5171                                 so->so_flags |= SOF_NOSIGPIPE;
5172                         } else {
5173                                 so->so_flags &= ~SOF_NOSIGPIPE;
5174                         }
5175                         break;
5176
5177                 case SO_NOADDRERR:
5178                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5179                             sizeof(optval));
5180                         if (error != 0) {
5181                                 goto out;
5182                         }
5183                         if (optval != 0) {
5184                                 so->so_flags |= SOF_NOADDRAVAIL;
5185                         } else {
5186                                 so->so_flags &= ~SOF_NOADDRAVAIL;
5187                         }
5188                         break;
5189
5190                 case SO_REUSESHAREUID:
5191                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5192                             sizeof(optval));
5193                         if (error != 0) {
5194                                 goto out;
5195                         }
5196                         if (optval != 0) {
5197                                 so->so_flags |= SOF_REUSESHAREUID;
5198                         } else {
5199                                 so->so_flags &= ~SOF_REUSESHAREUID;
5200                         }
5201                         break;
5202
5203                 case SO_NOTIFYCONFLICT:
5204                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5205                                 error = EPERM;
5206                                 goto out;
5207                         }
5208                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5209                             sizeof(optval));
5210                         if (error != 0) {
5211                                 goto out;
5212                         }
5213                         if (optval != 0) {
5214                                 so->so_flags |= SOF_NOTIFYCONFLICT;
5215                         } else {
5216                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5217                         }
5218                         break;
5219
5220                 case SO_RESTRICTIONS:
5221                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5222                             sizeof(optval));
5223                         if (error != 0) {
5224                                 goto out;
5225                         }
5226
5227                         error = so_set_restrictions(so, optval);
5228                         break;
5229
5230                 case SO_AWDL_UNRESTRICTED:
5231                         if (SOCK_DOM(so) != PF_INET &&
5232                             SOCK_DOM(so) != PF_INET6) {
5233                                 error = EOPNOTSUPP;
5234                                 goto out;
5235                         }
5236                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5237                             sizeof(optval));
5238                         if (error != 0) {
5239                                 goto out;
5240                         }
5241                         if (optval != 0) {
5242                                 error = soopt_cred_check(so,
5243                                     PRIV_NET_RESTRICTED_AWDL, false);
5244                                 if (error == 0) {
5245                                         inp_set_awdl_unrestricted(
5246                                                 sotoinpcb(so));
5247                                 }
5248                         } else {
5249                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
5250                         }
5251                         break;
5252                 case SO_INTCOPROC_ALLOW:
5253                         if (SOCK_DOM(so) != PF_INET6) {
5254                                 error = EOPNOTSUPP;
5255                                 goto out;
5256                         }
5257                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5258                             sizeof(optval));
5259                         if (error != 0) {
5260                                 goto out;
5261                         }
5262                         if (optval != 0 &&
5263                             inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5264                                 error = soopt_cred_check(so,
5265                                     PRIV_NET_RESTRICTED_INTCOPROC, false);
5266                                 if (error == 0) {
5267                                         inp_set_intcoproc_allowed(
5268                                                 sotoinpcb(so));
5269                                 }
5270                         } else if (optval == 0) {
5271                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
5272                         }
5273                         break;
5274
5275                 case SO_LABEL:
5276 #if CONFIG_MACF_SOCKET
5277                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5278                             sizeof(extmac))) != 0) {
5279                                 goto out;
5280                         }
5281
5282                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5283                             so, &extmac);
5284 #else
5285                         error = EOPNOTSUPP;
5286 #endif /* MAC_SOCKET */
5287                         break;
5288
5289                 case SO_UPCALLCLOSEWAIT:
5290                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5291                             sizeof(optval));
5292                         if (error != 0) {
5293                                 goto out;
5294                         }
5295                         if (optval != 0) {
5296                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5297                         } else {
5298                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5299                         }
5300                         break;
5301
5302                 case SO_RANDOMPORT:
5303                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5304                             sizeof(optval));
5305                         if (error != 0) {
5306                                 goto out;
5307                         }
5308                         if (optval != 0) {
5309                                 so->so_flags |= SOF_BINDRANDOMPORT;
5310                         } else {
5311                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
5312                         }
5313                         break;
5314
5315                 case SO_NP_EXTENSIONS: {
5316                         struct so_np_extensions sonpx;
5317
5318                         error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5319                             sizeof(sonpx));
5320                         if (error != 0) {
5321                                 goto out;
5322                         }
5323                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5324                                 error = EINVAL;
5325                                 goto out;
5326                         }
5327                         /*
5328                          * Only one bit defined for now
5329                          */
5330                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5331                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5332                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
5333                                 } else {
5334                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5335                                 }
5336                         }
5337                         break;
5338                 }
5339
5340                 case SO_TRAFFIC_CLASS: {
5341                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5342                             sizeof(optval));
5343                         if (error != 0) {
5344                                 goto out;
5345                         }
5346                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5347                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5348                                 error = so_set_net_service_type(so, netsvc);
5349                                 goto out;
5350                         }
5351                         error = so_set_traffic_class(so, optval);
5352                         if (error != 0) {
5353                                 goto out;
5354                         }
5355                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5356                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5357                         break;
5358                 }
5359
5360                 case SO_RECV_TRAFFIC_CLASS: {
5361                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5362                             sizeof(optval));
5363                         if (error != 0) {
5364                                 goto out;
5365                         }
5366                         if (optval == 0) {
5367                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5368                         } else {
5369                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5370                         }
5371                         break;
5372                 }
5373
5374 #if (DEVELOPMENT || DEBUG)
5375                 case SO_TRAFFIC_CLASS_DBG: {
5376                         struct so_tcdbg so_tcdbg;
5377
5378                         error = sooptcopyin(sopt, &so_tcdbg,
5379                             sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5380                         if (error != 0) {
5381                                 goto out;
5382                         }
5383                         error = so_set_tcdbg(so, &so_tcdbg);
5384                         if (error != 0) {
5385                                 goto out;
5386                         }
5387                         break;
5388                 }
5389 #endif /* (DEVELOPMENT || DEBUG) */
5390
5391                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5392                         error = priv_check_cred(kauth_cred_get(),
5393                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5394                         if (error != 0) {
5395                                 goto out;
5396                         }
5397                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5398                             sizeof(optval));
5399                         if (error != 0) {
5400                                 goto out;
5401                         }
5402                         if (optval == 0) {
5403                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5404                         } else {
5405                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5406                         }
5407                         break;
5408
5409 #if (DEVELOPMENT || DEBUG)
5410                 case SO_DEFUNCTIT:
5411                         error = sosetdefunct(current_proc(), so, 0, FALSE);
5412                         if (error == 0) {
5413                                 error = sodefunct(current_proc(), so, 0);
5414                         }
5415
5416                         break;
5417 #endif /* (DEVELOPMENT || DEBUG) */
5418
5419                 case SO_DEFUNCTOK:
5420                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5421                             sizeof(optval));
5422                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5423                                 if (error == 0) {
5424                                         error = EBADF;
5425                                 }
5426                                 goto out;
5427                         }
5428                         /*
5429                          * Any process can set SO_DEFUNCTOK (clear
5430                          * SOF_NODEFUNCT), but only root can clear
5431                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5432                          */
5433                         if (optval == 0 &&
5434                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5435                                 error = EPERM;
5436                                 goto out;
5437                         }
5438                         if (optval) {
5439                                 so->so_flags &= ~SOF_NODEFUNCT;
5440                         } else {
5441                                 so->so_flags |= SOF_NODEFUNCT;
5442                         }
5443
5444                         if (SOCK_DOM(so) == PF_INET ||
5445                             SOCK_DOM(so) == PF_INET6) {
5446                                 char s[MAX_IPv6_STR_LEN];
5447                                 char d[MAX_IPv6_STR_LEN];
5448                                 struct inpcb *inp = sotoinpcb(so);
5449
5450                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5451                                     "[%s %s:%d -> %s:%d] is now marked "
5452                                     "as %seligible for "
5453                                     "defunct\n", __func__, proc_selfpid(),
5454                                     proc_best_name(current_proc()),
5455                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5456                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5457                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5458                                     ((SOCK_DOM(so) == PF_INET) ?
5459                                     (void *)&inp->inp_laddr.s_addr :
5460                                     (void *)&inp->in6p_laddr), s, sizeof(s)),
5461                                     ntohs(inp->in6p_lport),
5462                                     inet_ntop(SOCK_DOM(so),
5463                                     (SOCK_DOM(so) == PF_INET) ?
5464                                     (void *)&inp->inp_faddr.s_addr :
5465                                     (void *)&inp->in6p_faddr, d, sizeof(d)),
5466                                     ntohs(inp->in6p_fport),
5467                                     (so->so_flags & SOF_NODEFUNCT) ?
5468                                     "not " : "");
5469                         } else {
5470                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5471                                     "is now marked as %seligible for "
5472                                     "defunct\n",
5473                                     __func__, proc_selfpid(),
5474                                     proc_best_name(current_proc()),
5475                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5476                                     SOCK_DOM(so), SOCK_TYPE(so),
5477                                     (so->so_flags & SOF_NODEFUNCT) ?
5478                                     "not " : "");
5479                         }
5480                         break;
5481
5482                 case SO_ISDEFUNCT:
5483                         /* This option is not settable */
5484                         error = EINVAL;
5485                         break;
5486
5487                 case SO_OPPORTUNISTIC:
5488                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5489                             sizeof(optval));
5490                         if (error == 0) {
5491                                 error = so_set_opportunistic(so, optval);
5492                         }
5493                         break;
5494
5495                 case SO_FLUSH:
5496                         /* This option is handled by lower layer(s) */
5497                         error = 0;
5498                         break;
5499
5500                 case SO_RECV_ANYIF:
5501                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5502                             sizeof(optval));
5503                         if (error == 0) {
5504                                 error = so_set_recv_anyif(so, optval);
5505                         }
5506                         break;
5507
5508                 case SO_TRAFFIC_MGT_BACKGROUND: {
5509                         /* This option is handled by lower layer(s) */
5510                         error = 0;
5511                         break;
5512                 }
5513
5514 #if FLOW_DIVERT
5515                 case SO_FLOW_DIVERT_TOKEN:
5516                         error = flow_divert_token_set(so, sopt);
5517                         break;
5518 #endif  /* FLOW_DIVERT */
5519
5520
5521                 case SO_DELEGATED:
5522                         if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5523                             sizeof(optval))) != 0) {
5524                                 break;
5525                         }
5526
5527                         error = so_set_effective_pid(so, optval, sopt->sopt_p);
5528                         break;
5529
5530                 case SO_DELEGATED_UUID: {
5531                         uuid_t euuid;
5532
5533                         if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5534                             sizeof(euuid))) != 0) {
5535                                 break;
5536                         }
5537
5538                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5539                         break;
5540                 }
5541
5542 #if NECP
5543                 case SO_NECP_ATTRIBUTES:
5544                         error = necp_set_socket_attributes(so, sopt);
5545                         break;
5546
5547                 case SO_NECP_CLIENTUUID:
5548                         if (SOCK_DOM(so) == PF_MULTIPATH) {
5549                                 /* Handled by MPTCP itself */
5550                                 break;
5551                         }
5552
5553                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5554                                 error = EINVAL;
5555                                 goto out;
5556                         }
5557
5558                         struct inpcb *inp = sotoinpcb(so);
5559                         if (!uuid_is_null(inp->necp_client_uuid)) {
5560                                 // Clear out the old client UUID if present
5561                                 necp_inpcb_remove_cb(inp);
5562                         }
5563
5564                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5565                             sizeof(uuid_t), sizeof(uuid_t));
5566                         if (error != 0) {
5567                                 goto out;
5568                         }
5569
5570                         if (uuid_is_null(inp->necp_client_uuid)) {
5571                                 error = EINVAL;
5572                                 goto out;
5573                         }
5574
5575                         error = necp_client_register_socket_flow(so->last_pid,
5576                             inp->necp_client_uuid, inp);
5577                         if (error != 0) {
5578                                 uuid_clear(inp->necp_client_uuid);
5579                                 goto out;
5580                         }
5581
5582                         if (inp->inp_lport != 0) {
5583                                 // There is bound local port, so this is not
5584                                 // a fresh socket. Assign to the client.
5585                                 necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5586                         }
5587
5588                         break;
5589 #endif /* NECP */
5590
5591                 case SO_EXTENDED_BK_IDLE:
5592                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5593                             sizeof(optval));
5594                         if (error == 0) {
5595                                 error = so_set_extended_bk_idle(so, optval);
5596                         }
5597                         break;
5598
5599                 case SO_MARK_CELLFALLBACK:
5600                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5601                             sizeof(optval));
5602                         if (error != 0) {
5603                                 goto out;
5604                         }
5605                         if (optval < 0) {
5606                                 error = EINVAL;
5607                                 goto out;
5608                         }
5609                         if (optval == 0) {
5610                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5611                         } else {
5612                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5613                         }
5614                         break;
5615
5616                 case SO_NET_SERVICE_TYPE: {
5617                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5618                             sizeof(optval));
5619                         if (error != 0) {
5620                                 goto out;
5621                         }
5622                         error = so_set_net_service_type(so, optval);
5623                         break;
5624                 }
5625
5626                 case SO_QOSMARKING_POLICY_OVERRIDE:
5627                         error = priv_check_cred(kauth_cred_get(),
5628                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5629                         if (error != 0) {
5630                                 goto out;
5631                         }
5632                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5633                             sizeof(optval));
5634                         if (error != 0) {
5635                                 goto out;
5636                         }
5637                         if (optval == 0) {
5638                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5639                         } else {
5640                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5641                         }
5642                         break;
5643
5644                 default:
5645                         error = ENOPROTOOPT;
5646                         break;
5647                 }
5648                 if (error == 0 && so->so_proto != NULL &&
5649                     so->so_proto->pr_ctloutput != NULL) {
5650                         (void) so->so_proto->pr_ctloutput(so, sopt);
5651                 }
5652         }
5653 out:
5654         if (dolock) {
5655                 socket_unlock(so, 1);
5656         }
5657         return error;
5658 }
5659
5660 /* Helper routines for getsockopt */
5661 int
5662 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5663 {
5664         int     error;
5665         size_t  valsize;
5666
5667         error = 0;
5668
5669         /*
5670          * Documented get behavior is that we always return a value,
5671          * possibly truncated to fit in the user's buffer.
5672          * Traditional behavior is that we always tell the user
5673          * precisely how much we copied, rather than something useful
5674          * like the total amount we had available for her.
5675          * Note that this interface is not idempotent; the entire answer must
5676          * generated ahead of time.
5677          */
5678         valsize = min(len, sopt->sopt_valsize);
5679         sopt->sopt_valsize = valsize;
5680         if (sopt->sopt_val != USER_ADDR_NULL) {
5681                 if (sopt->sopt_p != kernproc) {
5682                         error = copyout(buf, sopt->sopt_val, valsize);
5683                 } else {
5684                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5685                 }
5686         }
5687         return error;
5688 }
5689
5690 static int
5691 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5692 {
5693         int                     error;
5694         size_t                  len;
5695         struct user64_timeval   tv64 = {};
5696         struct user32_timeval   tv32 = {};
5697         const void *            val;
5698         size_t                  valsize;
5699
5700         error = 0;
5701         if (proc_is64bit(sopt->sopt_p)) {
5702                 len = sizeof(tv64);
5703                 tv64.tv_sec = tv_p->tv_sec;
5704                 tv64.tv_usec = tv_p->tv_usec;
5705                 val = &tv64;
5706         } else {
5707                 len = sizeof(tv32);
5708                 tv32.tv_sec = tv_p->tv_sec;
5709                 tv32.tv_usec = tv_p->tv_usec;
5710                 val = &tv32;
5711         }
5712         valsize = min(len, sopt->sopt_valsize);
5713         sopt->sopt_valsize = valsize;
5714         if (sopt->sopt_val != USER_ADDR_NULL) {
5715                 if (sopt->sopt_p != kernproc) {
5716                         error = copyout(val, sopt->sopt_val, valsize);
5717                 } else {
5718                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5719                 }
5720         }
5721         return error;
5722 }
5723
5724 /*
5725  * Return:      0                       Success
5726  *              ENOPROTOOPT
5727  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5728  *      <pr_ctloutput>:???
5729  *      <sf_getoption>:???
5730  */
5731 int
5732 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5733 {
5734         int     error, optval;
5735         struct  linger l;
5736         struct  timeval tv;
5737 #if CONFIG_MACF_SOCKET
5738         struct mac extmac;
5739 #endif /* MAC_SOCKET */
5740
5741         if (sopt->sopt_dir != SOPT_GET) {
5742                 sopt->sopt_dir = SOPT_GET;
5743         }
5744
5745         if (dolock) {
5746                 socket_lock(so, 1);
5747         }
5748
5749         error = sflt_getsockopt(so, sopt);
5750         if (error != 0) {
5751                 if (error == EJUSTRETURN) {
5752                         error = 0;
5753                 }
5754                 goto out;
5755         }
5756
5757         if (sopt->sopt_level != SOL_SOCKET) {
5758                 if (so->so_proto != NULL &&
5759                     so->so_proto->pr_ctloutput != NULL) {
5760                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5761                         goto out;
5762                 }
5763                 error = ENOPROTOOPT;
5764         } else {
5765                 /*
5766                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5767                  * the protocol layer, if needed.  A zero value returned from
5768                  * the handler means use default socket-level processing as
5769                  * done by the rest of this routine.  Otherwise, any other
5770                  * return value indicates that the option is unsupported.
5771                  */
5772                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5773                     pru_socheckopt(so, sopt)) != 0) {
5774                         goto out;
5775                 }
5776
5777                 error = 0;
5778                 switch (sopt->sopt_name) {
5779                 case SO_LINGER:
5780                 case SO_LINGER_SEC:
5781                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5782                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5783                             so->so_linger : so->so_linger / hz;
5784                         error = sooptcopyout(sopt, &l, sizeof(l));
5785                         break;
5786
5787                 case SO_USELOOPBACK:
5788                 case SO_DONTROUTE:
5789                 case SO_DEBUG:
5790                 case SO_KEEPALIVE:
5791                 case SO_REUSEADDR:
5792                 case SO_REUSEPORT:
5793                 case SO_BROADCAST:
5794                 case SO_OOBINLINE:
5795                 case SO_TIMESTAMP:
5796                 case SO_TIMESTAMP_MONOTONIC:
5797                 case SO_TIMESTAMP_CONTINUOUS:
5798                 case SO_DONTTRUNC:
5799                 case SO_WANTMORE:
5800                 case SO_WANTOOBFLAG:
5801                 case SO_NOWAKEFROMSLEEP:
5802                 case SO_NOAPNFALLBK:
5803                         optval = so->so_options & sopt->sopt_name;
5804 integer:
5805                         error = sooptcopyout(sopt, &optval, sizeof(optval));
5806                         break;
5807
5808                 case SO_TYPE:
5809                         optval = so->so_type;
5810                         goto integer;
5811
5812                 case SO_NREAD:
5813                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5814                                 int pkt_total;
5815                                 struct mbuf *m1;
5816
5817                                 pkt_total = 0;
5818                                 m1 = so->so_rcv.sb_mb;
5819                                 while (m1 != NULL) {
5820                                         if (m1->m_type == MT_DATA ||
5821                                             m1->m_type == MT_HEADER ||
5822                                             m1->m_type == MT_OOBDATA) {
5823                                                 pkt_total += m1->m_len;
5824                                         }
5825                                         m1 = m1->m_next;
5826                                 }
5827                                 optval = pkt_total;
5828                         } else {
5829                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5830                         }
5831                         goto integer;
5832
5833                 case SO_NUMRCVPKT:
5834                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5835                                 int cnt = 0;
5836                                 struct mbuf *m1;
5837
5838                                 m1 = so->so_rcv.sb_mb;
5839                                 while (m1 != NULL) {
5840                                         if (m1->m_type == MT_DATA ||
5841                                             m1->m_type == MT_HEADER ||
5842                                             m1->m_type == MT_OOBDATA) {
5843                                                 cnt += 1;
5844                                         }
5845                                         m1 = m1->m_nextpkt;
5846                                 }
5847                                 optval = cnt;
5848                                 goto integer;
5849                         } else {
5850                                 error = EINVAL;
5851                                 break;
5852                         }
5853
5854                 case SO_NWRITE:
5855                         optval = so->so_snd.sb_cc;
5856                         goto integer;
5857
5858                 case SO_ERROR:
5859                         optval = so->so_error;
5860                         so->so_error = 0;
5861                         goto integer;
5862
5863                 case SO_SNDBUF: {
5864                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5865
5866                         if (so->so_snd.sb_flags & SB_UNIX) {
5867                                 struct unpcb *unp =
5868                                     (struct unpcb *)(so->so_pcb);
5869                                 if (unp != NULL && unp->unp_conn != NULL) {
5870                                         hiwat += unp->unp_conn->unp_cc;
5871                                 }
5872                         }
5873
5874                         optval = hiwat;
5875                         goto integer;
5876                 }
5877                 case SO_RCVBUF:
5878                         optval = so->so_rcv.sb_hiwat;
5879                         goto integer;
5880
5881                 case SO_SNDLOWAT:
5882                         optval = so->so_snd.sb_lowat;
5883                         goto integer;
5884
5885                 case SO_RCVLOWAT:
5886                         optval = so->so_rcv.sb_lowat;
5887                         goto integer;
5888
5889                 case SO_SNDTIMEO:
5890                 case SO_RCVTIMEO:
5891                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5892                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5893
5894                         error = sooptcopyout_timeval(sopt, &tv);
5895                         break;
5896
5897                 case SO_NOSIGPIPE:
5898                         optval = (so->so_flags & SOF_NOSIGPIPE);
5899                         goto integer;
5900
5901                 case SO_NOADDRERR:
5902                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5903                         goto integer;
5904
5905                 case SO_REUSESHAREUID:
5906                         optval = (so->so_flags & SOF_REUSESHAREUID);
5907                         goto integer;
5908
5909
5910                 case SO_NOTIFYCONFLICT:
5911                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5912                         goto integer;
5913
5914                 case SO_RESTRICTIONS:
5915                         optval = so_get_restrictions(so);
5916                         goto integer;
5917
5918                 case SO_AWDL_UNRESTRICTED:
5919                         if (SOCK_DOM(so) == PF_INET ||
5920                             SOCK_DOM(so) == PF_INET6) {
5921                                 optval = inp_get_awdl_unrestricted(
5922                                         sotoinpcb(so));
5923                                 goto integer;
5924                         } else {
5925                                 error = EOPNOTSUPP;
5926                         }
5927                         break;
5928
5929                 case SO_INTCOPROC_ALLOW:
5930                         if (SOCK_DOM(so) == PF_INET6) {
5931                                 optval = inp_get_intcoproc_allowed(
5932                                         sotoinpcb(so));
5933                                 goto integer;
5934                         } else {
5935                                 error = EOPNOTSUPP;
5936                         }
5937                         break;
5938
5939                 case SO_LABEL:
5940 #if CONFIG_MACF_SOCKET
5941                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5942                             sizeof(extmac))) != 0 ||
5943                             (error = mac_socket_label_get(proc_ucred(
5944                                     sopt->sopt_p), so, &extmac)) != 0) {
5945                                 break;
5946                         }
5947
5948                         error = sooptcopyout(sopt, &extmac, sizeof(extmac));
5949 #else
5950                         error = EOPNOTSUPP;
5951 #endif /* MAC_SOCKET */
5952                         break;
5953
5954                 case SO_PEERLABEL:
5955 #if CONFIG_MACF_SOCKET
5956                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5957                             sizeof(extmac))) != 0 ||
5958                             (error = mac_socketpeer_label_get(proc_ucred(
5959                                     sopt->sopt_p), so, &extmac)) != 0) {
5960                                 break;
5961                         }
5962
5963                         error = sooptcopyout(sopt, &extmac, sizeof(extmac));
5964 #else
5965                         error = EOPNOTSUPP;
5966 #endif /* MAC_SOCKET */
5967                         break;
5968
5969 #ifdef __APPLE_API_PRIVATE
5970                 case SO_UPCALLCLOSEWAIT:
5971                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5972                         goto integer;
5973 #endif
5974                 case SO_RANDOMPORT:
5975                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
5976                         goto integer;
5977
5978                 case SO_NP_EXTENSIONS: {
5979                         struct so_np_extensions sonpx = {};
5980
5981                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5982                             SONPX_SETOPTSHUT : 0;
5983                         sonpx.npx_mask = SONPX_MASK_VALID;
5984
5985                         error = sooptcopyout(sopt, &sonpx,
5986                             sizeof(struct so_np_extensions));
5987                         break;
5988                 }
5989
5990                 case SO_TRAFFIC_CLASS:
5991                         optval = so->so_traffic_class;
5992                         goto integer;
5993
5994                 case SO_RECV_TRAFFIC_CLASS:
5995                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5996                         goto integer;
5997
5998                 case SO_TRAFFIC_CLASS_STATS:
5999                         error = sooptcopyout(sopt, &so->so_tc_stats,
6000                             sizeof(so->so_tc_stats));
6001                         break;
6002
6003 #if (DEVELOPMENT || DEBUG)
6004                 case SO_TRAFFIC_CLASS_DBG:
6005                         error = sogetopt_tcdbg(so, sopt);
6006                         break;
6007 #endif /* (DEVELOPMENT || DEBUG) */
6008
6009                 case SO_PRIVILEGED_TRAFFIC_CLASS:
6010                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6011                         goto integer;
6012
6013                 case SO_DEFUNCTOK:
6014                         optval = !(so->so_flags & SOF_NODEFUNCT);
6015                         goto integer;
6016
6017                 case SO_ISDEFUNCT:
6018                         optval = (so->so_flags & SOF_DEFUNCT);
6019                         goto integer;
6020
6021                 case SO_OPPORTUNISTIC:
6022                         optval = so_get_opportunistic(so);
6023                         goto integer;
6024
6025                 case SO_FLUSH:
6026                         /* This option is not gettable */
6027                         error = EINVAL;
6028                         break;
6029
6030                 case SO_RECV_ANYIF:
6031                         optval = so_get_recv_anyif(so);
6032                         goto integer;
6033
6034                 case SO_TRAFFIC_MGT_BACKGROUND:
6035                         /* This option is handled by lower layer(s) */
6036                         if (so->so_proto != NULL &&
6037                             so->so_proto->pr_ctloutput != NULL) {
6038                                 (void) so->so_proto->pr_ctloutput(so, sopt);
6039                         }
6040                         break;
6041
6042 #if FLOW_DIVERT
6043                 case SO_FLOW_DIVERT_TOKEN:
6044                         error = flow_divert_token_get(so, sopt);
6045                         break;
6046 #endif  /* FLOW_DIVERT */
6047
6048 #if NECP
6049                 case SO_NECP_ATTRIBUTES:
6050                         error = necp_get_socket_attributes(so, sopt);
6051                         break;
6052
6053                 case SO_NECP_CLIENTUUID:
6054                 {
6055                         uuid_t *ncu;
6056
6057                         if (SOCK_DOM(so) == PF_MULTIPATH) {
6058                                 ncu = &mpsotomppcb(so)->necp_client_uuid;
6059                         } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6060                                 ncu = &sotoinpcb(so)->necp_client_uuid;
6061                         } else {
6062                                 error = EINVAL;
6063                                 goto out;
6064                         }
6065
6066                         error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6067                         break;
6068                 }
6069 #endif /* NECP */
6070
6071 #if CONTENT_FILTER
6072                 case SO_CFIL_SOCK_ID: {
6073                         cfil_sock_id_t sock_id;
6074
6075                         sock_id = cfil_sock_id_from_socket(so);
6076
6077                         error = sooptcopyout(sopt, &sock_id,
6078                             sizeof(cfil_sock_id_t));
6079                         break;
6080                 }
6081 #endif  /* CONTENT_FILTER */
6082
6083                 case SO_EXTENDED_BK_IDLE:
6084                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6085                         goto integer;
6086                 case SO_MARK_CELLFALLBACK:
6087                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6088                             ? 1 : 0;
6089                         goto integer;
6090                 case SO_NET_SERVICE_TYPE: {
6091                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6092                                 optval = so->so_netsvctype;
6093                         } else {
6094                                 optval = NET_SERVICE_TYPE_BE;
6095                         }
6096                         goto integer;
6097                 }
6098                 case SO_NETSVC_MARKING_LEVEL:
6099                         optval = so_get_netsvc_marking_level(so);
6100                         goto integer;
6101
6102                 default:
6103                         error = ENOPROTOOPT;
6104                         break;
6105                 }
6106         }
6107 out:
6108         if (dolock) {
6109                 socket_unlock(so, 1);
6110         }
6111         return error;
6112 }
6113
6114 /*
6115  * The size limits on our soopt_getm is different from that on FreeBSD.
6116  * We limit the size of options to MCLBYTES. This will have to change
6117  * if we need to define options that need more space than MCLBYTES.
6118  */
6119 int
6120 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6121 {
6122         struct mbuf *m, *m_prev;
6123         int sopt_size = sopt->sopt_valsize;
6124         int how;
6125
6126         if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6127                 return EMSGSIZE;
6128         }
6129
6130         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6131         MGET(m, how, MT_DATA);
6132         if (m == NULL) {
6133                 return ENOBUFS;
6134         }
6135         if (sopt_size > MLEN) {
6136                 MCLGET(m, how);
6137                 if ((m->m_flags & M_EXT) == 0) {
6138                         m_free(m);
6139                         return ENOBUFS;
6140                 }
6141                 m->m_len = min(MCLBYTES, sopt_size);
6142         } else {
6143                 m->m_len = min(MLEN, sopt_size);
6144         }
6145         sopt_size -= m->m_len;
6146         *mp = m;
6147         m_prev = m;
6148
6149         while (sopt_size > 0) {
6150                 MGET(m, how, MT_DATA);
6151                 if (m == NULL) {
6152                         m_freem(*mp);
6153                         return ENOBUFS;
6154                 }
6155                 if (sopt_size > MLEN) {
6156                         MCLGET(m, how);
6157                         if ((m->m_flags & M_EXT) == 0) {
6158                                 m_freem(*mp);
6159                                 m_freem(m);
6160                                 return ENOBUFS;
6161                         }
6162                         m->m_len = min(MCLBYTES, sopt_size);
6163                 } else {
6164                         m->m_len = min(MLEN, sopt_size);
6165                 }
6166                 sopt_size -= m->m_len;
6167                 m_prev->m_next = m;
6168                 m_prev = m;
6169         }
6170         return 0;
6171 }
6172
6173 /* copyin sopt data into mbuf chain */
6174 int
6175 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6176 {
6177         struct mbuf *m0 = m;
6178
6179         if (sopt->sopt_val == USER_ADDR_NULL) {
6180                 return 0;
6181         }
6182         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6183                 if (sopt->sopt_p != kernproc) {
6184                         int error;
6185
6186                         error = copyin(sopt->sopt_val, mtod(m, char *),
6187                             m->m_len);
6188                         if (error != 0) {
6189                                 m_freem(m0);
6190                                 return error;
6191                         }
6192                 } else {
6193                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6194                             mtod(m, char *), m->m_len);
6195                 }
6196                 sopt->sopt_valsize -= m->m_len;
6197                 sopt->sopt_val += m->m_len;
6198                 m = m->m_next;
6199         }
6200         /* should be allocated enoughly at ip6_sooptmcopyin() */
6201         if (m != NULL) {
6202                 panic("soopt_mcopyin");
6203                 /* NOTREACHED */
6204         }
6205         return 0;
6206 }
6207
6208 /* copyout mbuf chain data into soopt */
6209 int
6210 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6211 {
6212         struct mbuf *m0 = m;
6213         size_t valsize = 0;
6214
6215         if (sopt->sopt_val == USER_ADDR_NULL) {
6216                 return 0;
6217         }
6218         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6219                 if (sopt->sopt_p != kernproc) {
6220                         int error;
6221
6222                         error = copyout(mtod(m, char *), sopt->sopt_val,
6223                             m->m_len);
6224                         if (error != 0) {
6225                                 m_freem(m0);
6226                                 return error;
6227                         }
6228                 } else {
6229                         bcopy(mtod(m, char *),
6230                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6231                 }
6232                 sopt->sopt_valsize -= m->m_len;
6233                 sopt->sopt_val += m->m_len;
6234                 valsize += m->m_len;
6235                 m = m->m_next;
6236         }
6237         if (m != NULL) {
6238                 /* enough soopt buffer should be given from user-land */
6239                 m_freem(m0);
6240                 return EINVAL;
6241         }
6242         sopt->sopt_valsize = valsize;
6243         return 0;
6244 }
6245
6246 void
6247 sohasoutofband(struct socket *so)
6248 {
6249         if (so->so_pgid < 0) {
6250                 gsignal(-so->so_pgid, SIGURG);
6251         } else if (so->so_pgid > 0) {
6252                 proc_signal(so->so_pgid, SIGURG);
6253         }
6254         selwakeup(&so->so_rcv.sb_sel);
6255         if (so->so_rcv.sb_flags & SB_KNOTE) {
6256                 KNOTE(&so->so_rcv.sb_sel.si_note,
6257                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
6258         }
6259 }
6260
6261 int
6262 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6263 {
6264 #pragma unused(cred)
6265         struct proc *p = current_proc();
6266         int revents = 0;
6267
6268         socket_lock(so, 1);
6269         so_update_last_owner_locked(so, PROC_NULL);
6270         so_update_policy(so);
6271
6272         if (events & (POLLIN | POLLRDNORM)) {
6273                 if (soreadable(so)) {
6274                         revents |= events & (POLLIN | POLLRDNORM);
6275                 }
6276         }
6277
6278         if (events & (POLLOUT | POLLWRNORM)) {
6279                 if (sowriteable(so)) {
6280                         revents |= events & (POLLOUT | POLLWRNORM);
6281                 }
6282         }
6283
6284         if (events & (POLLPRI | POLLRDBAND)) {
6285                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6286                         revents |= events & (POLLPRI | POLLRDBAND);
6287                 }
6288         }
6289
6290         if (revents == 0) {
6291                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6292                         /*
6293                          * Darwin sets the flag first,
6294                          * BSD calls selrecord first
6295                          */
6296                         so->so_rcv.sb_flags |= SB_SEL;
6297                         selrecord(p, &so->so_rcv.sb_sel, wql);
6298                 }
6299
6300                 if (events & (POLLOUT | POLLWRNORM)) {
6301                         /*
6302                          * Darwin sets the flag first,
6303                          * BSD calls selrecord first
6304                          */
6305                         so->so_snd.sb_flags |= SB_SEL;
6306                         selrecord(p, &so->so_snd.sb_sel, wql);
6307                 }
6308         }
6309
6310         socket_unlock(so, 1);
6311         return revents;
6312 }
6313
6314 int
6315 soo_kqfilter(struct fileproc *fp, struct knote *kn,
6316     struct kevent_internal_s *kev, vfs_context_t ctx)
6317 {
6318 #pragma unused(fp)
6319 #if !CONFIG_MACF_SOCKET
6320 #pragma unused(ctx)
6321 #endif /* MAC_SOCKET */
6322         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6323         int result;
6324
6325         socket_lock(so, 1);
6326         so_update_last_owner_locked(so, PROC_NULL);
6327         so_update_policy(so);
6328
6329 #if CONFIG_MACF_SOCKET
6330         if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6331             kn, so) != 0) {
6332                 socket_unlock(so, 1);
6333                 kn->kn_flags = EV_ERROR;
6334                 kn->kn_data = EPERM;
6335                 return 0;
6336         }
6337 #endif /* MAC_SOCKET */
6338
6339         switch (kn->kn_filter) {
6340         case EVFILT_READ:
6341                 kn->kn_filtid = EVFILTID_SOREAD;
6342                 break;
6343         case EVFILT_WRITE:
6344                 kn->kn_filtid = EVFILTID_SOWRITE;
6345                 break;
6346         case EVFILT_SOCK:
6347                 kn->kn_filtid = EVFILTID_SCK;
6348                 break;
6349         case EVFILT_EXCEPT:
6350                 kn->kn_filtid = EVFILTID_SOEXCEPT;
6351                 break;
6352         default:
6353                 socket_unlock(so, 1);
6354                 kn->kn_flags = EV_ERROR;
6355                 kn->kn_data = EINVAL;
6356                 return 0;
6357         }
6358
6359         /*
6360          * call the appropriate sub-filter attach
6361          * with the socket still locked
6362          */
6363         result = knote_fops(kn)->f_attach(kn, kev);
6364
6365         socket_unlock(so, 1);
6366
6367         return result;
6368 }
6369
6370 static int
6371 filt_soread_common(struct knote *kn, struct socket *so)
6372 {
6373         if (so->so_options & SO_ACCEPTCONN) {
6374                 int is_not_empty;
6375
6376                 /*
6377                  * Radar 6615193 handle the listen case dynamically
6378                  * for kqueue read filter. This allows to call listen()
6379                  * after registering the kqueue EVFILT_READ.
6380                  */
6381
6382                 kn->kn_data = so->so_qlen;
6383                 is_not_empty = !TAILQ_EMPTY(&so->so_comp);
6384
6385                 return is_not_empty;
6386         }
6387
6388         /* socket isn't a listener */
6389         /*
6390          * NOTE_LOWAT specifies new low water mark in data, i.e.
6391          * the bytes of protocol data. We therefore exclude any
6392          * control bytes.
6393          */
6394         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6395
6396         if (kn->kn_sfflags & NOTE_OOB) {
6397                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6398                         kn->kn_fflags |= NOTE_OOB;
6399                         kn->kn_data -= so->so_oobmark;
6400                         return 1;
6401                 }
6402         }
6403
6404         if ((so->so_state & SS_CANTRCVMORE)
6405 #if CONTENT_FILTER
6406             && cfil_sock_data_pending(&so->so_rcv) == 0
6407 #endif /* CONTENT_FILTER */
6408             ) {
6409                 kn->kn_flags |= EV_EOF;
6410                 kn->kn_fflags = so->so_error;
6411                 return 1;
6412         }
6413
6414         if (so->so_error) {     /* temporary udp error */
6415                 return 1;
6416         }
6417
6418         int64_t lowwat = so->so_rcv.sb_lowat;
6419         /*
6420          * Ensure that when NOTE_LOWAT is used, the derived
6421          * low water mark is bounded by socket's rcv buf's
6422          * high and low water mark values.
6423          */
6424         if (kn->kn_sfflags & NOTE_LOWAT) {
6425                 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6426                         lowwat = so->so_rcv.sb_hiwat;
6427                 } else if (kn->kn_sdata > lowwat) {
6428                         lowwat = kn->kn_sdata;
6429                 }
6430         }
6431
6432         /*
6433          * The order below is important. Since NOTE_LOWAT
6434          * overrides sb_lowat, check for NOTE_LOWAT case
6435          * first.
6436          */
6437         if (kn->kn_sfflags & NOTE_LOWAT) {
6438                 return kn->kn_data >= lowwat;
6439         }
6440
6441         return so->so_rcv.sb_cc >= lowwat;
6442 }
6443
6444 static int
6445 filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6446 {
6447         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6448
6449         /* socket locked */
6450
6451         /*
6452          * If the caller explicitly asked for OOB results (e.g. poll())
6453          * from EVFILT_READ, then save that off in the hookid field
6454          * and reserve the kn_flags EV_OOBAND bit for output only.
6455          */
6456         if (kn->kn_filter == EVFILT_READ &&
6457             kn->kn_flags & EV_OOBAND) {
6458                 kn->kn_flags &= ~EV_OOBAND;
6459                 kn->kn_hookid = EV_OOBAND;
6460         } else {
6461                 kn->kn_hookid = 0;
6462         }
6463         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6464                 so->so_rcv.sb_flags |= SB_KNOTE;
6465         }
6466
6467         /* indicate if event is already fired */
6468         return filt_soread_common(kn, so);
6469 }
6470
6471 static void
6472 filt_sordetach(struct knote *kn)
6473 {
6474         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6475
6476         socket_lock(so, 1);
6477         if (so->so_rcv.sb_flags & SB_KNOTE) {
6478                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6479                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6480                 }
6481         }
6482         socket_unlock(so, 1);
6483 }
6484
6485 /*ARGSUSED*/
6486 static int
6487 filt_soread(struct knote *kn, long hint)
6488 {
6489         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6490         int retval;
6491
6492         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6493                 socket_lock(so, 1);
6494         }
6495
6496         retval = filt_soread_common(kn, so);
6497
6498         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6499                 socket_unlock(so, 1);
6500         }
6501
6502         return retval;
6503 }
6504
6505 static int
6506 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6507 {
6508         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6509         int retval;
6510
6511         socket_lock(so, 1);
6512
6513         /* save off the new input fflags and data */
6514         kn->kn_sfflags = kev->fflags;
6515         kn->kn_sdata = kev->data;
6516
6517         /* determine if changes result in fired events */
6518         retval = filt_soread_common(kn, so);
6519
6520         socket_unlock(so, 1);
6521
6522         return retval;
6523 }
6524
6525 static int
6526 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6527 {
6528 #pragma unused(data)
6529         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6530         int retval;
6531
6532         socket_lock(so, 1);
6533         retval = filt_soread_common(kn, so);
6534         if (retval) {
6535                 *kev = kn->kn_kevent;
6536                 if (kn->kn_flags & EV_CLEAR) {
6537                         kn->kn_fflags = 0;
6538                         kn->kn_data = 0;
6539                 }
6540         }
6541         socket_unlock(so, 1);
6542
6543         return retval;
6544 }
6545
6546 int
6547 so_wait_for_if_feedback(struct socket *so)
6548 {
6549         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6550             (so->so_state & SS_ISCONNECTED)) {
6551                 struct inpcb *inp = sotoinpcb(so);
6552                 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6553                         return 1;
6554                 }
6555         }
6556         return 0;
6557 }
6558
6559 static int
6560 filt_sowrite_common(struct knote *kn, struct socket *so)
6561 {
6562         int ret = 0;
6563
6564         kn->kn_data = sbspace(&so->so_snd);
6565         if (so->so_state & SS_CANTSENDMORE) {
6566                 kn->kn_flags |= EV_EOF;
6567                 kn->kn_fflags = so->so_error;
6568                 return 1;
6569         }
6570         if (so->so_error) {     /* temporary udp error */
6571                 return 1;
6572         }
6573         if (!socanwrite(so)) {
6574                 return 0;
6575         }
6576         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6577                 return 1;
6578         }
6579         int64_t lowwat = so->so_snd.sb_lowat;
6580         if (kn->kn_sfflags & NOTE_LOWAT) {
6581                 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6582                         lowwat = so->so_snd.sb_hiwat;
6583                 } else if (kn->kn_sdata > lowwat) {
6584                         lowwat = kn->kn_sdata;
6585                 }
6586         }
6587         if (kn->kn_data >= lowwat) {
6588                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6589 #if (DEBUG || DEVELOPMENT)
6590                     && so_notsent_lowat_check == 1
6591 #endif /* DEBUG || DEVELOPMENT */
6592                     ) {
6593                         if ((SOCK_DOM(so) == PF_INET ||
6594                             SOCK_DOM(so) == PF_INET6) &&
6595                             so->so_type == SOCK_STREAM) {
6596                                 ret = tcp_notsent_lowat_check(so);
6597                         }
6598 #if MPTCP
6599                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6600                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6601                                 ret = mptcp_notsent_lowat_check(so);
6602                         }
6603 #endif
6604                         else {
6605                                 return 1;
6606                         }
6607                 } else {
6608                         ret = 1;
6609                 }
6610         }
6611         if (so_wait_for_if_feedback(so)) {
6612                 ret = 0;
6613         }
6614         return ret;
6615 }
6616
6617 static int
6618 filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6619 {
6620         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6621
6622         /* socket locked */
6623         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6624                 so->so_snd.sb_flags |= SB_KNOTE;
6625         }
6626
6627         /* determine if its already fired */
6628         return filt_sowrite_common(kn, so);
6629 }
6630
6631 static void
6632 filt_sowdetach(struct knote *kn)
6633 {
6634         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6635         socket_lock(so, 1);
6636
6637         if (so->so_snd.sb_flags & SB_KNOTE) {
6638                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6639                         so->so_snd.sb_flags &= ~SB_KNOTE;
6640                 }
6641         }
6642         socket_unlock(so, 1);
6643 }
6644
6645 /*ARGSUSED*/
6646 static int
6647 filt_sowrite(struct knote *kn, long hint)
6648 {
6649         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6650         int ret;
6651
6652         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6653                 socket_lock(so, 1);
6654         }
6655
6656         ret = filt_sowrite_common(kn, so);
6657
6658         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6659                 socket_unlock(so, 1);
6660         }
6661
6662         return ret;
6663 }
6664
6665 static int
6666 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6667 {
6668         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6669         int ret;
6670
6671         socket_lock(so, 1);
6672
6673         /*save off the new input fflags and data */
6674         kn->kn_sfflags = kev->fflags;
6675         kn->kn_sdata = kev->data;
6676
6677         /* determine if these changes result in a triggered event */
6678         ret = filt_sowrite_common(kn, so);
6679
6680         socket_unlock(so, 1);
6681
6682         return ret;
6683 }
6684
6685 static int
6686 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6687 {
6688 #pragma unused(data)
6689         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6690         int ret;
6691
6692         socket_lock(so, 1);
6693         ret = filt_sowrite_common(kn, so);
6694         if (ret) {
6695                 *kev = kn->kn_kevent;
6696                 if (kn->kn_flags & EV_CLEAR) {
6697                         kn->kn_fflags = 0;
6698                         kn->kn_data = 0;
6699                 }
6700         }
6701         socket_unlock(so, 1);
6702         return ret;
6703 }
6704
6705 static int
6706 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6707 {
6708         int ret = 0;
6709         uint32_t level_trigger = 0;
6710
6711         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6712                 kn->kn_fflags |= NOTE_CONNRESET;
6713         }
6714         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6715                 kn->kn_fflags |= NOTE_TIMEOUT;
6716         }
6717         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6718                 kn->kn_fflags |= NOTE_NOSRCADDR;
6719         }
6720         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6721                 kn->kn_fflags |= NOTE_IFDENIED;
6722         }
6723         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6724                 kn->kn_fflags |= NOTE_KEEPALIVE;
6725         }
6726         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6727                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6728         }
6729         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6730                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6731         }
6732         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6733             (so->so_state & SS_ISCONNECTED)) {
6734                 kn->kn_fflags |= NOTE_CONNECTED;
6735                 level_trigger |= NOTE_CONNECTED;
6736         }
6737         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6738             (so->so_state & SS_ISDISCONNECTED)) {
6739                 kn->kn_fflags |= NOTE_DISCONNECTED;
6740                 level_trigger |= NOTE_DISCONNECTED;
6741         }
6742         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6743                 if (so->so_proto != NULL &&
6744                     (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6745                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6746                 }
6747         }
6748
6749         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6750             tcp_notify_ack_active(so)) {
6751                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6752         }
6753
6754         if ((so->so_state & SS_CANTRCVMORE)
6755 #if CONTENT_FILTER
6756             && cfil_sock_data_pending(&so->so_rcv) == 0
6757 #endif /* CONTENT_FILTER */
6758             ) {
6759                 kn->kn_fflags |= NOTE_READCLOSED;
6760                 level_trigger |= NOTE_READCLOSED;
6761         }
6762
6763         if (so->so_state & SS_CANTSENDMORE) {
6764                 kn->kn_fflags |= NOTE_WRITECLOSED;
6765                 level_trigger |= NOTE_WRITECLOSED;
6766         }
6767
6768         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6769             (so->so_flags & SOF_SUSPENDED)) {
6770                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6771
6772                 /* If resume event was delivered before, reset it */
6773                 kn->kn_hookid &= ~NOTE_RESUME;
6774
6775                 kn->kn_fflags |= NOTE_SUSPEND;
6776                 level_trigger |= NOTE_SUSPEND;
6777         }
6778
6779         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6780             (so->so_flags & SOF_SUSPENDED) == 0) {
6781                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6782
6783                 /* If suspend event was delivered before, reset it */
6784                 kn->kn_hookid &= ~NOTE_SUSPEND;
6785
6786                 kn->kn_fflags |= NOTE_RESUME;
6787                 level_trigger |= NOTE_RESUME;
6788         }
6789
6790         if (so->so_error != 0) {
6791                 ret = 1;
6792                 kn->kn_data = so->so_error;
6793                 kn->kn_flags |= EV_EOF;
6794         } else {
6795                 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6796         }
6797
6798         /* Reset any events that are not requested on this knote */
6799         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6800         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6801
6802         /* Find the level triggerred events that are already delivered */
6803         level_trigger &= kn->kn_hookid;
6804         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6805
6806         /* Do not deliver level triggerred events more than once */
6807         if ((kn->kn_fflags & ~level_trigger) != 0) {
6808                 ret = 1;
6809         }
6810
6811         return ret;
6812 }
6813
6814 static int
6815 filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6816 {
6817         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6818
6819         /* socket locked */
6820         kn->kn_hookid = 0;
6821         if (KNOTE_ATTACH(&so->so_klist, kn)) {
6822                 so->so_flags |= SOF_KNOTE;
6823         }
6824
6825         /* determine if event already fired */
6826         return filt_sockev_common(kn, so, 0);
6827 }
6828
6829 static void
6830 filt_sockdetach(struct knote *kn)
6831 {
6832         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6833         socket_lock(so, 1);
6834
6835         if ((so->so_flags & SOF_KNOTE) != 0) {
6836                 if (KNOTE_DETACH(&so->so_klist, kn)) {
6837                         so->so_flags &= ~SOF_KNOTE;
6838                 }
6839         }
6840         socket_unlock(so, 1);
6841 }
6842
6843 static int
6844 filt_sockev(struct knote *kn, long hint)
6845 {
6846         int ret = 0, locked = 0;
6847         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6848         long ev_hint = (hint & SO_FILT_HINT_EV);
6849
6850         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6851                 socket_lock(so, 1);
6852                 locked = 1;
6853         }
6854
6855         ret = filt_sockev_common(kn, so, ev_hint);
6856
6857         if (locked) {
6858                 socket_unlock(so, 1);
6859         }
6860
6861         return ret;
6862 }
6863
6864
6865
6866 /*
6867  *      filt_socktouch - update event state
6868  */
6869 static int
6870 filt_socktouch(
6871         struct knote *kn,
6872         struct kevent_internal_s *kev)
6873 {
6874         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6875         uint32_t changed_flags;
6876         int ret;
6877
6878         socket_lock(so, 1);
6879
6880         /* save off the [result] data and fflags */
6881         changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6882
6883         /* save off the new input fflags and data */
6884         kn->kn_sfflags = kev->fflags;
6885         kn->kn_sdata = kev->data;
6886
6887         /* restrict the current results to the (smaller?) set of new interest */
6888         /*
6889          * For compatibility with previous implementations, we leave kn_fflags
6890          * as they were before.
6891          */
6892         //kn->kn_fflags &= kev->fflags;
6893
6894         /*
6895          * Since we keep track of events that are already
6896          * delivered, if any of those events are not requested
6897          * anymore the state related to them can be reset
6898          */
6899         kn->kn_hookid &=
6900             ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6901
6902         /* determine if we have events to deliver */
6903         ret = filt_sockev_common(kn, so, 0);
6904
6905         socket_unlock(so, 1);
6906
6907         return ret;
6908 }
6909
6910 /*
6911  *      filt_sockprocess - query event fired state and return data
6912  */
6913 static int
6914 filt_sockprocess(
6915         struct knote *kn,
6916         struct filt_process_s *data,
6917         struct kevent_internal_s *kev)
6918 {
6919 #pragma unused(data)
6920
6921         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6922         int ret = 0;
6923
6924         socket_lock(so, 1);
6925
6926         ret = filt_sockev_common(kn, so, 0);
6927         if (ret) {
6928                 *kev = kn->kn_kevent;
6929
6930                 /*
6931                  * Store the state of the events being delivered. This
6932                  * state can be used to deliver level triggered events
6933                  * ateast once and still avoid waking up the application
6934                  * multiple times as long as the event is active.
6935                  */
6936                 if (kn->kn_fflags != 0) {
6937                         kn->kn_hookid |= (kn->kn_fflags &
6938                             EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6939                 }
6940
6941                 /*
6942                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6943                  * only one of them and remember the last one that was
6944                  * delivered last
6945                  */
6946                 if (kn->kn_fflags & NOTE_SUSPEND) {
6947                         kn->kn_hookid &= ~NOTE_RESUME;
6948                 }
6949                 if (kn->kn_fflags & NOTE_RESUME) {
6950                         kn->kn_hookid &= ~NOTE_SUSPEND;
6951                 }
6952
6953                 if (kn->kn_flags & EV_CLEAR) {
6954                         kn->kn_data = 0;
6955                         kn->kn_fflags = 0;
6956                 }
6957         }
6958
6959         socket_unlock(so, 1);
6960
6961         return ret;
6962 }
6963
6964 void
6965 get_sockev_state(struct socket *so, u_int32_t *statep)
6966 {
6967         u_int32_t state = *(statep);
6968
6969         /*
6970          * If the state variable is already used by a previous event,
6971          * reset it.
6972          */
6973         if (state != 0) {
6974                 return;
6975         }
6976
6977         if (so->so_state & SS_ISCONNECTED) {
6978                 state |= SOCKEV_CONNECTED;
6979         } else {
6980                 state &= ~(SOCKEV_CONNECTED);
6981         }
6982         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6983         *(statep) = state;
6984 }
6985
6986 #define SO_LOCK_HISTORY_STR_LEN \
6987         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6988
6989 __private_extern__ const char *
6990 solockhistory_nr(struct socket *so)
6991 {
6992         size_t n = 0;
6993         int i;
6994         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6995
6996         bzero(lock_history_str, sizeof(lock_history_str));
6997         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6998                 n += snprintf(lock_history_str + n,
6999                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7000                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7001                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7002         }
7003         return lock_history_str;
7004 }
7005
7006 void
7007 socket_lock(struct socket *so, int refcount)
7008 {
7009         void *lr_saved;
7010
7011         lr_saved = __builtin_return_address(0);
7012
7013         if (so->so_proto->pr_lock) {
7014                 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7015         } else {
7016 #ifdef MORE_LOCKING_DEBUG
7017                 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7018                     LCK_MTX_ASSERT_NOTOWNED);
7019 #endif
7020                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7021                 if (refcount) {
7022                         so->so_usecount++;
7023                 }
7024                 so->lock_lr[so->next_lock_lr] = lr_saved;
7025                 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7026         }
7027 }
7028
7029 void
7030 socket_lock_assert_owned(struct socket *so)
7031 {
7032         lck_mtx_t *mutex_held;
7033
7034         if (so->so_proto->pr_getlock != NULL) {
7035                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7036         } else {
7037                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7038         }
7039
7040         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7041 }
7042
7043 int
7044 socket_try_lock(struct socket *so)
7045 {
7046         lck_mtx_t *mtx;
7047
7048         if (so->so_proto->pr_getlock != NULL) {
7049                 mtx = (*so->so_proto->pr_getlock)(so, 0);
7050         } else {
7051                 mtx = so->so_proto->pr_domain->dom_mtx;
7052         }
7053
7054         return lck_mtx_try_lock(mtx);
7055 }
7056
7057 void
7058 socket_unlock(struct socket *so, int refcount)
7059 {
7060         void *lr_saved;
7061         lck_mtx_t *mutex_held;
7062
7063         lr_saved = __builtin_return_address(0);
7064
7065         if (so->so_proto == NULL) {
7066                 panic("%s: null so_proto so=%p\n", __func__, so);
7067                 /* NOTREACHED */
7068         }
7069
7070         if (so && so->so_proto->pr_unlock) {
7071                 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7072         } else {
7073                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7074 #ifdef MORE_LOCKING_DEBUG
7075                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7076 #endif
7077                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7078                 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7079
7080                 if (refcount) {
7081                         if (so->so_usecount <= 0) {
7082                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7083                                     "lrh=%s", __func__, so->so_usecount, so,
7084                                     SOCK_DOM(so), so->so_type,
7085                                     SOCK_PROTO(so), solockhistory_nr(so));
7086                                 /* NOTREACHED */
7087                         }
7088
7089                         so->so_usecount--;
7090                         if (so->so_usecount == 0) {
7091                                 sofreelastref(so, 1);
7092                         }
7093                 }
7094                 lck_mtx_unlock(mutex_held);
7095         }
7096 }
7097
7098 /* Called with socket locked, will unlock socket */
7099 void
7100 sofree(struct socket *so)
7101 {
7102         lck_mtx_t *mutex_held;
7103
7104         if (so->so_proto->pr_getlock != NULL) {
7105                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7106         } else {
7107                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7108         }
7109         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7110
7111         sofreelastref(so, 0);
7112 }
7113
7114 void
7115 soreference(struct socket *so)
7116 {
7117         socket_lock(so, 1);     /* locks & take one reference on socket */
7118         socket_unlock(so, 0);   /* unlock only */
7119 }
7120
7121 void
7122 sodereference(struct socket *so)
7123 {
7124         socket_lock(so, 0);
7125         socket_unlock(so, 1);
7126 }
7127
7128 /*
7129  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7130  * possibility of using jumbo clusters.  Caller must ensure to hold
7131  * the socket lock.
7132  */
7133 void
7134 somultipages(struct socket *so, boolean_t set)
7135 {
7136         if (set) {
7137                 so->so_flags |= SOF_MULTIPAGES;
7138         } else {
7139                 so->so_flags &= ~SOF_MULTIPAGES;
7140         }
7141 }
7142
7143 void
7144 soif2kcl(struct socket *so, boolean_t set)
7145 {
7146         if (set) {
7147                 so->so_flags1 |= SOF1_IF_2KCL;
7148         } else {
7149                 so->so_flags1 &= ~SOF1_IF_2KCL;
7150         }
7151 }
7152
7153 int
7154 so_isdstlocal(struct socket *so)
7155 {
7156         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7157
7158         if (SOCK_DOM(so) == PF_INET) {
7159                 return inaddr_local(inp->inp_faddr);
7160         } else if (SOCK_DOM(so) == PF_INET6) {
7161                 return in6addr_local(&inp->in6p_faddr);
7162         }
7163
7164         return 0;
7165 }
7166
7167 int
7168 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7169 {
7170         struct sockbuf *rcv, *snd;
7171         int err = 0, defunct;
7172
7173         rcv = &so->so_rcv;
7174         snd = &so->so_snd;
7175
7176         defunct = (so->so_flags & SOF_DEFUNCT);
7177         if (defunct) {
7178                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7179                         panic("%s: SB_DROP not set", __func__);
7180                         /* NOTREACHED */
7181                 }
7182                 goto done;
7183         }
7184
7185         if (so->so_flags & SOF_NODEFUNCT) {
7186                 if (noforce) {
7187                         err = EOPNOTSUPP;
7188                         if (p != PROC_NULL) {
7189                                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7190                                     "name %s level %d) so 0x%llx [%d,%d] "
7191                                     "is not eligible for defunct "
7192                                     "(%d)\n", __func__, proc_selfpid(),
7193                                     proc_best_name(current_proc()), proc_pid(p),
7194                                     proc_best_name(p), level,
7195                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7196                                     SOCK_DOM(so), SOCK_TYPE(so), err);
7197                         }
7198                         return err;
7199                 }
7200                 so->so_flags &= ~SOF_NODEFUNCT;
7201                 if (p != PROC_NULL) {
7202                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7203                             "name %s level %d) so 0x%llx [%d,%d] "
7204                             "defunct by force "
7205                             "(%d)\n", __func__, proc_selfpid(),
7206                             proc_best_name(current_proc()), proc_pid(p),
7207                             proc_best_name(p), level,
7208                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7209                             SOCK_DOM(so), SOCK_TYPE(so), err);
7210                 }
7211         } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7212                 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7213                 struct ifnet *ifp = inp->inp_last_outifp;
7214
7215                 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7216                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7217                 } else if (so->so_flags & SOF_DELEGATED) {
7218                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7219                 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7220                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7221                 } else if (noforce && p != PROC_NULL) {
7222                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7223
7224                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7225                         so->so_extended_bk_start = net_uptime();
7226                         OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7227
7228                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7229
7230                         err = EOPNOTSUPP;
7231                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7232                             "name %s level %d) so 0x%llx [%d,%d] "
7233                             "extend bk idle "
7234                             "(%d)\n", __func__, proc_selfpid(),
7235                             proc_best_name(current_proc()), proc_pid(p),
7236                             proc_best_name(p), level,
7237                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7238                             SOCK_DOM(so), SOCK_TYPE(so), err);
7239                         return err;
7240                 } else {
7241                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7242                 }
7243         }
7244
7245         so->so_flags |= SOF_DEFUNCT;
7246
7247         /* Prevent further data from being appended to the socket buffers */
7248         snd->sb_flags |= SB_DROP;
7249         rcv->sb_flags |= SB_DROP;
7250
7251         /* Flush any existing data in the socket buffers */
7252         if (rcv->sb_cc != 0) {
7253                 rcv->sb_flags &= ~SB_SEL;
7254                 selthreadclear(&rcv->sb_sel);
7255                 sbrelease(rcv);
7256         }
7257         if (snd->sb_cc != 0) {
7258                 snd->sb_flags &= ~SB_SEL;
7259                 selthreadclear(&snd->sb_sel);
7260                 sbrelease(snd);
7261         }
7262
7263 done:
7264         if (p != PROC_NULL) {
7265                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7266                     "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7267                     proc_selfpid(), proc_best_name(current_proc()),
7268                     proc_pid(p), proc_best_name(p), level,
7269                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7270                     SOCK_TYPE(so), defunct ? "is already" : "marked as",
7271                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7272                     " extbkidle" : "");
7273         }
7274         return err;
7275 }
7276
7277 int
7278 sodefunct(struct proc *p, struct socket *so, int level)
7279 {
7280         struct sockbuf *rcv, *snd;
7281
7282         if (!(so->so_flags & SOF_DEFUNCT)) {
7283                 panic("%s improperly called", __func__);
7284                 /* NOTREACHED */
7285         }
7286         if (so->so_state & SS_DEFUNCT) {
7287                 goto done;
7288         }
7289
7290         rcv = &so->so_rcv;
7291         snd = &so->so_snd;
7292
7293         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7294                 char s[MAX_IPv6_STR_LEN];
7295                 char d[MAX_IPv6_STR_LEN];
7296                 struct inpcb *inp = sotoinpcb(so);
7297
7298                 if (p != PROC_NULL) {
7299                         SODEFUNCTLOG(
7300                                 "%s[%d, %s]: (target pid %d name %s level %d) "
7301                                 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7302                                 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7303                                 " snd_fl 0x%x]\n", __func__,
7304                                 proc_selfpid(), proc_best_name(current_proc()),
7305                                 proc_pid(p), proc_best_name(p), level,
7306                                 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7307                                 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7308                                 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7309                                 (void *)&inp->inp_laddr.s_addr :
7310                                 (void *)&inp->in6p_laddr),
7311                                 s, sizeof(s)), ntohs(inp->in6p_lport),
7312                                 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7313                                 (void *)&inp->inp_faddr.s_addr :
7314                                 (void *)&inp->in6p_faddr,
7315                                 d, sizeof(d)), ntohs(inp->in6p_fport),
7316                                 (uint32_t)rcv->sb_sel.si_flags,
7317                                 (uint32_t)snd->sb_sel.si_flags,
7318                                 rcv->sb_flags, snd->sb_flags);
7319                 }
7320         } else if (p != PROC_NULL) {
7321                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7322                     "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7323                     "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7324                     proc_selfpid(), proc_best_name(current_proc()),
7325                     proc_pid(p), proc_best_name(p), level,
7326                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7327                     SOCK_DOM(so), SOCK_TYPE(so),
7328                     (uint32_t)rcv->sb_sel.si_flags,
7329                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7330                     snd->sb_flags);
7331         }
7332
7333         /*
7334          * Unwedge threads blocked on sbwait() and sb_lock().
7335          */
7336         sbwakeup(rcv);
7337         sbwakeup(snd);
7338
7339         so->so_flags1 |= SOF1_DEFUNCTINPROG;
7340         if (rcv->sb_flags & SB_LOCK) {
7341                 sbunlock(rcv, TRUE);    /* keep socket locked */
7342         }
7343         if (snd->sb_flags & SB_LOCK) {
7344                 sbunlock(snd, TRUE);    /* keep socket locked */
7345         }
7346         /*
7347          * Flush the buffers and disconnect.  We explicitly call shutdown
7348          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7349          * states are set for the socket.  This would also flush out data
7350          * hanging off the receive list of this socket.
7351          */
7352         (void) soshutdownlock_final(so, SHUT_RD);
7353         (void) soshutdownlock_final(so, SHUT_WR);
7354         (void) sodisconnectlocked(so);
7355
7356         /*
7357          * Explicitly handle connectionless-protocol disconnection
7358          * and release any remaining data in the socket buffers.
7359          */
7360         if (!(so->so_state & SS_ISDISCONNECTED)) {
7361                 (void) soisdisconnected(so);
7362         }
7363
7364         if (so->so_error == 0) {
7365                 so->so_error = EBADF;
7366         }
7367
7368         if (rcv->sb_cc != 0) {
7369                 rcv->sb_flags &= ~SB_SEL;
7370                 selthreadclear(&rcv->sb_sel);
7371                 sbrelease(rcv);
7372         }
7373         if (snd->sb_cc != 0) {
7374                 snd->sb_flags &= ~SB_SEL;
7375                 selthreadclear(&snd->sb_sel);
7376                 sbrelease(snd);
7377         }
7378         so->so_state |= SS_DEFUNCT;
7379         OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7380
7381 done:
7382         return 0;
7383 }
7384
7385 int
7386 soresume(struct proc *p, struct socket *so, int locked)
7387 {
7388         if (locked == 0) {
7389                 socket_lock(so, 1);
7390         }
7391
7392         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7393                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7394                     "[%d,%d] resumed from bk idle\n",
7395                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7396                     proc_pid(p), proc_best_name(p),
7397                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7398                     SOCK_DOM(so), SOCK_TYPE(so));
7399
7400                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7401                 so->so_extended_bk_start = 0;
7402                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7403
7404                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7405                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7406                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7407         }
7408         if (locked == 0) {
7409                 socket_unlock(so, 1);
7410         }
7411
7412         return 0;
7413 }
7414
7415 /*
7416  * Does not attempt to account for sockets that are delegated from
7417  * the current process
7418  */
7419 int
7420 so_set_extended_bk_idle(struct socket *so, int optval)
7421 {
7422         int error = 0;
7423
7424         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7425             SOCK_PROTO(so) != IPPROTO_TCP) {
7426                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7427                 error = EOPNOTSUPP;
7428         } else if (optval == 0) {
7429                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7430
7431                 soresume(current_proc(), so, 1);
7432         } else {
7433                 struct proc *p = current_proc();
7434                 int i;
7435                 struct filedesc *fdp;
7436                 int count = 0;
7437
7438                 /*
7439                  * Unlock socket to avoid lock ordering issue with
7440                  * the proc fd table lock
7441                  */
7442                 socket_unlock(so, 0);
7443
7444                 proc_fdlock(p);
7445
7446                 fdp = p->p_fd;
7447                 for (i = 0; i < fdp->fd_nfiles; i++) {
7448                         struct fileproc *fp = fdp->fd_ofiles[i];
7449                         struct socket *so2;
7450
7451                         if (fp == NULL ||
7452                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7453                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7454                                 continue;
7455                         }
7456
7457                         so2 = (struct socket *)fp->f_fglob->fg_data;
7458                         if (so != so2 &&
7459                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7460                                 count++;
7461                         }
7462                         if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7463                                 break;
7464                         }
7465                 }
7466                 proc_fdunlock(p);
7467
7468                 socket_lock(so, 0);
7469
7470                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7471                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7472                         error = EBUSY;
7473                 } else if (so->so_flags & SOF_DELEGATED) {
7474                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7475                         error = EBUSY;
7476                 } else {
7477                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7478                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7479                 }
7480                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7481                     "%s marked for extended bk idle\n",
7482                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7483                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7484                     SOCK_DOM(so), SOCK_TYPE(so),
7485                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7486                     "is" : "not");
7487         }
7488
7489         return error;
7490 }
7491
7492 static void
7493 so_stop_extended_bk_idle(struct socket *so)
7494 {
7495         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7496         so->so_extended_bk_start = 0;
7497
7498         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7499         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7500         /*
7501          * Force defunct
7502          */
7503         sosetdefunct(current_proc(), so,
7504             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7505         if (so->so_flags & SOF_DEFUNCT) {
7506                 sodefunct(current_proc(), so,
7507                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7508         }
7509 }
7510
7511 void
7512 so_drain_extended_bk_idle(struct socket *so)
7513 {
7514         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7515                 /*
7516                  * Only penalize sockets that have outstanding data
7517                  */
7518                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7519                         so_stop_extended_bk_idle(so);
7520
7521                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7522                 }
7523         }
7524 }
7525
7526 /*
7527  * Return values tells if socket is still in extended background idle
7528  */
7529 int
7530 so_check_extended_bk_idle_time(struct socket *so)
7531 {
7532         int ret = 1;
7533
7534         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7535                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7536                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7537                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7538                     SOCK_DOM(so), SOCK_TYPE(so));
7539                 if (net_uptime() - so->so_extended_bk_start >
7540                     soextbkidlestat.so_xbkidle_time) {
7541                         so_stop_extended_bk_idle(so);
7542
7543                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7544
7545                         ret = 0;
7546                 } else {
7547                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7548
7549                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7550                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7551                 }
7552         }
7553
7554         return ret;
7555 }
7556
7557 void
7558 resume_proc_sockets(proc_t p)
7559 {
7560         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7561                 struct filedesc *fdp;
7562                 int i;
7563
7564                 proc_fdlock(p);
7565                 fdp = p->p_fd;
7566                 for (i = 0; i < fdp->fd_nfiles; i++) {
7567                         struct fileproc *fp;
7568                         struct socket *so;
7569
7570                         fp = fdp->fd_ofiles[i];
7571                         if (fp == NULL ||
7572                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7573                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7574                                 continue;
7575                         }
7576
7577                         so = (struct socket *)fp->f_fglob->fg_data;
7578                         (void) soresume(p, so, 0);
7579                 }
7580                 proc_fdunlock(p);
7581
7582                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7583         }
7584 }
7585
7586 __private_extern__ int
7587 so_set_recv_anyif(struct socket *so, int optval)
7588 {
7589         int ret = 0;
7590
7591 #if INET6
7592         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7593 #else
7594         if (SOCK_DOM(so) == PF_INET) {
7595 #endif /* !INET6 */
7596                 if (optval) {
7597                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7598                 } else {
7599                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7600                 }
7601         }
7602
7603
7604         return ret;
7605 }
7606
7607 __private_extern__ int
7608 so_get_recv_anyif(struct socket *so)
7609 {
7610         int ret = 0;
7611
7612 #if INET6
7613         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7614 #else
7615         if (SOCK_DOM(so) == PF_INET) {
7616 #endif /* !INET6 */
7617                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7618         }
7619
7620         return ret;
7621 }
7622
7623 int
7624 so_set_restrictions(struct socket *so, uint32_t vals)
7625 {
7626         int nocell_old, nocell_new;
7627         int noexpensive_old, noexpensive_new;
7628
7629         /*
7630          * Deny-type restrictions are trapdoors; once set they cannot be
7631          * unset for the lifetime of the socket.  This allows them to be
7632          * issued by a framework on behalf of the application without
7633          * having to worry that they can be undone.
7634          *
7635          * Note here that socket-level restrictions overrides any protocol
7636          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7637          * socket restriction issued on the socket has a higher precendence
7638          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7639          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7640          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7641          */
7642         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7643         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7644         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7645             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7646             SO_RESTRICT_DENY_EXPENSIVE));
7647         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7648         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7649
7650         /* we can only set, not clear restrictions */
7651         if ((nocell_new - nocell_old) == 0 &&
7652             (noexpensive_new - noexpensive_old) == 0) {
7653                 return 0;
7654         }
7655 #if INET6
7656         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7657 #else
7658         if (SOCK_DOM(so) == PF_INET) {
7659 #endif /* !INET6 */
7660                 if (nocell_new - nocell_old != 0) {
7661                         /*
7662                          * if deny cellular is now set, do what's needed
7663                          * for INPCB
7664                          */
7665                         inp_set_nocellular(sotoinpcb(so));
7666                 }
7667                 if (noexpensive_new - noexpensive_old != 0) {
7668                         inp_set_noexpensive(sotoinpcb(so));
7669                 }
7670         }
7671
7672         if (SOCK_DOM(so) == PF_MULTIPATH) {
7673                 mptcp_set_restrictions(so);
7674         }
7675
7676         return 0;
7677 }
7678
7679 uint32_t
7680 so_get_restrictions(struct socket *so)
7681 {
7682         return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7683                SO_RESTRICT_DENY_OUT |
7684                SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7685 }
7686
7687 int
7688 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7689 {
7690         struct proc *ep = PROC_NULL;
7691         int error = 0;
7692
7693         /* pid 0 is reserved for kernel */
7694         if (epid == 0) {
7695                 error = EINVAL;
7696                 goto done;
7697         }
7698
7699         /*
7700          * If this is an in-kernel socket, prevent its delegate
7701          * association from changing unless the socket option is
7702          * coming from within the kernel itself.
7703          */
7704         if (so->last_pid == 0 && p != kernproc) {
7705                 error = EACCES;
7706                 goto done;
7707         }
7708
7709         /*
7710          * If this is issued by a process that's recorded as the
7711          * real owner of the socket, or if the pid is the same as
7712          * the process's own pid, then proceed.  Otherwise ensure
7713          * that the issuing process has the necessary privileges.
7714          */
7715         if (epid != so->last_pid || epid != proc_pid(p)) {
7716                 if ((error = priv_check_cred(kauth_cred_get(),
7717                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7718                         error = EACCES;
7719                         goto done;
7720                 }
7721         }
7722
7723         /* Find the process that corresponds to the effective pid */
7724         if ((ep = proc_find(epid)) == PROC_NULL) {
7725                 error = ESRCH;
7726                 goto done;
7727         }
7728
7729         /*
7730          * If a process tries to delegate the socket to itself, then
7731          * there's really nothing to do; treat it as a way for the
7732          * delegate association to be cleared.  Note that we check
7733          * the passed-in proc rather than calling proc_selfpid(),
7734          * as we need to check the process issuing the socket option
7735          * which could be kernproc.  Given that we don't allow 0 for
7736          * effective pid, it means that a delegated in-kernel socket
7737          * stays delegated during its lifetime (which is probably OK.)
7738          */
7739         if (epid == proc_pid(p)) {
7740                 so->so_flags &= ~SOF_DELEGATED;
7741                 so->e_upid = 0;
7742                 so->e_pid = 0;
7743                 uuid_clear(so->e_uuid);
7744         } else {
7745                 so->so_flags |= SOF_DELEGATED;
7746                 so->e_upid = proc_uniqueid(ep);
7747                 so->e_pid = proc_pid(ep);
7748                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7749         }
7750 done:
7751         if (error == 0 && net_io_policy_log) {
7752                 uuid_string_t buf;
7753
7754                 uuid_unparse(so->e_uuid, buf);
7755                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7756                     "euuid %s%s\n", __func__, proc_name_address(p),
7757                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7758                     SOCK_DOM(so), SOCK_TYPE(so),
7759                     so->e_pid, proc_name_address(ep), buf,
7760                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7761         } else if (error != 0 && net_io_policy_log) {
7762                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7763                     "ERROR (%d)\n", __func__, proc_name_address(p),
7764                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7765                     SOCK_DOM(so), SOCK_TYPE(so),
7766                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7767                     proc_name_address(ep), error);
7768         }
7769
7770         /* Update this socket's policy upon success */
7771         if (error == 0) {
7772                 so->so_policy_gencnt *= -1;
7773                 so_update_policy(so);
7774 #if NECP
7775                 so_update_necp_policy(so, NULL, NULL);
7776 #endif /* NECP */
7777         }
7778
7779         if (ep != PROC_NULL) {
7780                 proc_rele(ep);
7781         }
7782
7783         return error;
7784 }
7785
7786 int
7787 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7788 {
7789         uuid_string_t buf;
7790         uuid_t uuid;
7791         int error = 0;
7792
7793         /* UUID must not be all-zeroes (reserved for kernel) */
7794         if (uuid_is_null(euuid)) {
7795                 error = EINVAL;
7796                 goto done;
7797         }
7798
7799         /*
7800          * If this is an in-kernel socket, prevent its delegate
7801          * association from changing unless the socket option is
7802          * coming from within the kernel itself.
7803          */
7804         if (so->last_pid == 0 && p != kernproc) {
7805                 error = EACCES;
7806                 goto done;
7807         }
7808
7809         /* Get the UUID of the issuing process */
7810         proc_getexecutableuuid(p, uuid, sizeof(uuid));
7811
7812         /*
7813          * If this is issued by a process that's recorded as the
7814          * real owner of the socket, or if the uuid is the same as
7815          * the process's own uuid, then proceed.  Otherwise ensure
7816          * that the issuing process has the necessary privileges.
7817          */
7818         if (uuid_compare(euuid, so->last_uuid) != 0 ||
7819             uuid_compare(euuid, uuid) != 0) {
7820                 if ((error = priv_check_cred(kauth_cred_get(),
7821                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7822                         error = EACCES;
7823                         goto done;
7824                 }
7825         }
7826
7827         /*
7828          * If a process tries to delegate the socket to itself, then
7829          * there's really nothing to do; treat it as a way for the
7830          * delegate association to be cleared.  Note that we check
7831          * the uuid of the passed-in proc rather than that of the
7832          * current process, as we need to check the process issuing
7833          * the socket option which could be kernproc itself.  Given
7834          * that we don't allow 0 for effective uuid, it means that
7835          * a delegated in-kernel socket stays delegated during its
7836          * lifetime (which is okay.)
7837          */
7838         if (uuid_compare(euuid, uuid) == 0) {
7839                 so->so_flags &= ~SOF_DELEGATED;
7840                 so->e_upid = 0;
7841                 so->e_pid = 0;
7842                 uuid_clear(so->e_uuid);
7843         } else {
7844                 so->so_flags |= SOF_DELEGATED;
7845                 /*
7846                  * Unlike so_set_effective_pid(), we only have the UUID
7847                  * here and the process ID is not known.  Inherit the
7848                  * real {pid,upid} of the socket.
7849                  */
7850                 so->e_upid = so->last_upid;
7851                 so->e_pid = so->last_pid;
7852                 uuid_copy(so->e_uuid, euuid);
7853         }
7854
7855 done:
7856         if (error == 0 && net_io_policy_log) {
7857                 uuid_unparse(so->e_uuid, buf);
7858                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7859                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7860                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7861                     SOCK_TYPE(so), so->e_pid, buf,
7862                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7863         } else if (error != 0 && net_io_policy_log) {
7864                 uuid_unparse(euuid, buf);
7865                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7866                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7867                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7868                     SOCK_TYPE(so), buf, error);
7869         }
7870
7871         /* Update this socket's policy upon success */
7872         if (error == 0) {
7873                 so->so_policy_gencnt *= -1;
7874                 so_update_policy(so);
7875 #if NECP
7876                 so_update_necp_policy(so, NULL, NULL);
7877 #endif /* NECP */
7878         }
7879
7880         return error;
7881 }
7882
7883 void
7884 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7885     uint32_t ev_datalen)
7886 {
7887         struct kev_msg ev_msg;
7888
7889         /*
7890          * A netpolicy event always starts with a netpolicy_event_data
7891          * structure, but the caller can provide for a longer event
7892          * structure to post, depending on the event code.
7893          */
7894         VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
7895
7896         bzero(&ev_msg, sizeof(ev_msg));
7897         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7898         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7899         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7900         ev_msg.event_code       = ev_code;
7901
7902         ev_msg.dv[0].data_ptr   = ev_data;
7903         ev_msg.dv[0].data_length = ev_datalen;
7904
7905         kev_post_msg(&ev_msg);
7906 }
7907
7908 void
7909 socket_post_kev_msg(uint32_t ev_code,
7910     struct kev_socket_event_data *ev_data,
7911     uint32_t ev_datalen)
7912 {
7913         struct kev_msg ev_msg;
7914
7915         bzero(&ev_msg, sizeof(ev_msg));
7916         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7917         ev_msg.kev_class = KEV_NETWORK_CLASS;
7918         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7919         ev_msg.event_code = ev_code;
7920
7921         ev_msg.dv[0].data_ptr = ev_data;
7922         ev_msg.dv[0].data_length = ev_datalen;
7923
7924         kev_post_msg(&ev_msg);
7925 }
7926
7927 void
7928 socket_post_kev_msg_closed(struct socket *so)
7929 {
7930         struct kev_socket_closed ev;
7931         struct sockaddr *socksa = NULL, *peersa = NULL;
7932         int err;
7933         bzero(&ev, sizeof(ev));
7934         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7935         if (err == 0) {
7936                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7937                     &peersa);
7938                 if (err == 0) {
7939                         memcpy(&ev.ev_data.kev_sockname, socksa,
7940                             min(socksa->sa_len,
7941                             sizeof(ev.ev_data.kev_sockname)));
7942                         memcpy(&ev.ev_data.kev_peername, peersa,
7943                             min(peersa->sa_len,
7944                             sizeof(ev.ev_data.kev_peername)));
7945                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
7946                             &ev.ev_data, sizeof(ev));
7947                 }
7948         }
7949         if (socksa != NULL) {
7950                 FREE(socksa, M_SONAME);
7951         }
7952         if (peersa != NULL) {
7953                 FREE(peersa, M_SONAME);
7954         }
7955 }