/*
 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;

#include <machine/limits.h>

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;     /* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

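/*
 * Note: the jumbo-cluster knobs above live under the kern.ipc sysctl
 * node, so they can be flipped at run time for testing, e.g.
 * (illustrative only, subject to the warnings in the comments above):
 *
 *	sysctl -w kern.ipc.sosendjcl=0
 *	sysctl -w kern.ipc.sosendjcl_ignore_capab=1
 */
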
/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption. When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);
/*
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum number of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok) {
			*so = (struct socket *)zalloc(so_cache_zone);
		} else {
			*so = (struct socket *)zalloc_noblock(so_cache_zone);
		}

		if (*so == NULL) {
			return;
		}

		bzero((caddr_t)*so, sizeof(struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
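
/*
 * Illustrative layout of a single so_cache_zone element as carved up by
 * cached_sock_alloc() above; the sizes are the ones reserved by
 * socketinit(), and the small pads absorb the ALIGN() rounding:
 *
 *	+---------------+-----+---------------+-----+---------------+
 *	| struct socket | pad | inpcb storage | pad | tcpcb storage |
 *	+---------------+-----+---------------+-----+---------------+
 *	^ *so                 ^ so_saved_pcb        ^ inp_saved_ppcb
 */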

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that.
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to clean up */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
		    M_WAITOK);
		if (so != NULL) {
			bzero(so, sizeof(*so));
		}
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);

#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return NULL;
		}
#endif /* MAC_SOCKET */
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per-pcb lock if necessary and
	 * increase the refcount for creation; make sure it's done before
	 * the socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (cf. socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorization error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}
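
/*
 * Example (an illustrative sketch, not a code path in this file): an
 * in-kernel caller would typically create and dispose of a TCP socket
 * with the socreate()/soclose() pair defined in this file:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... sobindlock()/soconnect()/sosend() as needed ...
 *		soclose(so);
 *	}
 */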

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
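
/*
 * Note on EJUSTRETURN above: a socket filter's bind handler may return
 * EJUSTRETURN to claim the operation, in which case the protocol's
 * pru_bind is skipped and success is reported to the caller; the same
 * convention appears in solisten(), soacceptfilter() and soconnectlock()
 * below. A hypothetical filter callback would look roughly like this
 * sketch (signature as in the socket filter KPI; handled_internally()
 * is a placeholder):
 *
 *	static errno_t
 *	my_sf_bind(void *cookie, socket_t so, const struct sockaddr *to)
 *	{
 *		if (handled_internally(cookie, so, to))
 *			return EJUSTRETURN;	// consume the bind
 *		return 0;			// let the stack proceed
 *	}
 */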

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof(*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return error;
}
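
/*
 * Concrete effect of the clamp above (illustrative): with somaxconn at
 * its default of SOMAXCONN (128), listen(fd, 5) sets so_qlimit to 5,
 * while listen(fd, 0), listen(fd, -1) and listen(fd, 4096) all set
 * so_qlimit to 128. The limit is tunable via kern.ipc.somaxconn.
 */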

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is to
 * acquire the client socket first, before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_inqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight, provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 * (See the usage sketch after so_release_accept_list() below.)
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}
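
/*
 * Typical usage of the accept list lock, as in sofreelastref() and
 * soclose_locked() below (a sketch for protocols with per-socket
 * locks):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);
 *	... walk or edit head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */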

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will get cleaned up by garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty\n", __func__, so);
#endif /* (DEBUG | DEVELOPMENT) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEBUG | DEVELOPMENT) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires;
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away.
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return soconnectlock(so, nam, 1);
}
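
/*
 * The "disconnect by connecting to a null address" behavior described
 * in soconnectlock() above has a familiar userland counterpart for
 * datagram sockets (an illustrative sketch; error handling omitted):
 *
 *	struct sockaddr sa;
 *	bzero(&sa, sizeof(sa));
 *	sa.sa_len = sizeof(sa);
 *	sa.sa_family = AF_UNSPEC;
 *	(void) connect(fd, &sa, sizeof(sa));	// dissolves the association
 */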
1744
1745 /*
1746 * Returns: 0 Success
1747 * <pru_connect2>:EINVAL[AF_UNIX]
1748 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1749 * <pru_connect2>:??? [other protocol families]
1750 *
1751 * Notes: <pru_connect2> is not supported by [TCP].
1752 */
1753 int
1754 soconnect2(struct socket *so1, struct socket *so2)
1755 {
1756 int error;
1757
1758 socket_lock(so1, 1);
1759 if (so2->so_proto->pr_lock) {
1760 socket_lock(so2, 1);
1761 }
1762
1763 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1764
1765 socket_unlock(so1, 1);
1766 if (so2->so_proto->pr_lock) {
1767 socket_unlock(so2, 1);
1768 }
1769 return error;
1770 }
1771
1772 int
1773 soconnectxlocked(struct socket *so, struct sockaddr *src,
1774 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1775 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1776 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1777 {
1778 int error;
1779
1780 so_update_last_owner_locked(so, p);
1781 so_update_policy(so);
1782
1783 /*
1784 * If this is a listening socket or if this is a previously-accepted
1785 * socket that has been marked as inactive, reject the connect request.
1786 */
1787 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1788 error = EOPNOTSUPP;
1789 if (so->so_flags & SOF_DEFUNCT) {
1790 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1791 "(%d)\n", __func__, proc_pid(p),
1792 proc_best_name(p),
1793 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1794 SOCK_DOM(so), SOCK_TYPE(so), error);
1795 }
1796 return error;
1797 }
1798
1799 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1800 return EPERM;
1801 }
1802
1803 /*
1804 * If protocol is connection-based, can only connect once
1805 * unless PR_MULTICONN is set. Otherwise, if connected,
1806 * try to disconnect first. This allows user to disconnect
1807 * by connecting to, e.g., a null address.
1808 */
1809 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1810 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1811 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1812 (error = sodisconnectlocked(so)) != 0)) {
1813 error = EISCONN;
1814 } else {
1815 /*
1816 * Run connect filter before calling protocol:
1817 * - non-blocking connect returns before completion;
1818 */
1819 error = sflt_connectout(so, dst);
1820 if (error != 0) {
1821 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1822 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1823 if (error == EJUSTRETURN) {
1824 error = 0;
1825 }
1826 } else {
1827 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1828 (so, src, dst, p, ifscope, aid, pcid,
1829 flags, arg, arglen, auio, bytes_written);
1830 }
1831 }
1832
1833 return error;
1834 }
1835
1836 int
1837 sodisconnectlocked(struct socket *so)
1838 {
1839 int error;
1840
1841 if ((so->so_state & SS_ISCONNECTED) == 0) {
1842 error = ENOTCONN;
1843 goto bad;
1844 }
1845 if (so->so_state & SS_ISDISCONNECTING) {
1846 error = EALREADY;
1847 goto bad;
1848 }
1849
1850 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1851 if (error == 0) {
1852 sflt_notify(so, sock_evt_disconnected, NULL);
1853 }
1854
1855 bad:
1856 return error;
1857 }
1858
1859 /* Locking version */
1860 int
1861 sodisconnect(struct socket *so)
1862 {
1863 int error;
1864
1865 socket_lock(so, 1);
1866 error = sodisconnectlocked(so);
1867 socket_unlock(so, 1);
1868 return error;
1869 }
1870
1871 int
1872 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1873 {
1874 int error;
1875
1876 /*
1877 * Call the protocol disconnectx handler; let it handle all
1878 * matters related to the connection state of this session.
1879 */
1880 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1881 if (error == 0) {
1882 /*
1883 * The event applies only for the session, not for
1884 * the disconnection of individual subflows.
1885 */
1886 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1887 sflt_notify(so, sock_evt_disconnected, NULL);
1888 }
1889 }
1890 return error;
1891 }
1892
1893 int
1894 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1895 {
1896 int error;
1897
1898 socket_lock(so, 1);
1899 error = sodisconnectxlocked(so, aid, cid);
1900 socket_unlock(so, 1);
1901 return error;
1902 }
1903
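/*
 * Map the caller's MSG_DONTWAIT into the sblock() wait flag: callers
 * that cannot block pass 0 (sblock() fails with EWOULDBLOCK if the
 * lock is contended); everyone else waits with SBL_WAIT.
 */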
1904 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1905
1906 /*
1907 * sosendcheck will lock the socket buffer if it isn't locked and
1908 * verify that there is space for the data being inserted.
1909 *
1910 * Returns: 0 Success
1911 * EPIPE
1912 * sblock:EWOULDBLOCK
1913 * sblock:EINTR
1914 * sbwait:EBADF
1915 * sbwait:EINTR
1916 * [so_error]:???
1917 */
1918 int
1919 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1920 int32_t clen, int32_t atomic, int flags, int *sblocked,
1921 struct mbuf *control)
1922 {
1923 int error = 0;
1924 int32_t space;
1925 int assumelock = 0;
1926
1927 restart:
1928 if (*sblocked == 0) {
1929 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1930 so->so_send_filt_thread != 0 &&
1931 so->so_send_filt_thread == current_thread()) {
1932 /*
1933 * We're being called recursively from a filter,
1934 * allow this to continue. Radar 4150520.
1935 * Don't set sblocked because we don't want
1936 * to perform an unlock later.
1937 */
1938 assumelock = 1;
1939 } else {
1940 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1941 if (error) {
1942 if (so->so_flags & SOF_DEFUNCT) {
1943 goto defunct;
1944 }
1945 return error;
1946 }
1947 *sblocked = 1;
1948 }
1949 }
1950
1951 /*
1952 * If a send attempt is made on a socket that has been marked
1953 * as inactive (disconnected), reject the request.
1954 */
1955 if (so->so_flags & SOF_DEFUNCT) {
1956 defunct:
1957 error = EPIPE;
1958 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1959 __func__, proc_selfpid(), proc_best_name(current_proc()),
1960 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1961 SOCK_DOM(so), SOCK_TYPE(so), error);
1962 return error;
1963 }
1964
1965 if (so->so_state & SS_CANTSENDMORE) {
1966 #if CONTENT_FILTER
1967 /*
1968 * Can re-inject data of half closed connections
1969 */
1970 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1971 so->so_snd.sb_cfil_thread == current_thread() &&
1972 cfil_sock_data_pending(&so->so_snd) != 0) {
1973 CFIL_LOG(LOG_INFO,
1974 "so %llx ignore SS_CANTSENDMORE",
1975 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1976 } else
1977 #endif /* CONTENT_FILTER */
1978 return EPIPE;
1979 }
1980 if (so->so_error) {
1981 error = so->so_error;
1982 so->so_error = 0;
1983 return error;
1984 }
1985
1986 if ((so->so_state & SS_ISCONNECTED) == 0) {
1987 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1988 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1989 (resid != 0 || clen == 0) &&
1990 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1991 return ENOTCONN;
1992 }
1993 } else if (addr == 0) {
1994 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1995 ENOTCONN : EDESTADDRREQ;
1996 }
1997 }
1998
1999 if (so->so_flags & SOF_ENABLE_MSGS) {
2000 space = msgq_sbspace(so, control);
2001 } else {
2002 space = sbspace(&so->so_snd);
2003 }
2004
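/*
 * Out-of-band sends may use a small amount of slop beyond the
 * normal send-space limit.
 */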
2005 if (flags & MSG_OOB) {
2006 space += 1024;
2007 }
2008 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2009 clen > so->so_snd.sb_hiwat) {
2010 return EMSGSIZE;
2011 }
2012
2013 if ((space < resid + clen &&
2014 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2015 space < clen)) ||
2016 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2017 /*
2018 * don't block the connectx call when there's more data
2019 * than can be copied.
2020 */
2021 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2022 if (space == 0) {
2023 return EWOULDBLOCK;
2024 }
2025 if (space < (int32_t)so->so_snd.sb_lowat) {
2026 return 0;
2027 }
2028 }
2029 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2030 assumelock) {
2031 return EWOULDBLOCK;
2032 }
2033 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2034 *sblocked = 0;
2035 error = sbwait(&so->so_snd);
2036 if (error) {
2037 if (so->so_flags & SOF_DEFUNCT) {
2038 goto defunct;
2039 }
2040 return error;
2041 }
2042 goto restart;
2043 }
2044 return 0;
2045 }
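/*
 * Illustrative sketch (an assumption, not a caller in this file): the
 * contract with sosendcheck() is that *sblocked records whether the
 * send buffer lock was taken, so the caller releases it exactly once
 * on the way out, mirroring the out_locked epilogue in sosend():
 *
 *	int sblocked = 0;
 *
 *	socket_lock(so, 1);
 *	error = sosendcheck(so, NULL, resid, 0, atomic, flags,
 *	    &sblocked, NULL);
 *	...
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	(also unlocks the socket)
 *	else
 *		socket_unlock(so, 1);
 */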
2046
2047 /*
2048 * Send on a socket.
2049 * If send must go all at once and message is larger than
2050 * send buffering, then hard error.
2051 * Lock against other senders.
2052 * If it must go all at once and there is not enough room now, then
2053 * inform user that this would block and do nothing.
2054 * Otherwise, if nonblocking, send as much as possible.
2055 * The data to be sent is described by "uio" if nonzero,
2056 * otherwise by the mbuf chain "top" (which must be null
2057 * if uio is not). Data provided in mbuf chain must be small
2058 * enough to send all at once.
2059 *
2060 * Returns nonzero on error, timeout or signal; callers
2061 * must check for short counts if EINTR/ERESTART are returned.
2062 * Data and control buffers are freed on return.
2063 *
2064 * Returns: 0 Success
2065 * EOPNOTSUPP
2066 * EINVAL
2067 * ENOBUFS
2068 * uiomove:EFAULT
2069 * sosendcheck:EPIPE
2070 * sosendcheck:EWOULDBLOCK
2071 * sosendcheck:EINTR
2072 * sosendcheck:EBADF
2074 * sosendcheck:??? [value from so_error]
2075 * <pru_send>:ECONNRESET[TCP]
2076 * <pru_send>:EINVAL[TCP]
2077 * <pru_send>:ENOBUFS[TCP]
2078 * <pru_send>:EADDRINUSE[TCP]
2079 * <pru_send>:EADDRNOTAVAIL[TCP]
2080 * <pru_send>:EAFNOSUPPORT[TCP]
2081 * <pru_send>:EACCES[TCP]
2082 * <pru_send>:EAGAIN[TCP]
2083 * <pru_send>:EPERM[TCP]
2084 * <pru_send>:EMSGSIZE[TCP]
2085 * <pru_send>:EHOSTUNREACH[TCP]
2086 * <pru_send>:ENETUNREACH[TCP]
2087 * <pru_send>:ENETDOWN[TCP]
2088 * <pru_send>:ENOMEM[TCP]
2090 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2091 * <pru_send>:EINVAL[AF_UNIX]
2092 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2093 * <pru_send>:EPIPE[AF_UNIX]
2094 * <pru_send>:ENOTCONN[AF_UNIX]
2095 * <pru_send>:EISCONN[AF_UNIX]
2096 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2097 * <sf_data_out>:??? [whatever a filter author chooses]
2098 *
2099 * Notes: Other <pru_send> returns depend on the protocol family; all
2100 * <sf_data_out> returns depend on what the filter author causes
2101 * their filter to return.
2102 */
2103 int
2104 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2105 struct mbuf *top, struct mbuf *control, int flags)
2106 {
2107 struct mbuf **mp;
2108 struct mbuf *m, *freelist = NULL;
2109 user_ssize_t space, len, resid, orig_resid;
2110 int clen = 0, error, dontroute, mlen, sendflags;
2111 int atomic = sosendallatonce(so) || top;
2112 int sblocked = 0;
2113 struct proc *p = current_proc();
2114 struct mbuf *control_copy = NULL;
2115 uint16_t headroom = 0;
2116 boolean_t en_tracing = FALSE;
2117
2118 if (uio != NULL) {
2119 resid = uio_resid(uio);
2120 } else {
2121 resid = top->m_pkthdr.len;
2122 }
2123
2124 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2125 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2126
2127 socket_lock(so, 1);
2128
2129 /*
2130 * trace if tracing is enabled, for network (vs. unix) sockets,
2131 * and only on non-loopback interfaces
2132 */
2133 if (ENTR_SHOULDTRACE &&
2134 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2135 struct inpcb *inp = sotoinpcb(so);
2136 if (inp->inp_last_outifp != NULL &&
2137 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2138 en_tracing = TRUE;
2139 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2140 VM_KERNEL_ADDRPERM(so),
2141 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2142 (int64_t)resid);
2143 orig_resid = resid;
2144 }
2145 }
2146
2147 /*
2148 * Re-injection should not affect process accounting
2149 */
2150 if ((flags & MSG_SKIPCFIL) == 0) {
2151 so_update_last_owner_locked(so, p);
2152 so_update_policy(so);
2153
2154 #if NECP
2155 so_update_necp_policy(so, NULL, addr);
2156 #endif /* NECP */
2157 }
2158
2159 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2160 error = EOPNOTSUPP;
2161 goto out_locked;
2162 }
2163
2164 /*
2165 * In theory resid should be unsigned.
2166 * However, space must be signed, as it might be less than 0
2167 * if we over-committed, and we must use a signed comparison
2168 * of space and resid. On the other hand, a negative resid
2169 * causes us to loop sending 0-length segments to the protocol.
2170 *
2171 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2172 * But it will be used by sockets doing message delivery.
2173 *
2174 * Note: We limit resid to be a positive int value as we use
2175 * imin() to set bytes_to_copy -- radr://14558484
2176 */
2177 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2178 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2179 error = EINVAL;
2180 goto out_locked;
2181 }
2182
2183 dontroute = (flags & MSG_DONTROUTE) &&
2184 (so->so_options & SO_DONTROUTE) == 0 &&
2185 (so->so_proto->pr_flags & PR_ATOMIC);
2186 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2187
2188 if (control != NULL) {
2189 clen = control->m_len;
2190 }
2191
2192 if (soreserveheadroom != 0) {
2193 headroom = so->so_pktheadroom;
2194 }
2195
2196 do {
2197 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2198 &sblocked, control);
2199 if (error) {
2200 goto out_locked;
2201 }
2202
2203 mp = &top;
2204 if (so->so_flags & SOF_ENABLE_MSGS) {
2205 space = msgq_sbspace(so, control);
2206 } else {
2207 space = sbspace(&so->so_snd) - clen;
2208 }
2209 space += ((flags & MSG_OOB) ? 1024 : 0);
2210
2211 do {
2212 if (uio == NULL) {
2213 /*
2214 * Data is prepackaged in "top".
2215 */
2216 resid = 0;
2217 if (flags & MSG_EOR) {
2218 top->m_flags |= M_EOR;
2219 }
2220 } else {
2221 int chainlength;
2222 int bytes_to_copy;
2223 boolean_t jumbocl;
2224 boolean_t bigcl;
2225 int bytes_to_alloc;
2226
2227 bytes_to_copy = imin(resid, space);
2228
2229 bytes_to_alloc = bytes_to_copy;
2230 if (top == NULL) {
2231 bytes_to_alloc += headroom;
2232 }
2233
2234 if (sosendminchain > 0) {
2235 chainlength = 0;
2236 } else {
2237 chainlength = sosendmaxchain;
2238 }
2239
2240 /*
2241 * Use big 4 KB clusters when the outgoing interface
2242 * does not prefer 2 KB clusters
2243 */
2244 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2245 sosendbigcl_ignore_capab;
2246
2247 /*
2248 * Attempt to use larger than system page-size
2249 * clusters for large writes only if there is
2250 * a jumbo cluster pool and if the socket is
2251 * marked accordingly.
2252 */
2253 jumbocl = sosendjcl && njcl > 0 &&
2254 ((so->so_flags & SOF_MULTIPAGES) ||
2255 sosendjcl_ignore_capab) &&
2256 bigcl;
2257
2258 socket_unlock(so, 0);
2259
2260 do {
2261 int num_needed;
2262 int hdrs_needed = (top == NULL) ? 1 : 0;
2263
2264 /*
2265 * Try to maintain a local cache of the mbuf
2266 * clusters needed to complete this
2267 * write. The list is further limited to
2268 * the number that are currently needed
2269 * to fill the socket. This mechanism
2270 * allows a large number of mbufs/
2271 * clusters to be grabbed under a single
2272 * mbuf lock. If we can't get any
2273 * clusters, then fall back to trying
2274 * for mbufs. If we fail early (or
2275 * miscalculate the number needed), make
2276 * sure to release any clusters we
2277 * haven't yet consumed.
2278 */
2279 if (freelist == NULL &&
2280 bytes_to_alloc > MBIGCLBYTES &&
2281 jumbocl) {
2282 num_needed =
2283 bytes_to_alloc / M16KCLBYTES;
2284
2285 if ((bytes_to_alloc -
2286 (num_needed * M16KCLBYTES))
2287 >= MINCLSIZE) {
2288 num_needed++;
2289 }
2290
2291 freelist =
2292 m_getpackets_internal(
2293 (unsigned int *)&num_needed,
2294 hdrs_needed, M_WAIT, 0,
2295 M16KCLBYTES);
2296 /*
2297 * Fall back to 4K cluster size
2298 * if allocation failed
2299 */
2300 }
2301
2302 if (freelist == NULL &&
2303 bytes_to_alloc > MCLBYTES &&
2304 bigcl) {
2305 num_needed =
2306 bytes_to_alloc / MBIGCLBYTES;
2307
2308 if ((bytes_to_alloc -
2309 (num_needed * MBIGCLBYTES)) >=
2310 MINCLSIZE) {
2311 num_needed++;
2312 }
2313
2314 freelist =
2315 m_getpackets_internal(
2316 (unsigned int *)&num_needed,
2317 hdrs_needed, M_WAIT, 0,
2318 MBIGCLBYTES);
2319 /*
2320 * Fall back to cluster size
2321 * if allocation failed
2322 */
2323 }
2324
2325 /*
2326 * Allocate a cluster as we want to
2327 * avoid splitting the data in more
2328 * than one segment; using MINCLSIZE
2329 * would lead us to allocate two mbufs.
2330 */
2331 if (soreserveheadroom != 0 &&
2332 freelist == NULL &&
2333 ((top == NULL &&
2334 bytes_to_alloc > _MHLEN) ||
2335 bytes_to_alloc > _MLEN)) {
2336 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2337 MCLBYTES;
2338 freelist =
2339 m_getpackets_internal(
2340 (unsigned int *)&num_needed,
2341 hdrs_needed, M_WAIT, 0,
2342 MCLBYTES);
2343 /*
2344 * Fall back to a single mbuf
2345 * if allocation failed
2346 */
2347 } else if (freelist == NULL &&
2348 bytes_to_alloc > MINCLSIZE) {
2349 num_needed =
2350 bytes_to_alloc / MCLBYTES;
2351
2352 if ((bytes_to_alloc -
2353 (num_needed * MCLBYTES)) >=
2354 MINCLSIZE) {
2355 num_needed++;
2356 }
2357
2358 freelist =
2359 m_getpackets_internal(
2360 (unsigned int *)&num_needed,
2361 hdrs_needed, M_WAIT, 0,
2362 MCLBYTES);
2363 /*
2364 * Fall back to a single mbuf
2365 * if allocation failed
2366 */
2367 }
2368 /*
2369 * For datagram protocols, leave
2370 * headroom for protocol headers
2371 * in the first cluster of the chain
2372 */
2373 if (freelist != NULL && atomic &&
2374 top == NULL && headroom > 0) {
2375 freelist->m_data += headroom;
2376 }
2377
2378 /*
2379 * Fall back to regular mbufs without
2380 * reserving the socket headroom
2381 */
2382 if (freelist == NULL) {
2383 if (top == NULL) {
2384 MGETHDR(freelist,
2385 M_WAIT, MT_DATA);
2386 } else {
2387 MGET(freelist,
2388 M_WAIT, MT_DATA);
2389 }
2390
2391 if (freelist == NULL) {
2392 error = ENOBUFS;
2393 socket_lock(so, 0);
2394 goto out_locked;
2395 }
2396 /*
2397 * For datagram protocols,
2398 * leave room for protocol
2399 * headers in first mbuf.
2400 */
2401 if (atomic && top == NULL &&
2402 bytes_to_copy < MHLEN) {
2403 MH_ALIGN(freelist,
2404 bytes_to_copy);
2405 }
2406 }
2407 m = freelist;
2408 freelist = m->m_next;
2409 m->m_next = NULL;
2410
2411 if ((m->m_flags & M_EXT)) {
2412 mlen = m->m_ext.ext_size -
2413 M_LEADINGSPACE(m);
2414 } else if ((m->m_flags & M_PKTHDR)) {
2415 mlen =
2416 MHLEN - M_LEADINGSPACE(m);
2417 } else {
2418 mlen = MLEN - M_LEADINGSPACE(m);
2419 }
2420 len = imin(mlen, bytes_to_copy);
2421
2422 chainlength += len;
2423
2424 space -= len;
2425
2426 error = uiomove(mtod(m, caddr_t),
2427 len, uio);
2428
2429 resid = uio_resid(uio);
2430
2431 m->m_len = len;
2432 *mp = m;
2433 top->m_pkthdr.len += len;
2434 if (error) {
2435 break;
2436 }
2437 mp = &m->m_next;
2438 if (resid <= 0) {
2439 if (flags & MSG_EOR) {
2440 top->m_flags |= M_EOR;
2441 }
2442 break;
2443 }
2444 bytes_to_copy = min(resid, space);
2445 } while (space > 0 &&
2446 (chainlength < sosendmaxchain || atomic ||
2447 resid < MINCLSIZE));
2448
2449 socket_lock(so, 0);
2450
2451 if (error) {
2452 goto out_locked;
2453 }
2454 }
2455
2456 if (dontroute) {
2457 so->so_options |= SO_DONTROUTE;
2458 }
2459
2460 /*
2461 * Compute flags here, for pru_send and NKEs
2462 *
2463 * If the user set MSG_EOF, the protocol
2464 * understands this flag and nothing left to
2465 * send then use PRU_SEND_EOF instead of PRU_SEND.
2466 */
2467 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2468 ((flags & MSG_EOF) &&
2469 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2470 (resid <= 0)) ? PRUS_EOF :
2471 /* If there is more to send set PRUS_MORETOCOME */
2472 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2473
2474 if ((flags & MSG_SKIPCFIL) == 0) {
2475 /*
2476 * Socket filter processing
2477 */
2478 error = sflt_data_out(so, addr, &top,
2479 &control, (sendflags & MSG_OOB) ?
2480 sock_data_filt_flag_oob : 0);
2481 if (error) {
2482 if (error == EJUSTRETURN) {
2483 error = 0;
2484 clen = 0;
2485 control = NULL;
2486 top = NULL;
2487 }
2488 goto out_locked;
2489 }
2490 #if CONTENT_FILTER
2491 /*
2492 * Content filter processing
2493 */
2494 error = cfil_sock_data_out(so, addr, top,
2495 control, sendflags);
2496 if (error) {
2497 if (error == EJUSTRETURN) {
2498 error = 0;
2499 clen = 0;
2500 control = NULL;
2501 top = NULL;
2502 }
2503 goto out_locked;
2504 }
2505 #endif /* CONTENT_FILTER */
2506 }
2507 if (so->so_flags & SOF_ENABLE_MSGS) {
2508 /*
2509 * Make a copy of control mbuf,
2510 * so that msg priority can be
2511 * passed to subsequent mbufs.
2512 */
2513 control_copy = m_dup(control, M_NOWAIT);
2514 }
2515 error = (*so->so_proto->pr_usrreqs->pru_send)
2516 (so, sendflags, top, addr, control, p);
2517
2518 if (dontroute) {
2519 so->so_options &= ~SO_DONTROUTE;
2520 }
2521
2522 clen = 0;
2523 control = control_copy;
2524 control_copy = NULL;
2525 top = NULL;
2526 mp = &top;
2527 if (error) {
2528 goto out_locked;
2529 }
2530 } while (resid && space > 0);
2531 } while (resid);
2532
2533 out_locked:
2534 if (sblocked) {
2535 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2536 } else {
2537 socket_unlock(so, 1);
2538 }
2539 if (top != NULL) {
2540 m_freem(top);
2541 }
2542 if (control != NULL) {
2543 m_freem(control);
2544 }
2545 if (freelist != NULL) {
2546 m_freem_list(freelist);
2547 }
2548 if (control_copy != NULL) {
2549 m_freem(control_copy);
2550 }
2551
2552 soclearfastopen(so);
2553
2554 if (en_tracing) {
2555 /* resid passed here is the bytes left in uio */
2556 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2557 VM_KERNEL_ADDRPERM(so),
2558 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2559 (int64_t)(orig_resid - resid));
2560 }
2561 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2562 so->so_snd.sb_cc, space, error);
2563
2564 return error;
2565 }
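/*
 * Illustrative sketch (an assumption; kernel clients normally go
 * through the sock_send* KPIs): a caller holding an unlocked socket
 * and a prepackaged mbuf chain "m" (M_PKTHDR set, m_pkthdr.len valid)
 * could send it with:
 *
 *	error = sosend(so, NULL, NULL, m, NULL, 0);
 *
 * sosend() frees "m" (and any control mbuf) on both success and
 * failure, so the caller must not touch the chain afterwards.
 */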
2566
2567 int
2568 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2569 {
2570 struct mbuf *m0 = NULL, *control_end = NULL;
2571
2572 socket_lock_assert_owned(so);
2573
2574 /*
2575 * top must point to the mbuf chain to be sent.
2576 * If control is not NULL, top must be a packet header.
2577 */
2578 VERIFY(top != NULL &&
2579 (control == NULL || top->m_flags & M_PKTHDR));
2580
2581 /*
2582 * If control is not passed in, see if we can get it
2583 * from top.
2584 */
2585 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2586 // Locate start of control if present and start of data
2587 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2588 if (m0->m_flags & M_PKTHDR) {
2589 top = m0;
2590 break;
2591 } else if (m0->m_type == MT_CONTROL) {
2592 if (control == NULL) {
2593 // Found start of control
2594 control = m0;
2595 }
2596 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2597 // Found end of control
2598 control_end = m0;
2599 }
2600 }
2601 }
2602 if (control_end != NULL) {
2603 control_end->m_next = NULL;
2604 }
2605 }
2606
2607 int error = (*so->so_proto->pr_usrreqs->pru_send)
2608 (so, sendflags, top, addr, control, current_proc());
2609
2610 return error;
2611 }
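/*
 * For reference, the two chain shapes accepted above (a sketch):
 *
 *	control != NULL:  top (M_PKTHDR) -> data -> data -> ...
 *	control == NULL:  MT_CONTROL -> ... -> data (M_PKTHDR) -> ...
 *
 * In the second case the loop splits the leading run of MT_CONTROL
 * mbufs off into "control" and advances "top" to the packet header.
 */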
2612
2613 /*
2614 * Supported only for connected sockets (no address) without ancillary
2615 * data (control mbuf), and only for atomic protocols.
2616 */
2617 int
2618 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2619 {
2620 struct mbuf *m, *freelist = NULL;
2621 user_ssize_t len, resid;
2622 int error, dontroute, mlen;
2623 int atomic = sosendallatonce(so);
2624 int sblocked = 0;
2625 struct proc *p = current_proc();
2626 u_int uiofirst = 0;
2627 u_int uiolast = 0;
2628 struct mbuf *top = NULL;
2629 uint16_t headroom = 0;
2630 boolean_t bigcl;
2631
2632 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2633 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2634
2635 if (so->so_type != SOCK_DGRAM) {
2636 error = EINVAL;
2637 goto out;
2638 }
2639 if (atomic == 0) {
2640 error = EINVAL;
2641 goto out;
2642 }
2643 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2644 error = EPROTONOSUPPORT;
2645 goto out;
2646 }
2647 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2648 error = EINVAL;
2649 goto out;
2650 }
2651 resid = uio_array_resid(uioarray, uiocnt);
2652
2653 /*
2654 * In theory resid should be unsigned.
2655 * However, space must be signed, as it might be less than 0
2656 * if we over-committed, and we must use a signed comparison
2657 * of space and resid. On the other hand, a negative resid
2658 * causes us to loop sending 0-length segments to the protocol.
2659 *
2660 * Note: We limit resid to be a positive int value as we use
2661 * imin() to set bytes_to_copy -- radr://14558484
2662 */
2663 if (resid < 0 || resid > INT_MAX) {
2664 error = EINVAL;
2665 goto out;
2666 }
2667
2668 socket_lock(so, 1);
2669 so_update_last_owner_locked(so, p);
2670 so_update_policy(so);
2671
2672 #if NECP
2673 so_update_necp_policy(so, NULL, NULL);
2674 #endif /* NECP */
2675
2676 dontroute = (flags & MSG_DONTROUTE) &&
2677 (so->so_options & SO_DONTROUTE) == 0 &&
2678 (so->so_proto->pr_flags & PR_ATOMIC);
2679 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2680
2681 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2682 &sblocked, NULL);
2683 if (error) {
2684 goto release;
2685 }
2686
2687 /*
2688 * Use big 4 KB clusters when the outgoing interface does not prefer
2689 * 2 KB clusters
2690 */
2691 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2692
2693 if (soreserveheadroom != 0) {
2694 headroom = so->so_pktheadroom;
2695 }
2696
2697 do {
2698 int i;
2699 int num_needed = 0;
2700 int chainlength;
2701 size_t maxpktlen = 0;
2702 int bytes_to_alloc;
2703
2704 if (sosendminchain > 0) {
2705 chainlength = 0;
2706 } else {
2707 chainlength = sosendmaxchain;
2708 }
2709
2710 socket_unlock(so, 0);
2711
2712 /*
2713 * Find a set of uio that fit in a reasonable number
2714 * of mbuf packets
2715 */
2716 for (i = uiofirst; i < uiocnt; i++) {
2717 struct uio *auio = uioarray[i];
2718
2719 len = uio_resid(auio);
2720
2721 /* Do nothing for empty messages */
2722 if (len == 0) {
2723 continue;
2724 }
2725
2726 num_needed += 1;
2727 uiolast += 1;
2728
2729 if (len > maxpktlen) {
2730 maxpktlen = len;
2731 }
2732
2733 chainlength += len;
2734 if (chainlength > sosendmaxchain) {
2735 break;
2736 }
2737 }
2738 /*
2739 * Nothing left to send
2740 */
2741 if (num_needed == 0) {
2742 socket_lock(so, 0);
2743 break;
2744 }
2745 /*
2746 * Allocate buffer large enough to include headroom space for
2747 * network and link header
2749 */
2750 bytes_to_alloc = maxpktlen + headroom;
2751
2752 /*
2753 * Allocate a single contiguous buffer of the smallest available
2754 * size when possible
2755 */
2756 if (bytes_to_alloc > MCLBYTES &&
2757 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2758 freelist = m_getpackets_internal(
2759 (unsigned int *)&num_needed,
2760 num_needed, M_WAIT, 1,
2761 MBIGCLBYTES);
2762 } else if (bytes_to_alloc > _MHLEN &&
2763 bytes_to_alloc <= MCLBYTES) {
2764 freelist = m_getpackets_internal(
2765 (unsigned int *)&num_needed,
2766 num_needed, M_WAIT, 1,
2767 MCLBYTES);
2768 } else {
2769 freelist = m_allocpacket_internal(
2770 (unsigned int *)&num_needed,
2771 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2772 }
2773
2774 if (freelist == NULL) {
2775 socket_lock(so, 0);
2776 error = ENOMEM;
2777 goto release;
2778 }
2779 /*
2780 * Copy each uio of the set into its own mbuf packet
2781 */
2782 for (i = uiofirst, m = freelist;
2783 i < uiolast && m != NULL;
2784 i++) {
2785 int bytes_to_copy;
2786 struct mbuf *n;
2787 struct uio *auio = uioarray[i];
2788
2789 bytes_to_copy = uio_resid(auio);
2790
2791 /* Do nothing for empty messages */
2792 if (bytes_to_copy == 0) {
2793 continue;
2794 }
2795 /*
2796 * Leave headroom for protocol headers
2797 * in the first mbuf of the chain
2798 */
2799 m->m_data += headroom;
2800
2801 for (n = m; n != NULL; n = n->m_next) {
2802 if ((m->m_flags & M_EXT)) {
2803 mlen = m->m_ext.ext_size -
2804 M_LEADINGSPACE(m);
2805 } else if ((m->m_flags & M_PKTHDR)) {
2806 mlen =
2807 MHLEN - M_LEADINGSPACE(m);
2808 } else {
2809 mlen = MLEN - M_LEADINGSPACE(m);
2810 }
2811 len = imin(mlen, bytes_to_copy);
2812
2813 /*
2814 * Note: uiomove() decrements the iovec
2815 * length
2816 */
2817 error = uiomove(mtod(n, caddr_t),
2818 len, auio);
2819 if (error != 0) {
2820 break;
2821 }
2822 n->m_len = len;
2823 m->m_pkthdr.len += len;
2824
2825 VERIFY(m->m_pkthdr.len <= maxpktlen);
2826
2827 bytes_to_copy -= len;
2828 resid -= len;
2829 }
2830 if (m->m_pkthdr.len == 0) {
2831 printf(
2832 "%s:%d so %llx pkt %llx type %u len null\n",
2833 __func__, __LINE__,
2834 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2835 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2836 m->m_type);
2837 }
2838 if (error != 0) {
2839 break;
2840 }
2841 m = m->m_nextpkt;
2842 }
2843
2844 socket_lock(so, 0);
2845
2846 if (error) {
2847 goto release;
2848 }
2849 top = freelist;
2850 freelist = NULL;
2851
2852 if (dontroute) {
2853 so->so_options |= SO_DONTROUTE;
2854 }
2855
2856 if ((flags & MSG_SKIPCFIL) == 0) {
2857 struct mbuf **prevnextp = NULL;
2858
2859 for (i = uiofirst, m = top;
2860 i < uiolast && m != NULL;
2861 i++) {
2862 struct mbuf *nextpkt = m->m_nextpkt;
2863
2864 /*
2865 * Socket filter processing
2866 */
2867 error = sflt_data_out(so, NULL, &m,
2868 NULL, 0);
2869 if (error != 0 && error != EJUSTRETURN) {
2870 goto release;
2871 }
2872
2873 #if CONTENT_FILTER
2874 if (error == 0) {
2875 /*
2876 * Content filter processing
2877 */
2878 error = cfil_sock_data_out(so, NULL, m,
2879 NULL, 0);
2880 if (error != 0 && error != EJUSTRETURN) {
2881 goto release;
2882 }
2883 }
2884 #endif /* CONTENT_FILTER */
2885 /*
2886 * Remove packet from the list when
2887 * swallowed by a filter
2888 */
2889 if (error == EJUSTRETURN) {
2890 error = 0;
2891 if (prevnextp != NULL) {
2892 *prevnextp = nextpkt;
2893 } else {
2894 top = nextpkt;
2895 }
2896 }
2897
2898 m = nextpkt;
2899 if (m != NULL) {
2900 prevnextp = &m->m_nextpkt;
2901 }
2902 }
2903 }
2904 if (top != NULL) {
2905 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2906 (so, 0, top, NULL, NULL, p);
2907 }
2908
2909 if (dontroute) {
2910 so->so_options &= ~SO_DONTROUTE;
2911 }
2912
2913 top = NULL;
2914 uiofirst = uiolast;
2915 } while (resid > 0 && error == 0);
2916 release:
2917 if (sblocked) {
2918 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2919 } else {
2920 socket_unlock(so, 1);
2921 }
2922 out:
2923 if (top != NULL) {
2924 m_freem(top);
2925 }
2926 if (freelist != NULL) {
2927 m_freem_list(freelist);
2928 }
2929
2930 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2931 so->so_snd.sb_cc, 0, error);
2932
2933 return error;
2934 }
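/*
 * Illustrative sketch (an assumption, not a caller in this file): a
 * datagram sender batching one uio per message, where each uio becomes
 * its own packet:
 *
 *	struct uio *uios[2] = { uio0, uio1 };
 *
 *	error = sosend_list(so, uios, 2, 0);
 *
 * The socket must be a connected SOCK_DGRAM socket whose protocol
 * implements pru_send_list; no destination address or control data
 * may be passed.
 */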
2935
2936 /*
2937 * May return ERESTART when a packet is dropped by the MAC policy check
2938 */
2939 static int
2940 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2941 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2942 {
2943 int error = 0;
2944 struct mbuf *m = *mp;
2945 struct mbuf *nextrecord = *nextrecordp;
2946
2947 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2948 #if CONFIG_MACF_SOCKET_SUBSET
2949 /*
2950 * Call the MAC framework for policy checking if we're in
2951 * the user process context and the socket isn't connected.
2952 */
2953 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2954 struct mbuf *m0 = m;
2955 /*
2956 * Dequeue this record (temporarily) from the receive
2957 * list since we're about to drop the socket's lock
2958 * where a new record may arrive and be appended to
2959 * the list. Upon MAC policy failure, the record
2960 * will be freed. Otherwise, we'll add it back to
2961 * the head of the list. We cannot rely on SB_LOCK
2962 * because the append operation uses the socket's lock.
2963 */
2964 do {
2965 m->m_nextpkt = NULL;
2966 sbfree(&so->so_rcv, m);
2967 m = m->m_next;
2968 } while (m != NULL);
2969 m = m0;
2970 so->so_rcv.sb_mb = nextrecord;
2971 SB_EMPTY_FIXUP(&so->so_rcv);
2972 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2973 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2974 socket_unlock(so, 0);
2975
2976 if (mac_socket_check_received(proc_ucred(p), so,
2977 mtod(m, struct sockaddr *)) != 0) {
2978 /*
2979 * MAC policy failure; free this record and
2980 * process the next record (or block until
2981 * one is available). We have adjusted sb_cc
2982 * and sb_mbcnt above so there is no need to
2983 * call sbfree() again.
2984 */
2985 m_freem(m);
2986 /*
2987 * Clear SB_LOCK but don't unlock the socket.
2988 * Process the next record or wait for one.
2989 */
2990 socket_lock(so, 0);
2991 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2992 error = ERESTART;
2993 goto done;
2994 }
2995 socket_lock(so, 0);
2996 /*
2997 * If the socket has been defunct'd, drop it.
2998 */
2999 if (so->so_flags & SOF_DEFUNCT) {
3000 m_freem(m);
3001 error = ENOTCONN;
3002 goto done;
3003 }
3004 /*
3005 * Re-adjust the socket receive list and re-enqueue
3006 * the record in front of any packets which may have
3007 * been appended while we dropped the lock.
3008 */
3009 for (m = m0; m->m_next != NULL; m = m->m_next) {
3010 sballoc(&so->so_rcv, m);
3011 }
3012 sballoc(&so->so_rcv, m);
3013 if (so->so_rcv.sb_mb == NULL) {
3014 so->so_rcv.sb_lastrecord = m0;
3015 so->so_rcv.sb_mbtail = m;
3016 }
3017 m = m0;
3018 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3019 so->so_rcv.sb_mb = m;
3020 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3021 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3022 }
3023 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3024 if (psa != NULL) {
3025 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3026 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3027 error = EWOULDBLOCK;
3028 goto done;
3029 }
3030 }
3031 if (flags & MSG_PEEK) {
3032 m = m->m_next;
3033 } else {
3034 sbfree(&so->so_rcv, m);
3035 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3036 panic("%s: about to create invalid socketbuf",
3037 __func__);
3038 /* NOTREACHED */
3039 }
3040 MFREE(m, so->so_rcv.sb_mb);
3041 m = so->so_rcv.sb_mb;
3042 if (m != NULL) {
3043 m->m_nextpkt = nextrecord;
3044 } else {
3045 so->so_rcv.sb_mb = nextrecord;
3046 SB_EMPTY_FIXUP(&so->so_rcv);
3047 }
3048 }
3049 done:
3050 *mp = m;
3051 *nextrecordp = nextrecord;
3052
3053 return error;
3054 }
3055
3056 /*
3057 * Process one or more MT_CONTROL mbufs present before any data mbufs
3058 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3059 * just copy the data; if !MSG_PEEK, we call into the protocol to
3060 * perform externalization.
3061 */
3062 static int
3063 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3064 struct mbuf **mp, struct mbuf **nextrecordp)
3065 {
3066 int error = 0;
3067 struct mbuf *cm = NULL, *cmn;
3068 struct mbuf **cme = &cm;
3069 struct sockbuf *sb_rcv = &so->so_rcv;
3070 struct mbuf **msgpcm = NULL;
3071 struct mbuf *m = *mp;
3072 struct mbuf *nextrecord = *nextrecordp;
3073 struct protosw *pr = so->so_proto;
3074
3075 /*
3076 * Externalizing the control messages would require us to
3077 * drop the socket's lock below. Once we re-acquire the
3078 * lock, the mbuf chain might change. In order to preserve
3079 * consistency, we unlink all control messages from the
3080 * first mbuf chain in one shot and link them separately
3081 * onto a different chain.
3082 */
3083 do {
3084 if (flags & MSG_PEEK) {
3085 if (controlp != NULL) {
3086 if (*controlp == NULL) {
3087 msgpcm = controlp;
3088 }
3089 *controlp = m_copy(m, 0, m->m_len);
3090
3091 /*
3092 * If we failed to allocate an mbuf,
3093 * release any previously allocated
3094 * mbufs for control data. Return
3095 * an error. Keep the mbufs in the
3096 * socket as this is using
3097 * MSG_PEEK flag.
3098 */
3099 if (*controlp == NULL) {
3100 m_freem(*msgpcm);
3101 error = ENOBUFS;
3102 goto done;
3103 }
3104 controlp = &(*controlp)->m_next;
3105 }
3106 m = m->m_next;
3107 } else {
3108 m->m_nextpkt = NULL;
3109 sbfree(sb_rcv, m);
3110 sb_rcv->sb_mb = m->m_next;
3111 m->m_next = NULL;
3112 *cme = m;
3113 cme = &(*cme)->m_next;
3114 m = sb_rcv->sb_mb;
3115 }
3116 } while (m != NULL && m->m_type == MT_CONTROL);
3117
3118 if (!(flags & MSG_PEEK)) {
3119 if (sb_rcv->sb_mb != NULL) {
3120 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3121 } else {
3122 sb_rcv->sb_mb = nextrecord;
3123 SB_EMPTY_FIXUP(sb_rcv);
3124 }
3125 if (nextrecord == NULL) {
3126 sb_rcv->sb_lastrecord = m;
3127 }
3128 }
3129
3130 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3131 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3132
3133 while (cm != NULL) {
3134 int cmsg_type;
3135
3136 cmn = cm->m_next;
3137 cm->m_next = NULL;
3138 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3139
3140 /*
3141 * Call the protocol to externalize SCM_RIGHTS message
3142 * and return the modified message to the caller upon
3143 * success. Otherwise, all other control messages are
3144 * returned unmodified to the caller. Note that we
3145 * only get into this loop if MSG_PEEK is not set.
3146 */
3147 if (pr->pr_domain->dom_externalize != NULL &&
3148 cmsg_type == SCM_RIGHTS) {
3149 /*
3150 * Release socket lock: see 3903171. This
3151 * would also allow more records to be appended
3152 * to the socket buffer. We still have SB_LOCK
3153 * set on it, so we can be sure that the head
3154 * of the mbuf chain won't change.
3155 */
3156 socket_unlock(so, 0);
3157 error = (*pr->pr_domain->dom_externalize)(cm);
3158 socket_lock(so, 0);
3159 } else {
3160 error = 0;
3161 }
3162
3163 if (controlp != NULL && error == 0) {
3164 *controlp = cm;
3165 controlp = &(*controlp)->m_next;
3166 } else {
3167 (void) m_free(cm);
3168 }
3169 cm = cmn;
3170 }
3171 /*
3172 * Update the value of nextrecord in case we received new
3173 * records when the socket was unlocked above for
3174 * externalizing SCM_RIGHTS.
3175 */
3176 if (m != NULL) {
3177 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3178 } else {
3179 nextrecord = sb_rcv->sb_mb;
3180 }
3181
3182 done:
3183 *mp = m;
3184 *nextrecordp = nextrecord;
3185
3186 return error;
3187 }
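/*
 * Note: for AF_UNIX, <dom_externalize> converts the in-kernel file
 * references carried in an SCM_RIGHTS message into file descriptors
 * in the receiving process, which requires taking process-level
 * locks; this is one reason the socket is unlocked around the call.
 */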
3188
3189 /*
3190 * Implement receive operations on a socket.
3191 * We depend on the way that records are added to the sockbuf
3192 * by sbappend*. In particular, each record (mbufs linked through m_next)
3193 * must begin with an address if the protocol so specifies,
3194 * followed by an optional mbuf or mbufs containing ancillary data,
3195 * and then zero or more mbufs of data.
3196 * In order to avoid blocking network interrupts for the entire time here,
3197 * we splx() while doing the actual copy to user space.
3198 * Although the sockbuf is locked, new data may still be appended,
3199 * and thus we must maintain consistency of the sockbuf during that time.
3200 *
3201 * The caller may receive the data as a single mbuf chain by supplying
3202 * an mbuf **mp0 for use in returning the chain. The uio is then used
3203 * only for the count in uio_resid.
3204 *
3205 * Returns: 0 Success
3206 * ENOBUFS
3207 * ENOTCONN
3208 * EWOULDBLOCK
3209 * uiomove:EFAULT
3210 * sblock:EWOULDBLOCK
3211 * sblock:EINTR
3212 * sbwait:EBADF
3213 * sbwait:EINTR
3214 * sodelayed_copy:EFAULT
3215 * <pru_rcvoob>:EINVAL[TCP]
3216 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3217 * <pru_rcvoob>:???
3218 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3219 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3220 * <pr_domain->dom_externalize>:???
3221 *
3222 * Notes: Additional return values from calls through <pru_rcvoob> and
3223 * <pr_domain->dom_externalize> depend on protocols other than
3224 * TCP or AF_UNIX, which are documented above.
3225 */
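/*
 * Record layout in the receive buffer, as consumed below (a sketch;
 * the MT_SONAME and MT_CONTROL mbufs are present only when the
 * protocol supplies them):
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL ...] -> data -> data ...
 *	(the first mbuf's m_nextpkt links to the next record)
 */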
3226 int
3227 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3228 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3229 {
3230 struct mbuf *m, **mp, *ml = NULL;
3231 struct mbuf *nextrecord, *free_list;
3232 int flags, error, offset;
3233 user_ssize_t len;
3234 struct protosw *pr = so->so_proto;
3235 int moff, type = 0;
3236 user_ssize_t orig_resid = uio_resid(uio);
3237 user_ssize_t delayed_copy_len;
3238 int can_delay;
3239 int need_event;
3240 struct proc *p = current_proc();
3241 boolean_t en_tracing = FALSE;
3242
3243 /*
3244 * Sanity check on the length passed by caller as we are making 'int'
3245 * comparisons
3246 */
3247 if (orig_resid < 0 || orig_resid > INT_MAX) {
3248 return EINVAL;
3249 }
3250
3251 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3252 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3253 so->so_rcv.sb_hiwat);
3254
3255 socket_lock(so, 1);
3256 so_update_last_owner_locked(so, p);
3257 so_update_policy(so);
3258
3259 #ifdef MORE_LOCKING_DEBUG
3260 if (so->so_usecount == 1) {
3261 panic("%s: so=%x no other reference on socket\n", __func__, so);
3262 /* NOTREACHED */
3263 }
3264 #endif
3265 mp = mp0;
3266 if (psa != NULL) {
3267 *psa = NULL;
3268 }
3269 if (controlp != NULL) {
3270 *controlp = NULL;
3271 }
3272 if (flagsp != NULL) {
3273 flags = *flagsp & ~MSG_EOR;
3274 } else {
3275 flags = 0;
3276 }
3277
3278 /*
3279 * If a recv attempt is made on a previously-accepted socket
3280 * that has been marked as inactive (disconnected), reject
3281 * the request.
3282 */
3283 if (so->so_flags & SOF_DEFUNCT) {
3284 struct sockbuf *sb = &so->so_rcv;
3285
3286 error = ENOTCONN;
3287 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3288 __func__, proc_pid(p), proc_best_name(p),
3289 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3290 SOCK_DOM(so), SOCK_TYPE(so), error);
3291 /*
3292 * This socket should have been disconnected and flushed
3293 * prior to being returned from sodefunct(); there should
3294 * be no data on its receive list, so panic otherwise.
3295 */
3296 if (so->so_state & SS_DEFUNCT) {
3297 sb_empty_assert(sb, __func__);
3298 }
3299 socket_unlock(so, 1);
3300 return error;
3301 }
3302
3303 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3304 pr->pr_usrreqs->pru_preconnect) {
3305 /*
3306 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3307 * call write() right after this. *If* the app then calls read(),
3308 * we do not want to block that read indefinitely. Thus,
3309 * we trigger a connect so that the session gets initiated.
3310 */
3311 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3312
3313 if (error) {
3314 socket_unlock(so, 1);
3315 return error;
3316 }
3317 }
3318
3319 if (ENTR_SHOULDTRACE &&
3320 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3321 /*
3322 * enable energy tracing for inet sockets that go over
3323 * non-loopback interfaces only.
3324 */
3325 struct inpcb *inp = sotoinpcb(so);
3326 if (inp->inp_last_outifp != NULL &&
3327 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3328 en_tracing = TRUE;
3329 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3330 VM_KERNEL_ADDRPERM(so),
3331 ((so->so_state & SS_NBIO) ?
3332 kEnTrFlagNonBlocking : 0),
3333 (int64_t)orig_resid);
3334 }
3335 }
3336
3337 /*
3338 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3339 * regardless of the flags argument. Here is the case where
3340 * out-of-band data is not inline.
3341 */
3342 if ((flags & MSG_OOB) ||
3343 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3344 (so->so_options & SO_OOBINLINE) == 0 &&
3345 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3346 m = m_get(M_WAIT, MT_DATA);
3347 if (m == NULL) {
3348 socket_unlock(so, 1);
3349 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3350 ENOBUFS, 0, 0, 0, 0);
3351 return ENOBUFS;
3352 }
3353 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3354 if (error) {
3355 goto bad;
3356 }
3357 socket_unlock(so, 0);
3358 do {
3359 error = uiomove(mtod(m, caddr_t),
3360 imin(uio_resid(uio), m->m_len), uio);
3361 m = m_free(m);
3362 } while (uio_resid(uio) && error == 0 && m != NULL);
3363 socket_lock(so, 0);
3364 bad:
3365 if (m != NULL) {
3366 m_freem(m);
3367 }
3368
3369 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3370 if (error == EWOULDBLOCK || error == EINVAL) {
3371 /*
3372 * Let's try to get normal data:
3373 * EWOULDBLOCK: out-of-band data not
3374 * received yet. EINVAL: out-of-band data
3375 * already read.
3376 */
3377 error = 0;
3378 goto nooob;
3379 } else if (error == 0 && flagsp != NULL) {
3380 *flagsp |= MSG_OOB;
3381 }
3382 }
3383 socket_unlock(so, 1);
3384 if (en_tracing) {
3385 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3386 VM_KERNEL_ADDRPERM(so), 0,
3387 (int64_t)(orig_resid - uio_resid(uio)));
3388 }
3389 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3390 0, 0, 0, 0);
3391
3392 return error;
3393 }
3394 nooob:
3395 if (mp != NULL) {
3396 *mp = NULL;
3397 }
3398
3399 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3400 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3401 }
3402
3403 free_list = NULL;
3404 delayed_copy_len = 0;
3405 restart:
3406 #ifdef MORE_LOCKING_DEBUG
3407 if (so->so_usecount <= 1) {
3408 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3409 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3410 }
3411 #endif
3412 /*
3413 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3414 * and if so just return to the caller. This could happen when
3415 * soreceive() is called by a socket upcall function during the
3416 * time the socket is freed. The socket buffer would have been
3417 * locked across the upcall, therefore we cannot put this thread
3418 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3419 * we may livelock), because the lock on the socket buffer will
3420 * only be released when the upcall routine returns to its caller.
3421 * Because the socket has been officially closed, there can be
3422 * no further read on it.
3423 *
3424 * A multipath subflow socket would have its SS_NOFDREF set by
3425 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3426 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3427 */
3428 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3429 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3430 socket_unlock(so, 1);
3431 return 0;
3432 }
3433
3434 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3435 if (error) {
3436 socket_unlock(so, 1);
3437 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3438 0, 0, 0, 0);
3439 if (en_tracing) {
3440 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3441 VM_KERNEL_ADDRPERM(so), 0,
3442 (int64_t)(orig_resid - uio_resid(uio)));
3443 }
3444 return error;
3445 }
3446
3447 m = so->so_rcv.sb_mb;
3448 /*
3449 * If we have less data than requested, block awaiting more
3450 * (subject to any timeout) if:
3451 * 1. the current count is less than the low water mark, or
3452 * 2. MSG_WAITALL is set, and it is possible to do the entire
3453 * receive operation at once if we block (resid <= hiwat); and
3454 * 3. MSG_DONTWAIT is not set.
3455 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3456 * we have to do the receive in sections, and thus risk returning
3457 * a short count if a timeout or signal occurs after we start.
3458 */
3459 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3460 so->so_rcv.sb_cc < uio_resid(uio)) &&
3461 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3462 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3463 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3464 /*
3465 * Panic if we notice inconsistencies in the socket's
3466 * receive list; both sb_mb and sb_cc should correctly
3467 * reflect the contents of the list, otherwise we may
3468 * end up with false positives during select() or poll()
3469 * which could put the application in a bad state.
3470 */
3471 SB_MB_CHECK(&so->so_rcv);
3472
3473 if (so->so_error) {
3474 if (m != NULL) {
3475 goto dontblock;
3476 }
3477 error = so->so_error;
3478 if ((flags & MSG_PEEK) == 0) {
3479 so->so_error = 0;
3480 }
3481 goto release;
3482 }
3483 if (so->so_state & SS_CANTRCVMORE) {
3484 #if CONTENT_FILTER
3485 /*
3486 * Deal with half closed connections
3487 */
3488 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3489 cfil_sock_data_pending(&so->so_rcv) != 0) {
3490 CFIL_LOG(LOG_INFO,
3491 "so %llx ignore SS_CANTRCVMORE",
3492 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3493 } else
3494 #endif /* CONTENT_FILTER */
3495 if (m != NULL) {
3496 goto dontblock;
3497 } else {
3498 goto release;
3499 }
3500 }
3501 for (; m != NULL; m = m->m_next) {
3502 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3503 m = so->so_rcv.sb_mb;
3504 goto dontblock;
3505 }
3506 }
3507 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3508 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3509 error = ENOTCONN;
3510 goto release;
3511 }
3512 if (uio_resid(uio) == 0) {
3513 goto release;
3514 }
3515
3516 if ((so->so_state & SS_NBIO) ||
3517 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3518 error = EWOULDBLOCK;
3519 goto release;
3520 }
3521 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3522 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3523 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3524 #if EVEN_MORE_LOCKING_DEBUG
3525 if (socket_debug) {
3526 printf("Waiting for socket data\n");
3527 }
3528 #endif
3529
3530 error = sbwait(&so->so_rcv);
3531 #if EVEN_MORE_LOCKING_DEBUG
3532 if (socket_debug) {
3533 printf("SORECEIVE - sbwait returned %d\n", error);
3534 }
3535 #endif
3536 if (so->so_usecount < 1) {
3537 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3538 __func__, so, so->so_usecount);
3539 /* NOTREACHED */
3540 }
3541 if (error) {
3542 socket_unlock(so, 1);
3543 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3544 0, 0, 0, 0);
3545 if (en_tracing) {
3546 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3547 VM_KERNEL_ADDRPERM(so), 0,
3548 (int64_t)(orig_resid - uio_resid(uio)));
3549 }
3550 return error;
3551 }
3552 goto restart;
3553 }
3554 dontblock:
3555 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3556 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3557 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3558 nextrecord = m->m_nextpkt;
3559
3560 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3561 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3562 mp0 == NULL);
3563 if (error == ERESTART) {
3564 goto restart;
3565 } else if (error != 0) {
3566 goto release;
3567 }
3568 orig_resid = 0;
3569 }
3570
3571 /*
3572 * Process one or more MT_CONTROL mbufs present before any data mbufs
3573 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3574 * just copy the data; if !MSG_PEEK, we call into the protocol to
3575 * perform externalization.
3576 */
3577 if (m != NULL && m->m_type == MT_CONTROL) {
3578 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3579 if (error != 0) {
3580 goto release;
3581 }
3582 orig_resid = 0;
3583 }
3584
3585 /*
3586 * If the socket is a TCP socket with message delivery
3587 * enabled, then create a control msg to deliver the
3588 * relative TCP sequence number for this data. Waiting
3589 * until this point will protect against failures to
3590 * allocate an mbuf for control msgs.
3591 */
3592 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3593 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3594 struct mbuf *seq_cm;
3595
3596 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3597 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3598 if (seq_cm == NULL) {
3599 /* unable to allocate a control mbuf */
3600 error = ENOBUFS;
3601 goto release;
3602 }
3603 *controlp = seq_cm;
3604 controlp = &seq_cm->m_next;
3605 }
3606
3607 if (m != NULL) {
3608 if (!(flags & MSG_PEEK)) {
3609 /*
3610 * We get here because m points to an mbuf following
3611 * any MT_SONAME or MT_CONTROL mbufs which have been
3612 * processed above. In any case, m should be pointing
3613 * to the head of the mbuf chain, and the nextrecord
3614 * should be either NULL or equal to m->m_nextpkt.
3615 * See comments above about SB_LOCK.
3616 */
3617 if (m != so->so_rcv.sb_mb ||
3618 m->m_nextpkt != nextrecord) {
3619 panic("%s: post-control !sync so=%p m=%p "
3620 "nextrecord=%p\n", __func__, so, m,
3621 nextrecord);
3622 /* NOTREACHED */
3623 }
3624 if (nextrecord == NULL) {
3625 so->so_rcv.sb_lastrecord = m;
3626 }
3627 }
3628 type = m->m_type;
3629 if (type == MT_OOBDATA) {
3630 flags |= MSG_OOB;
3631 }
3632 } else {
3633 if (!(flags & MSG_PEEK)) {
3634 SB_EMPTY_FIXUP(&so->so_rcv);
3635 }
3636 }
3637 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3638 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3639
3640 moff = 0;
3641 offset = 0;
3642
3643 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3644 can_delay = 1;
3645 } else {
3646 can_delay = 0;
3647 }
3648
3649 need_event = 0;
3650
3651 while (m != NULL &&
3652 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3653 if (m->m_type == MT_OOBDATA) {
3654 if (type != MT_OOBDATA) {
3655 break;
3656 }
3657 } else if (type == MT_OOBDATA) {
3658 break;
3659 }
3660 /*
3661 * Make sure to always set the MSG_OOB flag when getting
3662 * out-of-band data inline.
3663 */
3664 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3665 (so->so_options & SO_OOBINLINE) != 0 &&
3666 (so->so_state & SS_RCVATMARK) != 0) {
3667 flags |= MSG_OOB;
3668 }
3669 so->so_state &= ~SS_RCVATMARK;
3670 len = uio_resid(uio) - delayed_copy_len;
3671 if (so->so_oobmark && len > so->so_oobmark - offset) {
3672 len = so->so_oobmark - offset;
3673 }
3674 if (len > m->m_len - moff) {
3675 len = m->m_len - moff;
3676 }
3677 /*
3678 * If mp is set, just pass back the mbufs.
3679 * Otherwise copy them out via the uio, then free.
3680 * Sockbuf must be consistent here (points to current mbuf,
3681 * it points to next record) when we drop priority;
3682 * we must note any additions to the sockbuf when we
3683 * block interrupts again.
3684 */
3685 if (mp == NULL) {
3686 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3687 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3688 if (can_delay && len == m->m_len) {
3689 /*
3690 * Only delay the copy if we're consuming the
3691 * mbuf, we're NOT in MSG_PEEK mode,
3692 * and we have enough data to make it worthwhile
3693 * to drop and retake the lock; can_delay
3694 * reflects the latter two
3695 * constraints. moff should always be zero
3696 * in these cases.
3697 */
3698 delayed_copy_len += len;
3699 } else {
3700 if (delayed_copy_len) {
3701 error = sodelayed_copy(so, uio,
3702 &free_list, &delayed_copy_len);
3703
3704 if (error) {
3705 goto release;
3706 }
3707 /*
3708 * We can only get here if MSG_PEEK is not
3709 * set; therefore, m should point at the
3710 * head of the rcv queue. If it doesn't,
3711 * it means something drastically
3712 * changed while we were out from behind
3713 * the lock in sodelayed_copy, perhaps
3714 * a RST on the stream. In any event,
3715 * the stream has been interrupted. It's
3716 * probably best just to return whatever
3717 * data we've moved and let the caller
3718 * sort it out.
3719 */
3720 if (m != so->so_rcv.sb_mb) {
3721 break;
3722 }
3723 }
3724 socket_unlock(so, 0);
3725 error = uiomove(mtod(m, caddr_t) + moff,
3726 (int)len, uio);
3727 socket_lock(so, 0);
3728
3729 if (error) {
3730 goto release;
3731 }
3732 }
3733 } else {
3734 uio_setresid(uio, (uio_resid(uio) - len));
3735 }
3736 if (len == m->m_len - moff) {
3737 if (m->m_flags & M_EOR) {
3738 flags |= MSG_EOR;
3739 }
3740 if (flags & MSG_PEEK) {
3741 m = m->m_next;
3742 moff = 0;
3743 } else {
3744 nextrecord = m->m_nextpkt;
3745 sbfree(&so->so_rcv, m);
3746 m->m_nextpkt = NULL;
3747
3748 /*
3749 * If this packet is an unordered packet
3750 * (indicated by M_UNORDERED_DATA flag), remove
3751 * the additional bytes added to the
3752 * receive socket buffer size.
3753 */
3754 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3755 m->m_len &&
3756 (m->m_flags & M_UNORDERED_DATA) &&
3757 sbreserve(&so->so_rcv,
3758 so->so_rcv.sb_hiwat - m->m_len)) {
3759 if (so->so_msg_state->msg_uno_bytes >
3760 m->m_len) {
3761 so->so_msg_state->
3762 msg_uno_bytes -= m->m_len;
3763 } else {
3764 so->so_msg_state->
3765 msg_uno_bytes = 0;
3766 }
3767 m->m_flags &= ~M_UNORDERED_DATA;
3768 }
3769
3770 if (mp != NULL) {
3771 *mp = m;
3772 mp = &m->m_next;
3773 so->so_rcv.sb_mb = m = m->m_next;
3774 *mp = NULL;
3775 } else {
3776 if (free_list == NULL) {
3777 free_list = m;
3778 } else {
3779 ml->m_next = m;
3780 }
3781 ml = m;
3782 so->so_rcv.sb_mb = m = m->m_next;
3783 ml->m_next = NULL;
3784 }
3785 if (m != NULL) {
3786 m->m_nextpkt = nextrecord;
3787 if (nextrecord == NULL) {
3788 so->so_rcv.sb_lastrecord = m;
3789 }
3790 } else {
3791 so->so_rcv.sb_mb = nextrecord;
3792 SB_EMPTY_FIXUP(&so->so_rcv);
3793 }
3794 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3795 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3796 }
3797 } else {
3798 if (flags & MSG_PEEK) {
3799 moff += len;
3800 } else {
3801 if (mp != NULL) {
3802 int copy_flag;
3803
3804 if (flags & MSG_DONTWAIT) {
3805 copy_flag = M_DONTWAIT;
3806 } else {
3807 copy_flag = M_WAIT;
3808 }
3809 *mp = m_copym(m, 0, len, copy_flag);
3810 /*
3811 * Failed to allocate an mbuf?
3812 * Adjust uio_resid back, it was
3813 * adjusted down by len bytes which
3814 * we didn't copy over.
3815 */
3816 if (*mp == NULL) {
3817 uio_setresid(uio,
3818 (uio_resid(uio) + len));
3819 break;
3820 }
3821 }
3822 m->m_data += len;
3823 m->m_len -= len;
3824 so->so_rcv.sb_cc -= len;
3825 }
3826 }
3827 if (so->so_oobmark) {
3828 if ((flags & MSG_PEEK) == 0) {
3829 so->so_oobmark -= len;
3830 if (so->so_oobmark == 0) {
3831 so->so_state |= SS_RCVATMARK;
3832 /*
3833 * delay posting the actual event until
3834 * after any delayed copy processing
3835 * has finished
3836 */
3837 need_event = 1;
3838 break;
3839 }
3840 } else {
3841 offset += len;
3842 if (offset == so->so_oobmark) {
3843 break;
3844 }
3845 }
3846 }
3847 if (flags & MSG_EOR) {
3848 break;
3849 }
3850 /*
3851 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3852 * (for non-atomic socket), we must not quit until
3853 * "uio->uio_resid == 0" or an error termination.
3854 * If a signal/timeout occurs, return with a short
3855 * count but without error. Keep sockbuf locked
3856 * against other readers.
3857 */
3858 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3859 (uio_resid(uio) - delayed_copy_len) > 0 &&
3860 !sosendallatonce(so) && !nextrecord) {
3861 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3862 #if CONTENT_FILTER
3863 && cfil_sock_data_pending(&so->so_rcv) == 0
3864 #endif /* CONTENT_FILTER */
3865 )) {
3866 goto release;
3867 }
3868
3869 /*
3870 * Depending on the protocol (e.g. TCP), the following
3871 * might cause the socket lock to be dropped and later
3872 * be reacquired, and more data could have arrived and
3873 * have been appended to the receive socket buffer by
3874 * the time it returns. Therefore, we sleep in
3875 * sbwait() below only if the socket buffer is
3876 * empty, in order to avoid a false sleep.
3877 */
3878 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3879 (((struct inpcb *)so->so_pcb)->inp_state !=
3880 INPCB_STATE_DEAD)) {
3881 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3882 }
3883
3884 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3885 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3886
3887 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3888 error = 0;
3889 goto release;
3890 }
3891 /*
3892 * We have to wait until after we get back from the sbwait
3893 * to do the copy, because we will drop the lock if we
3894 * have enough data that has been delayed. By dropping
3895 * the lock we open up a window allowing the netisr
3896 * thread to process the incoming packets and to change
3897 * the state of this socket. We're issuing the sbwait
3898 * because the socket is empty and we're expecting the
3899 * netisr thread to wake us up when more packets arrive;
3900 * if we allow that processing to happen and then sbwait,
3901 * we could stall forever with packets sitting in the
3902 * socket if no further packets arrive from the remote
3903 * side.
3904 *
3905 * We want to copy before we've collected all the data
3906 * to satisfy this request, to allow the copy to overlap
3907 * the incoming packet processing on an MP system.
3908 */
3909 if (delayed_copy_len > sorecvmincopy &&
3910 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3911 error = sodelayed_copy(so, uio,
3912 &free_list, &delayed_copy_len);
3913
3914 if (error) {
3915 goto release;
3916 }
3917 }
3918 m = so->so_rcv.sb_mb;
3919 if (m != NULL) {
3920 nextrecord = m->m_nextpkt;
3921 }
3922 SB_MB_CHECK(&so->so_rcv);
3923 }
3924 }
3925 #ifdef MORE_LOCKING_DEBUG
3926 if (so->so_usecount <= 1) {
3927 panic("%s: after big while so=%p ref=%d on socket\n",
3928 __func__, so, so->so_usecount);
3929 /* NOTREACHED */
3930 }
3931 #endif
3932
3933 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3934 if (so->so_options & SO_DONTTRUNC) {
3935 flags |= MSG_RCVMORE;
3936 } else {
3937 flags |= MSG_TRUNC;
3938 if ((flags & MSG_PEEK) == 0) {
3939 (void) sbdroprecord(&so->so_rcv);
3940 }
3941 }
3942 }
3943
3944 /*
3945 * pru_rcvd below (for TCP) may cause more data to be received
3946 * if the socket lock is dropped prior to sending the ACK; some
3947 * legacy OpenTransport applications don't handle this well
3948 * (when they receive less data than requested while MSG_HAVEMORE
3949 * is set), and so we set the flag now based on what we know
3950 * prior to calling pru_rcvd.
3951 */
3952 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3953 flags |= MSG_HAVEMORE;
3954 }
3955
3956 if ((flags & MSG_PEEK) == 0) {
3957 if (m == NULL) {
3958 so->so_rcv.sb_mb = nextrecord;
3959 /*
3960 * First part is an inline SB_EMPTY_FIXUP(). Second
3961 * part makes sure sb_lastrecord is up-to-date if
3962 * there is still data in the socket buffer.
3963 */
3964 if (so->so_rcv.sb_mb == NULL) {
3965 so->so_rcv.sb_mbtail = NULL;
3966 so->so_rcv.sb_lastrecord = NULL;
3967 } else if (nextrecord->m_nextpkt == NULL) {
3968 so->so_rcv.sb_lastrecord = nextrecord;
3969 }
3970 SB_MB_CHECK(&so->so_rcv);
3971 }
3972 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3973 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3974 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3975 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3976 }
3977 }
3978
3979 if (delayed_copy_len) {
3980 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3981 if (error) {
3982 goto release;
3983 }
3984 }
3985 if (free_list != NULL) {
3986 m_freem_list(free_list);
3987 free_list = NULL;
3988 }
3989 if (need_event) {
3990 postevent(so, 0, EV_OOB);
3991 }
3992
3993 if (orig_resid == uio_resid(uio) && orig_resid &&
3994 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3995 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3996 goto restart;
3997 }
3998
3999 if (flagsp != NULL) {
4000 *flagsp |= flags;
4001 }
4002 release:
4003 #ifdef MORE_LOCKING_DEBUG
4004 if (so->so_usecount <= 1) {
4005 panic("%s: release so=%p ref=%d on socket\n", __func__,
4006 so, so->so_usecount);
4007 /* NOTREACHED */
4008 }
4009 #endif
4010 if (delayed_copy_len) {
4011 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4012 }
4013
4014 if (free_list != NULL) {
4015 m_freem_list(free_list);
4016 }
4017
4018 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4019
4020 if (en_tracing) {
4021 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4022 VM_KERNEL_ADDRPERM(so),
4023 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4024 (int64_t)(orig_resid - uio_resid(uio)));
4025 }
4026 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4027 so->so_rcv.sb_cc, 0, error);
4028
4029 return error;
4030 }
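/*
 * Illustrative userland sketch: the MSG_WAITALL handling above is what
 * lets a stream reader ask for an exact byte count; a signal or timeout
 * yields a short count without an error.
 *
 *	char buf[4096];
 *	ssize_t n = recv(s, buf, sizeof(buf), MSG_WAITALL);
 *	// n may be short on EOF or if a signal/timeout interrupts the wait
 */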
4031
4032 /*
4033 * Returns: 0 Success
4034 * uiomove:EFAULT
4035 */
4036 static int
4037 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4038 user_ssize_t *resid)
4039 {
4040 int error = 0;
4041 struct mbuf *m;
4042
4043 m = *free_list;
4044
4045 socket_unlock(so, 0);
4046
4047 while (m != NULL && error == 0) {
4048 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4049 m = m->m_next;
4050 }
4051 m_freem_list(*free_list);
4052
4053 *free_list = NULL;
4054 *resid = 0;
4055
4056 socket_lock(so, 0);
4057
4058 return error;
4059 }
4060
4061 static int
4062 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4063 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4064 {
4065 #pragma unused(so)
4066 int error = 0;
4067 struct mbuf *ml, *m;
4068 int i = 0;
4069 struct uio *auio;
4070
4071 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4072 ml = ml->m_nextpkt, i++) {
4073 auio = msgarray[i].uio;
4074 for (m = ml; m != NULL; m = m->m_next) {
4075 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4076 if (error != 0) {
4077 goto out;
4078 }
4079 }
4080 }
4081 out:
4082 m_freem_list(*free_list);
4083
4084 *free_list = NULL;
4085 *resid = 0;
4086
4087 return error;
4088 }
4089
4090 int
4091 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4092 int *flagsp)
4093 {
4094 struct mbuf *m;
4095 struct mbuf *nextrecord;
4096 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4097 int error;
4098 user_ssize_t len, pktlen, delayed_copy_len = 0;
4099 struct protosw *pr = so->so_proto;
4100 user_ssize_t resid;
4101 struct proc *p = current_proc();
4102 struct uio *auio = NULL;
4103 int npkts = 0;
4104 int sblocked = 0;
4105 struct sockaddr **psa = NULL;
4106 struct mbuf **controlp = NULL;
4107 int can_delay;
4108 int flags;
4109 struct mbuf *free_others = NULL;
4110
4111 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4112 so, uiocnt,
4113 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4114
4115 /*
4116 * Sanity checks:
4117 * - Only non-blocking ("don't wait") flags are supported
4118 * - Only datagram sockets are supported (could be extended to raw)
4119 * - The socket must be atomic (sosendallatonce())
4120 * - The protocol must support packet chains
4121 * - The uio array must not be NULL (should we panic otherwise?)
4122 */
4123 if (flagsp != NULL) {
4124 flags = *flagsp;
4125 } else {
4126 flags = 0;
4127 }
4128 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4129 MSG_NBIO)) {
4130 printf("%s invalid flags 0x%x\n", __func__, flags);
4131 error = EINVAL;
4132 goto out;
4133 }
4134 if (so->so_type != SOCK_DGRAM) {
4135 error = EINVAL;
4136 goto out;
4137 }
4138 if (sosendallatonce(so) == 0) {
4139 error = EINVAL;
4140 goto out;
4141 }
4142 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4143 error = EPROTONOSUPPORT;
4144 goto out;
4145 }
4146 if (msgarray == NULL) {
4147 printf("%s uioarray is NULL\n", __func__);
4148 error = EINVAL;
4149 goto out;
4150 }
4151 if (uiocnt == 0) {
4152 printf("%s uiocnt is 0\n", __func__);
4153 error = EINVAL;
4154 goto out;
4155 }
4156 /*
4157 * Sanity check on the length passed by the caller, as we are making
4158 * 'int' comparisons
4159 */
4160 resid = recv_msg_array_resid(msgarray, uiocnt);
4161 if (resid < 0 || resid > INT_MAX) {
4162 error = EINVAL;
4163 goto out;
4164 }
4165
4166 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4167 can_delay = 1;
4168 } else {
4169 can_delay = 0;
4170 }
4171
4172 socket_lock(so, 1);
4173 so_update_last_owner_locked(so, p);
4174 so_update_policy(so);
4175
4176 #if NECP
4177 so_update_necp_policy(so, NULL, NULL);
4178 #endif /* NECP */
4179
4180 /*
4181 * If a recv attempt is made on a previously-accepted socket
4182 * that has been marked as inactive (disconnected), reject
4183 * the request.
4184 */
4185 if (so->so_flags & SOF_DEFUNCT) {
4186 struct sockbuf *sb = &so->so_rcv;
4187
4188 error = ENOTCONN;
4189 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4190 __func__, proc_pid(p), proc_best_name(p),
4191 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4192 SOCK_DOM(so), SOCK_TYPE(so), error);
4193 /*
4194 * This socket should have been disconnected and flushed
4195 * prior to being returned from sodefunct(); there should
4196 * be no data on its receive list, so panic otherwise.
4197 */
4198 if (so->so_state & SS_DEFUNCT) {
4199 sb_empty_assert(sb, __func__);
4200 }
4201 goto release;
4202 }
4203
4204 next:
4205 /*
4206 * The uio may be empty
4207 */
4208 if (npkts >= uiocnt) {
4209 error = 0;
4210 goto release;
4211 }
4212 restart:
4213 /*
4214 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4215 * and if so just return to the caller. This could happen when
4216 * soreceive() is called by a socket upcall function during the
4217 * time the socket is freed. The socket buffer would have been
4218 * locked across the upcall, therefore we cannot put this thread
4219 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4220 * we may livelock), because the lock on the socket buffer will
4221 * only be released when the upcall routine returns to its caller.
4222 * Because the socket has been officially closed, there can be
4223 * no further read on it.
4224 */
4225 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4226 (SS_NOFDREF | SS_CANTRCVMORE)) {
4227 error = 0;
4228 goto release;
4229 }
4230
4231 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4232 if (error) {
4233 goto release;
4234 }
4235 sblocked = 1;
4236
4237 m = so->so_rcv.sb_mb;
4238 /*
4239 * Block awaiting more datagrams if needed
4240 */
4241 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4242 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4243 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4244 /*
4245 * Panic if we notice inconsistencies in the socket's
4246 * receive list; both sb_mb and sb_cc should correctly
4247 * reflect the contents of the list, otherwise we may
4248 * end up with false positives during select() or poll()
4249 * which could put the application in a bad state.
4250 */
4251 SB_MB_CHECK(&so->so_rcv);
4252
4253 if (so->so_error) {
4254 error = so->so_error;
4255 if ((flags & MSG_PEEK) == 0) {
4256 so->so_error = 0;
4257 }
4258 goto release;
4259 }
4260 if (so->so_state & SS_CANTRCVMORE) {
4261 goto release;
4262 }
4263 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4264 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4265 error = ENOTCONN;
4266 goto release;
4267 }
4268 if ((so->so_state & SS_NBIO) ||
4269 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4270 error = EWOULDBLOCK;
4271 goto release;
4272 }
4273 /*
4274 * Do not block if we got some data
4275 */
4276 if (free_list != NULL) {
4277 error = 0;
4278 goto release;
4279 }
4280
4281 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4282 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4283
4284 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4285 sblocked = 0;
4286
4287 error = sbwait(&so->so_rcv);
4288 if (error) {
4289 goto release;
4290 }
4291 goto restart;
4292 }
4293
4294 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4295 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4296 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4297
4298 /*
4299 * Consume the current uio index as we have a datagram
4300 */
4301 auio = msgarray[npkts].uio;
4302 resid = uio_resid(auio);
4303 msgarray[npkts].which |= SOCK_MSG_DATA;
4304 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4305 &msgarray[npkts].psa : NULL;
4306 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4307 &msgarray[npkts].controlp : NULL;
4308 npkts += 1;
4309 nextrecord = m->m_nextpkt;
4310
4311 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4312 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4313 if (error == ERESTART) {
4314 goto restart;
4315 } else if (error != 0) {
4316 goto release;
4317 }
4318 }
4319
4320 if (m != NULL && m->m_type == MT_CONTROL) {
4321 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4322 if (error != 0) {
4323 goto release;
4324 }
4325 }
4326
4327 if (m->m_pkthdr.len == 0) {
4328 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4329 __func__, __LINE__,
4330 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4331 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4332 m->m_type);
4333 }
4334
4335 /*
4336 * Loop to copy the mbufs of the current record
4337 * Support zero length packets
4338 */
4339 ml = NULL;
4340 pktlen = 0;
4341 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4342 if (m->m_len == 0) {
4343 panic("%p m_len zero", m);
4344 }
4345 if (m->m_type == 0) {
4346 panic("%p m_type zero", m);
4347 }
4348 /*
4349 * Clip to the residual length
4350 */
4351 if (len > m->m_len) {
4352 len = m->m_len;
4353 }
4354 pktlen += len;
4355 /*
4356 * Copy the mbufs via the uio, or delay the copy.
4357 * The sockbuf must be consistent here (sb_mb points to the
4358 * current mbuf, nextrecord points to the next record) when
4359 * we drop the socket lock; we must note any additions to
4360 * the sockbuf when we reacquire the lock.
4361 */
4362 if (len > 0 && can_delay == 0) {
4363 socket_unlock(so, 0);
4364 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4365 socket_lock(so, 0);
4366 if (error) {
4367 goto release;
4368 }
4369 } else {
4370 delayed_copy_len += len;
4371 }
4372
4373 if (len == m->m_len) {
4374 /*
4375 * m was entirely copied
4376 */
4377 sbfree(&so->so_rcv, m);
4378 nextrecord = m->m_nextpkt;
4379 m->m_nextpkt = NULL;
4380
4381 /*
4382 * Set the first packet to the head of the free list
4383 */
4384 if (free_list == NULL) {
4385 free_list = m;
4386 }
4387 /*
4388 * Link current packet to tail of free list
4389 */
4390 if (ml == NULL) {
4391 if (free_tail != NULL) {
4392 free_tail->m_nextpkt = m;
4393 }
4394 free_tail = m;
4395 }
4396 /*
4397 * Link current mbuf to last mbuf of current packet
4398 */
4399 if (ml != NULL) {
4400 ml->m_next = m;
4401 }
4402 ml = m;
4403
4404 /*
4405 * Move next buf to head of socket buffer
4406 */
4407 so->so_rcv.sb_mb = m = ml->m_next;
4408 ml->m_next = NULL;
4409
4410 if (m != NULL) {
4411 m->m_nextpkt = nextrecord;
4412 if (nextrecord == NULL) {
4413 so->so_rcv.sb_lastrecord = m;
4414 }
4415 } else {
4416 so->so_rcv.sb_mb = nextrecord;
4417 SB_EMPTY_FIXUP(&so->so_rcv);
4418 }
4419 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4420 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4421 } else {
4422 /*
4423 * Stop the loop on partial copy
4424 */
4425 break;
4426 }
4427 }
4428 #ifdef MORE_LOCKING_DEBUG
4429 if (so->so_usecount <= 1) {
4430 panic("%s: after big while so=%llx ref=%d on socket\n",
4431 __func__,
4432 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4433 /* NOTREACHED */
4434 }
4435 #endif
4436 /*
4437 * Tell the caller we made a partial copy
4438 */
4439 if (m != NULL) {
4440 if (so->so_options & SO_DONTTRUNC) {
4441 /*
4442 * Copy out the free list first, then the partial mbuf
4443 */
4444 socket_unlock(so, 0);
4445 if (delayed_copy_len) {
4446 error = sodelayed_copy_list(so, msgarray,
4447 uiocnt, &free_list, &delayed_copy_len);
4448 }
4449
4450 if (error == 0) {
4451 error = uiomove(mtod(m, caddr_t), (int)len,
4452 auio);
4453 }
4454 socket_lock(so, 0);
4455 if (error) {
4456 goto release;
4457 }
4458
4459 m->m_data += len;
4460 m->m_len -= len;
4461 so->so_rcv.sb_cc -= len;
4462 flags |= MSG_RCVMORE;
4463 } else {
4464 (void) sbdroprecord(&so->so_rcv);
4465 nextrecord = so->so_rcv.sb_mb;
4466 m = NULL;
4467 flags |= MSG_TRUNC;
4468 }
4469 }
4470
4471 if (m == NULL) {
4472 so->so_rcv.sb_mb = nextrecord;
4473 /*
4474 * First part is an inline SB_EMPTY_FIXUP(). Second
4475 * part makes sure sb_lastrecord is up-to-date if
4476 * there is still data in the socket buffer.
4477 */
4478 if (so->so_rcv.sb_mb == NULL) {
4479 so->so_rcv.sb_mbtail = NULL;
4480 so->so_rcv.sb_lastrecord = NULL;
4481 } else if (nextrecord->m_nextpkt == NULL) {
4482 so->so_rcv.sb_lastrecord = nextrecord;
4483 }
4484 SB_MB_CHECK(&so->so_rcv);
4485 }
4486 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4487 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4488
4489 /*
4490 * We can continue to the next packet as long as:
4491 * - We haven't exhausted the uio array
4492 * - There was no error
4493 * - A packet was not truncated
4494 * - We can still receive more data
4495 */
4496 if (npkts < uiocnt && error == 0 &&
4497 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4498 (so->so_state & SS_CANTRCVMORE) == 0) {
4499 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4500 sblocked = 0;
4501
4502 goto next;
4503 }
4504 if (flagsp != NULL) {
4505 *flagsp |= flags;
4506 }
4507
4508 release:
4509 /*
4510 * pru_rcvd may cause more data to be received if the socket lock
4511 * is dropped so we set MSG_HAVEMORE now based on what we know.
4512 * That way the caller won't be surprised if it receives less data
4513 * than requested.
4514 */
4515 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4516 flags |= MSG_HAVEMORE;
4517 }
4518
4519 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4520 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4521 }
4522
4523 if (sblocked) {
4524 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4525 } else {
4526 socket_unlock(so, 1);
4527 }
4528
4529 if (delayed_copy_len) {
4530 error = sodelayed_copy_list(so, msgarray, uiocnt,
4531 &free_list, &delayed_copy_len);
4532 }
4533 out:
4534 /*
4535 * Amortize the cost of freeing the mbufs
4536 */
4537 if (free_list != NULL) {
4538 m_freem_list(free_list);
4539 }
4540 if (free_others != NULL) {
4541 m_freem_list(free_others);
4542 }
4543
4544 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4545 0, 0, 0, 0);
4546 return error;
4547 }
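/*
 * Illustrative userland sketch: this routine backs the batch datagram
 * receive path. The recvmsg_x(2) call and struct msghdr_x are private
 * Darwin interfaces, so their exact shape is assumed here; the point is
 * that each array element maps onto one datagram, mirroring the per-uio
 * loop above.
 *
 *	struct msghdr_x msgs[8];
 *	struct iovec iov[8];
 *	char bufs[8][2048];
 *	memset(msgs, 0, sizeof(msgs));
 *	for (int i = 0; i < 8; i++) {
 *		iov[i].iov_base = bufs[i];
 *		iov[i].iov_len = sizeof(bufs[i]);
 *		msgs[i].msg_iov = &iov[i];
 *		msgs[i].msg_iovlen = 1;
 *	}
 *	ssize_t n = recvmsg_x(s, msgs, 8, MSG_DONTWAIT);
 *	// on success, n is the number of datagrams received
 */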
4548
4549 static int
4550 so_statistics_event_to_nstat_event(int64_t *input_options,
4551 uint64_t *nstat_event)
4552 {
4553 int error = 0;
4554 switch (*input_options) {
4555 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4556 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4557 break;
4558 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4559 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4560 break;
4561 #if (DEBUG || DEVELOPMENT)
4562 case SO_STATISTICS_EVENT_RESERVED_1:
4563 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4564 break;
4565 case SO_STATISTICS_EVENT_RESERVED_2:
4566 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4567 break;
4568 #endif /* (DEBUG || DEVELOPMENT) */
4569 default:
4570 error = EINVAL;
4571 break;
4572 }
4573 return error;
4574 }
4575
4576 /*
4577 * Returns: 0 Success
4578 * EINVAL
4579 * ENOTCONN
4580 * <pru_shutdown>:EINVAL
4581 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4582 * <pru_shutdown>:ENOBUFS[TCP]
4583 * <pru_shutdown>:EMSGSIZE[TCP]
4584 * <pru_shutdown>:EHOSTUNREACH[TCP]
4585 * <pru_shutdown>:ENETUNREACH[TCP]
4586 * <pru_shutdown>:ENETDOWN[TCP]
4587 * <pru_shutdown>:ENOMEM[TCP]
4588 * <pru_shutdown>:EACCES[TCP]
4589 * <pru_shutdown>:EMSGSIZE[TCP]
4590 * <pru_shutdown>:ENOBUFS[TCP]
4591 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4592 * <pru_shutdown>:??? [other protocol families]
4593 */
4594 int
4595 soshutdown(struct socket *so, int how)
4596 {
4597 int error;
4598
4599 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4600
4601 switch (how) {
4602 case SHUT_RD:
4603 case SHUT_WR:
4604 case SHUT_RDWR:
4605 socket_lock(so, 1);
4606 if ((so->so_state &
4607 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4608 error = ENOTCONN;
4609 } else {
4610 error = soshutdownlock(so, how);
4611 }
4612 socket_unlock(so, 1);
4613 break;
4614 default:
4615 error = EINVAL;
4616 break;
4617 }
4618
4619 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4620
4621 return error;
4622 }
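/*
 * Userland view (sketch): shutdown(2) maps onto this routine; per the
 * state check above, a socket that is neither connected, connecting,
 * nor disconnecting fails with ENOTCONN.
 *
 *	if (shutdown(s, SHUT_WR) == -1 && errno == ENOTCONN) {
 *		// never connected, or connection already torn down
 *	}
 */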
4623
4624 int
4625 soshutdownlock_final(struct socket *so, int how)
4626 {
4627 struct protosw *pr = so->so_proto;
4628 int error = 0;
4629
4630 sflt_notify(so, sock_evt_shutdown, &how);
4631
4632 if (how != SHUT_WR) {
4633 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4634 /* read already shut down */
4635 error = ENOTCONN;
4636 goto done;
4637 }
4638 sorflush(so);
4639 postevent(so, 0, EV_RCLOSED);
4640 }
4641 if (how != SHUT_RD) {
4642 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4643 /* write already shut down */
4644 error = ENOTCONN;
4645 goto done;
4646 }
4647 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4648 postevent(so, 0, EV_WCLOSED);
4649 }
4650 done:
4651 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4652 return error;
4653 }
4654
4655 int
4656 soshutdownlock(struct socket *so, int how)
4657 {
4658 int error = 0;
4659
4660 #if CONTENT_FILTER
4661 /*
4662 * A content filter may delay the actual shutdown until it
4663 * has processed the pending data
4664 */
4665 if (so->so_flags & SOF_CONTENT_FILTER) {
4666 error = cfil_sock_shutdown(so, &how);
4667 if (error == EJUSTRETURN) {
4668 error = 0;
4669 goto done;
4670 } else if (error != 0) {
4671 goto done;
4672 }
4673 }
4674 #endif /* CONTENT_FILTER */
4675
4676 error = soshutdownlock_final(so, how);
4677
4678 done:
4679 return error;
4680 }
4681
4682 void
4683 sowflush(struct socket *so)
4684 {
4685 struct sockbuf *sb = &so->so_snd;
4686
4687 /*
4688 * Obtain lock on the socket buffer (SB_LOCK). This is required
4689 * to prevent the socket buffer from being unexpectedly altered
4690 * while it is used by another thread in socket send/receive.
4691 *
4692 * sblock() must not fail here, hence the assertion.
4693 */
4694 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4695 VERIFY(sb->sb_flags & SB_LOCK);
4696
4697 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4698 sb->sb_flags |= SB_DROP;
4699 sb->sb_upcall = NULL;
4700 sb->sb_upcallarg = NULL;
4701
4702 sbunlock(sb, TRUE); /* keep socket locked */
4703
4704 selthreadclear(&sb->sb_sel);
4705 sbrelease(sb);
4706 }
4707
4708 void
4709 sorflush(struct socket *so)
4710 {
4711 struct sockbuf *sb = &so->so_rcv;
4712 struct protosw *pr = so->so_proto;
4713 struct sockbuf asb;
4714 #ifdef notyet
4715 lck_mtx_t *mutex_held;
4716 /*
4717 * XXX: This code is currently commented out, because we may get here
4718 * as part of sofreelastref(), and at that time, pr_getlock() may no
4719 * longer be able to return us the lock; this will be fixed in future.
4720 */
4721 if (so->so_proto->pr_getlock != NULL) {
4722 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4723 } else {
4724 mutex_held = so->so_proto->pr_domain->dom_mtx;
4725 }
4726
4727 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4728 #endif /* notyet */
4729
4730 sflt_notify(so, sock_evt_flush_read, NULL);
4731
4732 socantrcvmore(so);
4733
4734 /*
4735 * Obtain lock on the socket buffer (SB_LOCK). This is required
4736 * to prevent the socket buffer from being unexpectedly altered
4737 * while it is used by another thread in socket send/receive.
4738 *
4739 * sblock() must not fail here, hence the assertion.
4740 */
4741 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4742 VERIFY(sb->sb_flags & SB_LOCK);
4743
4744 /*
4745 * Copy only the relevant fields from "sb" to "asb" which we
4746 * need for sbrelease() to function. In particular, skip
4747 * sb_sel as it contains the wait queue linkage, which would
4748 * wreak havoc if we were to issue selthreadclear() on "asb".
4749 * Make sure to not carry over SB_LOCK in "asb", as we need
4750 * to acquire it later as part of sbrelease().
4751 */
4752 bzero(&asb, sizeof(asb));
4753 asb.sb_cc = sb->sb_cc;
4754 asb.sb_hiwat = sb->sb_hiwat;
4755 asb.sb_mbcnt = sb->sb_mbcnt;
4756 asb.sb_mbmax = sb->sb_mbmax;
4757 asb.sb_ctl = sb->sb_ctl;
4758 asb.sb_lowat = sb->sb_lowat;
4759 asb.sb_mb = sb->sb_mb;
4760 asb.sb_mbtail = sb->sb_mbtail;
4761 asb.sb_lastrecord = sb->sb_lastrecord;
4762 asb.sb_so = sb->sb_so;
4763 asb.sb_flags = sb->sb_flags;
4764 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4765 asb.sb_flags |= SB_DROP;
4766
4767 /*
4768 * Ideally we'd bzero() these and preserve the ones we need;
4769 * but to do that we'd need to shuffle things around in the
4770 * sockbuf, and we can't do it now because there are KEXTS
4771 * that are directly referring to the socket structure.
4772 *
4773 * Setting SB_DROP acts as a barrier to prevent further appends.
4774 * Clearing SB_SEL is done for selthreadclear() below.
4775 */
4776 sb->sb_cc = 0;
4777 sb->sb_hiwat = 0;
4778 sb->sb_mbcnt = 0;
4779 sb->sb_mbmax = 0;
4780 sb->sb_ctl = 0;
4781 sb->sb_lowat = 0;
4782 sb->sb_mb = NULL;
4783 sb->sb_mbtail = NULL;
4784 sb->sb_lastrecord = NULL;
4785 sb->sb_timeo.tv_sec = 0;
4786 sb->sb_timeo.tv_usec = 0;
4787 sb->sb_upcall = NULL;
4788 sb->sb_upcallarg = NULL;
4789 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4790 sb->sb_flags |= SB_DROP;
4791
4792 sbunlock(sb, TRUE); /* keep socket locked */
4793
4794 /*
4795 * Note that selthreadclear() is called on the original "sb" and
4796 * not the local "asb" because of the way wait queue linkage is
4797 * implemented. Given that selwakeup() may be triggered, SB_SEL
4798 * should no longer be set (cleared above.)
4799 */
4800 selthreadclear(&sb->sb_sel);
4801
4802 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4803 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4804 }
4805
4806 sbrelease(&asb);
4807 }
4808
4809 /*
4810 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4811 * an additional variant to handle the case where the option value needs
4812 * to be some kind of integer, but not a specific size.
4813 * In addition to their use here, these functions are also called by the
4814 * protocol-level pr_ctloutput() routines.
4815 *
4816 * Returns: 0 Success
4817 * EINVAL
4818 * copyin:EFAULT
4819 */
4820 int
4821 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4822 {
4823 size_t valsize;
4824
4825 /*
4826 * If the user gives us more than we wanted, we ignore it,
4827 * but if we don't get the minimum length the caller
4828 * wants, we return EINVAL. On success, sopt->sopt_valsize
4829 * is set to however much we actually retrieved.
4830 */
4831 if ((valsize = sopt->sopt_valsize) < minlen) {
4832 return EINVAL;
4833 }
4834 if (valsize > len) {
4835 sopt->sopt_valsize = valsize = len;
4836 }
4837
4838 if (sopt->sopt_p != kernproc) {
4839 return copyin(sopt->sopt_val, buf, valsize);
4840 }
4841
4842 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4843 return 0;
4844 }
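/*
 * Typical in-kernel usage (sketch, mirroring the SOL_SOCKET cases
 * below): a pr_ctloutput() handler pulls a fixed-size integer option
 * value out of the sockopt, rejecting buffers shorter than the minimum.
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		return error;   // EINVAL if too short, EFAULT on bad copyin
 */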
4845
4846 /*
4847 * sooptcopyin_timeval
4848 * Copy in a timeval value into tv_p, taking into account whether the
4849 * calling process is 64-bit or 32-bit. Moved the sanity checking
4850 * code here so that we can verify the 64-bit tv_sec value before we lose
4851 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4852 */
4853 static int
4854 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4855 {
4856 int error;
4857
4858 if (proc_is64bit(sopt->sopt_p)) {
4859 struct user64_timeval tv64;
4860
4861 if (sopt->sopt_valsize < sizeof(tv64)) {
4862 return EINVAL;
4863 }
4864
4865 sopt->sopt_valsize = sizeof(tv64);
4866 if (sopt->sopt_p != kernproc) {
4867 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4868 if (error != 0) {
4869 return error;
4870 }
4871 } else {
4872 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4873 sizeof(tv64));
4874 }
4875 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4876 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4877 return EDOM;
4878 }
4879
4880 tv_p->tv_sec = tv64.tv_sec;
4881 tv_p->tv_usec = tv64.tv_usec;
4882 } else {
4883 struct user32_timeval tv32;
4884
4885 if (sopt->sopt_valsize < sizeof(tv32)) {
4886 return EINVAL;
4887 }
4888
4889 sopt->sopt_valsize = sizeof(tv32);
4890 if (sopt->sopt_p != kernproc) {
4891 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4892 if (error != 0) {
4893 return error;
4894 }
4895 } else {
4896 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4897 sizeof(tv32));
4898 }
4899 #ifndef __LP64__
4900 /*
4901 * K64todo "comparison is always false due to
4902 * limited range of data type"
4903 */
4904 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4905 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4906 return EDOM;
4907 }
4908 #endif
4909 tv_p->tv_sec = tv32.tv_sec;
4910 tv_p->tv_usec = tv32.tv_usec;
4911 }
4912 return 0;
4913 }
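/*
 * Userland view (sketch): SO_SNDTIMEO/SO_RCVTIMEO funnel through this
 * routine, so an out-of-range timeval is rejected with EDOM rather than
 * silently clamped.
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));  // ok
 *	tv.tv_usec = 1000000;                     // >= 1000000 is invalid
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));  // EDOM
 */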
4914
4915 int
4916 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4917 boolean_t ignore_delegate)
4918 {
4919 kauth_cred_t cred = NULL;
4920 proc_t ep = PROC_NULL;
4921 uid_t uid;
4922 int error = 0;
4923
4924 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4925 ep = proc_find(so->e_pid);
4926 if (ep) {
4927 cred = kauth_cred_proc_ref(ep);
4928 }
4929 }
4930
4931 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4932
4933 /* uid is 0 for root */
4934 if (uid != 0 || !allow_root) {
4935 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4936 }
4937 if (cred) {
4938 kauth_cred_unref(&cred);
4939 }
4940 if (ep != PROC_NULL) {
4941 proc_rele(ep);
4942 }
4943
4944 return error;
4945 }
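/*
 * Usage sketch (mirroring the SO_AWDL_UNRESTRICTED case below): the
 * check runs against the delegated process's credential when one is in
 * effect and ignore_delegate is false, and the privilege check is
 * skipped for root only when allow_root is true.
 *
 *	error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL,
 *	    false, false);
 */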
4946
4947 /*
4948 * Returns: 0 Success
4949 * EINVAL
4950 * ENOPROTOOPT
4951 * ENOBUFS
4952 * EDOM
4953 * sooptcopyin:EINVAL
4954 * sooptcopyin:EFAULT
4955 * sooptcopyin_timeval:EINVAL
4956 * sooptcopyin_timeval:EFAULT
4957 * sooptcopyin_timeval:EDOM
4958 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4959 * <pr_ctloutput>:???
4960 * sflt_attach_private:??? [whatever a filter author chooses]
4961 * <sf_setoption>:??? [whatever a filter author chooses]
4962 *
4963 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
4964 * all <sf_setoption> returns depend on what the filter author
4965 * causes their filter to return.
4966 */
4967 int
4968 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4969 {
4970 int error, optval;
4971 int64_t long_optval;
4972 struct linger l;
4973 struct timeval tv;
4974 #if CONFIG_MACF_SOCKET
4975 struct mac extmac;
4976 #endif /* CONFIG_MACF_SOCKET */
4977
4978 if (sopt->sopt_dir != SOPT_SET) {
4979 sopt->sopt_dir = SOPT_SET;
4980 }
4981
4982 if (dolock) {
4983 socket_lock(so, 1);
4984 }
4985
4986 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4987 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4988 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4989 /* the socket has been shutdown, no more sockopt's */
4990 error = EINVAL;
4991 goto out;
4992 }
4993
4994 error = sflt_setsockopt(so, sopt);
4995 if (error != 0) {
4996 if (error == EJUSTRETURN) {
4997 error = 0;
4998 }
4999 goto out;
5000 }
5001
5002 if (sopt->sopt_level != SOL_SOCKET) {
5003 if (so->so_proto != NULL &&
5004 so->so_proto->pr_ctloutput != NULL) {
5005 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5006 goto out;
5007 }
5008 error = ENOPROTOOPT;
5009 } else {
5010 /*
5011 * Allow socket-level (SOL_SOCKET) options to be filtered by
5012 * the protocol layer, if needed. A zero value returned from
5013 * the handler means use default socket-level processing as
5014 * done by the rest of this routine. Otherwise, any other
5015 * return value indicates that the option is unsupported.
5016 */
5017 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5018 pru_socheckopt(so, sopt)) != 0) {
5019 goto out;
5020 }
5021
5022 error = 0;
5023 switch (sopt->sopt_name) {
5024 case SO_LINGER:
5025 case SO_LINGER_SEC:
5026 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5027 if (error != 0) {
5028 goto out;
5029 }
5030
5031 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5032 l.l_linger : l.l_linger * hz;
5033 if (l.l_onoff != 0) {
5034 so->so_options |= SO_LINGER;
5035 } else {
5036 so->so_options &= ~SO_LINGER;
5037 }
5038 break;
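/*
 * Userland sketch: per the hz conversion above, SO_LINGER_SEC is the
 * variant whose l_linger is taken in seconds; plain SO_LINGER stores
 * the value unscaled.
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };  // 5 seconds
 *	setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
 */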
5039
5040 case SO_DEBUG:
5041 case SO_KEEPALIVE:
5042 case SO_DONTROUTE:
5043 case SO_USELOOPBACK:
5044 case SO_BROADCAST:
5045 case SO_REUSEADDR:
5046 case SO_REUSEPORT:
5047 case SO_OOBINLINE:
5048 case SO_TIMESTAMP:
5049 case SO_TIMESTAMP_MONOTONIC:
5050 case SO_TIMESTAMP_CONTINUOUS:
5051 case SO_DONTTRUNC:
5052 case SO_WANTMORE:
5053 case SO_WANTOOBFLAG:
5054 case SO_NOWAKEFROMSLEEP:
5055 case SO_NOAPNFALLBK:
5056 error = sooptcopyin(sopt, &optval, sizeof(optval),
5057 sizeof(optval));
5058 if (error != 0) {
5059 goto out;
5060 }
5061 if (optval) {
5062 so->so_options |= sopt->sopt_name;
5063 } else {
5064 so->so_options &= ~sopt->sopt_name;
5065 }
5066 break;
5067
5068 case SO_SNDBUF:
5069 case SO_RCVBUF:
5070 case SO_SNDLOWAT:
5071 case SO_RCVLOWAT:
5072 error = sooptcopyin(sopt, &optval, sizeof(optval),
5073 sizeof(optval));
5074 if (error != 0) {
5075 goto out;
5076 }
5077
5078 /*
5079 * Values < 1 make no sense for any of these
5080 * options, so disallow them.
5081 */
5082 if (optval < 1) {
5083 error = EINVAL;
5084 goto out;
5085 }
5086
5087 switch (sopt->sopt_name) {
5088 case SO_SNDBUF:
5089 case SO_RCVBUF: {
5090 struct sockbuf *sb =
5091 (sopt->sopt_name == SO_SNDBUF) ?
5092 &so->so_snd : &so->so_rcv;
5093 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5094 error = ENOBUFS;
5095 goto out;
5096 }
5097 sb->sb_flags |= SB_USRSIZE;
5098 sb->sb_flags &= ~SB_AUTOSIZE;
5099 sb->sb_idealsize = (u_int32_t)optval;
5100 break;
5101 }
5102 /*
5103 * Make sure the low-water is never greater than
5104 * the high-water.
5105 */
5106 case SO_SNDLOWAT: {
5107 int space = sbspace(&so->so_snd);
5108 u_int32_t hiwat = so->so_snd.sb_hiwat;
5109
5110 if (so->so_snd.sb_flags & SB_UNIX) {
5111 struct unpcb *unp =
5112 (struct unpcb *)(so->so_pcb);
5113 if (unp != NULL &&
5114 unp->unp_conn != NULL) {
5115 hiwat += unp->unp_conn->unp_cc;
5116 }
5117 }
5118
5119 so->so_snd.sb_lowat =
5120 (optval > hiwat) ?
5121 hiwat : optval;
5122
5123 if (space >= so->so_snd.sb_lowat) {
5124 sowwakeup(so);
5125 }
5126 break;
5127 }
5128 case SO_RCVLOWAT: {
5129 int64_t data_len;
5130 so->so_rcv.sb_lowat =
5131 (optval > so->so_rcv.sb_hiwat) ?
5132 so->so_rcv.sb_hiwat : optval;
5133 data_len = so->so_rcv.sb_cc
5134 - so->so_rcv.sb_ctl;
5135 if (data_len >= so->so_rcv.sb_lowat) {
5136 sorwakeup(so);
5137 }
5138 break;
5139 }
5140 }
5141 break;
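/*
 * Userland sketch: growing a receive buffer. ENOBUFS is returned when
 * sbreserve() refuses the requested size, and a low-water mark larger
 * than the high-water mark is clamped rather than rejected.
 *
 *	int sz = 256 * 1024;
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)) == -1 &&
 *	    errno == ENOBUFS) {
 *		// requested size could not be reserved
 *	}
 */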
5142
5143 case SO_SNDTIMEO:
5144 case SO_RCVTIMEO:
5145 error = sooptcopyin_timeval(sopt, &tv);
5146 if (error != 0) {
5147 goto out;
5148 }
5149
5150 switch (sopt->sopt_name) {
5151 case SO_SNDTIMEO:
5152 so->so_snd.sb_timeo = tv;
5153 break;
5154 case SO_RCVTIMEO:
5155 so->so_rcv.sb_timeo = tv;
5156 break;
5157 }
5158 break;
5159
5160 case SO_NKE: {
5161 struct so_nke nke;
5162
5163 error = sooptcopyin(sopt, &nke, sizeof(nke),
5164 sizeof(nke));
5165 if (error != 0) {
5166 goto out;
5167 }
5168
5169 error = sflt_attach_internal(so, nke.nke_handle);
5170 break;
5171 }
5172
5173 case SO_NOSIGPIPE:
5174 error = sooptcopyin(sopt, &optval, sizeof(optval),
5175 sizeof(optval));
5176 if (error != 0) {
5177 goto out;
5178 }
5179 if (optval != 0) {
5180 so->so_flags |= SOF_NOSIGPIPE;
5181 } else {
5182 so->so_flags &= ~SOF_NOSIGPIPE;
5183 }
5184 break;
5185
5186 case SO_NOADDRERR:
5187 error = sooptcopyin(sopt, &optval, sizeof(optval),
5188 sizeof(optval));
5189 if (error != 0) {
5190 goto out;
5191 }
5192 if (optval != 0) {
5193 so->so_flags |= SOF_NOADDRAVAIL;
5194 } else {
5195 so->so_flags &= ~SOF_NOADDRAVAIL;
5196 }
5197 break;
5198
5199 case SO_REUSESHAREUID:
5200 error = sooptcopyin(sopt, &optval, sizeof(optval),
5201 sizeof(optval));
5202 if (error != 0) {
5203 goto out;
5204 }
5205 if (optval != 0) {
5206 so->so_flags |= SOF_REUSESHAREUID;
5207 } else {
5208 so->so_flags &= ~SOF_REUSESHAREUID;
5209 }
5210 break;
5211
5212 case SO_NOTIFYCONFLICT:
5213 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5214 error = EPERM;
5215 goto out;
5216 }
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval != 0) {
5223 so->so_flags |= SOF_NOTIFYCONFLICT;
5224 } else {
5225 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5226 }
5227 break;
5228
5229 case SO_RESTRICTIONS:
5230 error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 sizeof(optval));
5232 if (error != 0) {
5233 goto out;
5234 }
5235
5236 error = so_set_restrictions(so, optval);
5237 break;
5238
5239 case SO_AWDL_UNRESTRICTED:
5240 if (SOCK_DOM(so) != PF_INET &&
5241 SOCK_DOM(so) != PF_INET6) {
5242 error = EOPNOTSUPP;
5243 goto out;
5244 }
5245 error = sooptcopyin(sopt, &optval, sizeof(optval),
5246 sizeof(optval));
5247 if (error != 0) {
5248 goto out;
5249 }
5250 if (optval != 0) {
5251 error = soopt_cred_check(so,
5252 PRIV_NET_RESTRICTED_AWDL, false, false);
5253 if (error == 0) {
5254 inp_set_awdl_unrestricted(
5255 sotoinpcb(so));
5256 }
5257 } else {
5258 inp_clear_awdl_unrestricted(sotoinpcb(so));
5259 }
5260 break;
5261 case SO_INTCOPROC_ALLOW:
5262 if (SOCK_DOM(so) != PF_INET6) {
5263 error = EOPNOTSUPP;
5264 goto out;
5265 }
5266 error = sooptcopyin(sopt, &optval, sizeof(optval),
5267 sizeof(optval));
5268 if (error != 0) {
5269 goto out;
5270 }
5271 if (optval != 0 &&
5272 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5273 error = soopt_cred_check(so,
5274 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5275 if (error == 0) {
5276 inp_set_intcoproc_allowed(
5277 sotoinpcb(so));
5278 }
5279 } else if (optval == 0) {
5280 inp_clear_intcoproc_allowed(sotoinpcb(so));
5281 }
5282 break;
5283
5284 case SO_LABEL:
5285 #if CONFIG_MACF_SOCKET
5286 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5287 sizeof(extmac))) != 0) {
5288 goto out;
5289 }
5290
5291 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5292 so, &extmac);
5293 #else
5294 error = EOPNOTSUPP;
5295 #endif /* CONFIG_MACF_SOCKET */
5296 break;
5297
5298 case SO_UPCALLCLOSEWAIT:
5299 error = sooptcopyin(sopt, &optval, sizeof(optval),
5300 sizeof(optval));
5301 if (error != 0) {
5302 goto out;
5303 }
5304 if (optval != 0) {
5305 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5306 } else {
5307 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5308 }
5309 break;
5310
5311 case SO_RANDOMPORT:
5312 error = sooptcopyin(sopt, &optval, sizeof(optval),
5313 sizeof(optval));
5314 if (error != 0) {
5315 goto out;
5316 }
5317 if (optval != 0) {
5318 so->so_flags |= SOF_BINDRANDOMPORT;
5319 } else {
5320 so->so_flags &= ~SOF_BINDRANDOMPORT;
5321 }
5322 break;
5323
5324 case SO_NP_EXTENSIONS: {
5325 struct so_np_extensions sonpx;
5326
5327 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5328 sizeof(sonpx));
5329 if (error != 0) {
5330 goto out;
5331 }
5332 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5333 error = EINVAL;
5334 goto out;
5335 }
5336 /*
5337 * Only one bit defined for now
5338 */
5339 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5340 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5341 so->so_flags |= SOF_NPX_SETOPTSHUT;
5342 } else {
5343 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5344 }
5345 }
5346 break;
5347 }
5348
5349 case SO_TRAFFIC_CLASS: {
5350 error = sooptcopyin(sopt, &optval, sizeof(optval),
5351 sizeof(optval));
5352 if (error != 0) {
5353 goto out;
5354 }
5355 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5356 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5357 error = so_set_net_service_type(so, netsvc);
5358 goto out;
5359 }
5360 error = so_set_traffic_class(so, optval);
5361 if (error != 0) {
5362 goto out;
5363 }
5364 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5365 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5366 break;
5367 }
5368
5369 case SO_RECV_TRAFFIC_CLASS: {
5370 error = sooptcopyin(sopt, &optval, sizeof(optval),
5371 sizeof(optval));
5372 if (error != 0) {
5373 goto out;
5374 }
5375 if (optval == 0) {
5376 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5377 } else {
5378 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5379 }
5380 break;
5381 }
5382
5383 #if (DEVELOPMENT || DEBUG)
5384 case SO_TRAFFIC_CLASS_DBG: {
5385 struct so_tcdbg so_tcdbg;
5386
5387 error = sooptcopyin(sopt, &so_tcdbg,
5388 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5389 if (error != 0) {
5390 goto out;
5391 }
5392 error = so_set_tcdbg(so, &so_tcdbg);
5393 if (error != 0) {
5394 goto out;
5395 }
5396 break;
5397 }
5398 #endif /* (DEVELOPMENT || DEBUG) */
5399
5400 case SO_PRIVILEGED_TRAFFIC_CLASS:
5401 error = priv_check_cred(kauth_cred_get(),
5402 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5403 if (error != 0) {
5404 goto out;
5405 }
5406 error = sooptcopyin(sopt, &optval, sizeof(optval),
5407 sizeof(optval));
5408 if (error != 0) {
5409 goto out;
5410 }
5411 if (optval == 0) {
5412 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5413 } else {
5414 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5415 }
5416 break;
5417
5418 #if (DEVELOPMENT || DEBUG)
5419 case SO_DEFUNCTIT:
5420 error = sosetdefunct(current_proc(), so, 0, FALSE);
5421 if (error == 0) {
5422 error = sodefunct(current_proc(), so, 0);
5423 }
5424
5425 break;
5426 #endif /* (DEVELOPMENT || DEBUG) */
5427
5428 case SO_DEFUNCTOK:
5429 error = sooptcopyin(sopt, &optval, sizeof(optval),
5430 sizeof(optval));
5431 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5432 if (error == 0) {
5433 error = EBADF;
5434 }
5435 goto out;
5436 }
5437 /*
5438 * Any process can set SO_DEFUNCTOK (clear
5439 * SOF_NODEFUNCT), but only root can clear
5440 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5441 */
5442 if (optval == 0 &&
5443 kauth_cred_issuser(kauth_cred_get()) == 0) {
5444 error = EPERM;
5445 goto out;
5446 }
5447 if (optval) {
5448 so->so_flags &= ~SOF_NODEFUNCT;
5449 } else {
5450 so->so_flags |= SOF_NODEFUNCT;
5451 }
5452
5453 if (SOCK_DOM(so) == PF_INET ||
5454 SOCK_DOM(so) == PF_INET6) {
5455 char s[MAX_IPv6_STR_LEN];
5456 char d[MAX_IPv6_STR_LEN];
5457 struct inpcb *inp = sotoinpcb(so);
5458
5459 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5460 "[%s %s:%d -> %s:%d] is now marked "
5461 "as %seligible for "
5462 "defunct\n", __func__, proc_selfpid(),
5463 proc_best_name(current_proc()),
5464 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5465 (SOCK_TYPE(so) == SOCK_STREAM) ?
5466 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5467 ((SOCK_DOM(so) == PF_INET) ?
5468 (void *)&inp->inp_laddr.s_addr :
5469 (void *)&inp->in6p_laddr), s, sizeof(s)),
5470 ntohs(inp->in6p_lport),
5471 inet_ntop(SOCK_DOM(so),
5472 (SOCK_DOM(so) == PF_INET) ?
5473 (void *)&inp->inp_faddr.s_addr :
5474 (void *)&inp->in6p_faddr, d, sizeof(d)),
5475 ntohs(inp->in6p_fport),
5476 (so->so_flags & SOF_NODEFUNCT) ?
5477 "not " : "");
5478 } else {
5479 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5480 "is now marked as %seligible for "
5481 "defunct\n",
5482 __func__, proc_selfpid(),
5483 proc_best_name(current_proc()),
5484 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5485 SOCK_DOM(so), SOCK_TYPE(so),
5486 (so->so_flags & SOF_NODEFUNCT) ?
5487 "not " : "");
5488 }
5489 break;
5490
5491 case SO_ISDEFUNCT:
5492 /* This option is not settable */
5493 error = EINVAL;
5494 break;
5495
5496 case SO_OPPORTUNISTIC:
5497 error = sooptcopyin(sopt, &optval, sizeof(optval),
5498 sizeof(optval));
5499 if (error == 0) {
5500 error = so_set_opportunistic(so, optval);
5501 }
5502 break;
5503
5504 case SO_FLUSH:
5505 /* This option is handled by lower layer(s) */
5506 error = 0;
5507 break;
5508
5509 case SO_RECV_ANYIF:
5510 error = sooptcopyin(sopt, &optval, sizeof(optval),
5511 sizeof(optval));
5512 if (error == 0) {
5513 error = so_set_recv_anyif(so, optval);
5514 }
5515 break;
5516
5517 case SO_TRAFFIC_MGT_BACKGROUND: {
5518 /* This option is handled by lower layer(s) */
5519 error = 0;
5520 break;
5521 }
5522
5523 #if FLOW_DIVERT
5524 case SO_FLOW_DIVERT_TOKEN:
5525 error = flow_divert_token_set(so, sopt);
5526 break;
5527 #endif /* FLOW_DIVERT */
5528
5529
5530 case SO_DELEGATED:
5531 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5532 sizeof(optval))) != 0) {
5533 break;
5534 }
5535
5536 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5537 break;
5538
5539 case SO_DELEGATED_UUID: {
5540 uuid_t euuid;
5541
5542 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5543 sizeof(euuid))) != 0) {
5544 break;
5545 }
5546
5547 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5548 break;
5549 }
5550
5551 #if NECP
5552 case SO_NECP_ATTRIBUTES:
5553 error = necp_set_socket_attributes(so, sopt);
5554 break;
5555
5556 case SO_NECP_CLIENTUUID: {
5557 if (SOCK_DOM(so) == PF_MULTIPATH) {
5558 /* Handled by MPTCP itself */
5559 break;
5560 }
5561
5562 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5563 error = EINVAL;
5564 goto out;
5565 }
5566
5567 struct inpcb *inp = sotoinpcb(so);
5568 if (!uuid_is_null(inp->necp_client_uuid)) {
5569 // Clear out the old client UUID if present
5570 necp_inpcb_remove_cb(inp);
5571 }
5572
5573 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5574 sizeof(uuid_t), sizeof(uuid_t));
5575 if (error != 0) {
5576 goto out;
5577 }
5578
5579 if (uuid_is_null(inp->necp_client_uuid)) {
5580 error = EINVAL;
5581 goto out;
5582 }
5583
5584 pid_t current_pid = proc_pid(current_proc());
5585 error = necp_client_register_socket_flow(current_pid,
5586 inp->necp_client_uuid, inp);
5587 if (error != 0) {
5588 uuid_clear(inp->necp_client_uuid);
5589 goto out;
5590 }
5591
5592 if (inp->inp_lport != 0) {
5593 // There is a bound local port, so this is not
5594 // a fresh socket. Assign to the client.
5595 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5596 }
5597
5598 break;
5599 }
5600 case SO_NECP_LISTENUUID: {
5601 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5602 error = EINVAL;
5603 goto out;
5604 }
5605
5606 struct inpcb *inp = sotoinpcb(so);
5607 if (!uuid_is_null(inp->necp_client_uuid)) {
5608 error = EINVAL;
5609 goto out;
5610 }
5611
5612 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5613 sizeof(uuid_t), sizeof(uuid_t));
5614 if (error != 0) {
5615 goto out;
5616 }
5617
5618 if (uuid_is_null(inp->necp_client_uuid)) {
5619 error = EINVAL;
5620 goto out;
5621 }
5622
5623 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5624 inp->necp_client_uuid, inp);
5625 if (error != 0) {
5626 uuid_clear(inp->necp_client_uuid);
5627 goto out;
5628 }
5629
5630 // Mark that the port registration is held by NECP
5631 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5632
5633 break;
5634 }
5635 #endif /* NECP */
5636
5637 case SO_EXTENDED_BK_IDLE:
5638 error = sooptcopyin(sopt, &optval, sizeof(optval),
5639 sizeof(optval));
5640 if (error == 0) {
5641 error = so_set_extended_bk_idle(so, optval);
5642 }
5643 break;
5644
5645 case SO_MARK_CELLFALLBACK:
5646 error = sooptcopyin(sopt, &optval, sizeof(optval),
5647 sizeof(optval));
5648 if (error != 0) {
5649 goto out;
5650 }
5651 if (optval < 0) {
5652 error = EINVAL;
5653 goto out;
5654 }
5655 if (optval == 0) {
5656 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5657 } else {
5658 so->so_flags1 |= SOF1_CELLFALLBACK;
5659 }
5660 break;
5661
5662 case SO_STATISTICS_EVENT:
5663 error = sooptcopyin(sopt, &long_optval,
5664 sizeof(long_optval), sizeof(long_optval));
5665 if (error != 0) {
5666 goto out;
5667 }
5668 u_int64_t nstat_event = 0;
5669 error = so_statistics_event_to_nstat_event(
5670 &long_optval, &nstat_event);
5671 if (error != 0) {
5672 goto out;
5673 }
5674 nstat_pcb_event(sotoinpcb(so), nstat_event);
5675 break;
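/*
 * Sketch (private option, exact usage assumed): SO_STATISTICS_EVENT
 * takes a 64-bit event code which so_statistics_event_to_nstat_event()
 * maps to an nstat event before it is posted on the PCB.
 *
 *	int64_t ev = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	setsockopt(s, SOL_SOCKET, SO_STATISTICS_EVENT, &ev, sizeof(ev));
 */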
5676
5677 case SO_NET_SERVICE_TYPE: {
5678 error = sooptcopyin(sopt, &optval, sizeof(optval),
5679 sizeof(optval));
5680 if (error != 0) {
5681 goto out;
5682 }
5683 error = so_set_net_service_type(so, optval);
5684 break;
5685 }
5686
5687 case SO_QOSMARKING_POLICY_OVERRIDE:
5688 error = priv_check_cred(kauth_cred_get(),
5689 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5690 if (error != 0) {
5691 goto out;
5692 }
5693 error = sooptcopyin(sopt, &optval, sizeof(optval),
5694 sizeof(optval));
5695 if (error != 0) {
5696 goto out;
5697 }
5698 if (optval == 0) {
5699 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5700 } else {
5701 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5702 }
5703 break;
5704
5705 case SO_MPKL_SEND_INFO: {
5706 struct so_mpkl_send_info so_mpkl_send_info;
5707
5708 error = sooptcopyin(sopt, &so_mpkl_send_info,
5709 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5710 if (error != 0) {
5711 goto out;
5712 }
5713 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5714 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5715
5716 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5717 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5718 } else {
5719 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5720 }
5721 break;
5722 }
5723 default:
5724 error = ENOPROTOOPT;
5725 break;
5726 }
5727 if (error == 0 && so->so_proto != NULL &&
5728 so->so_proto->pr_ctloutput != NULL) {
5729 (void) so->so_proto->pr_ctloutput(so, sopt);
5730 }
5731 }
5732 out:
5733 if (dolock) {
5734 socket_unlock(so, 1);
5735 }
5736 return error;
5737 }
5738
5739 /* Helper routines for getsockopt */
5740 int
5741 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5742 {
5743 int error;
5744 size_t valsize;
5745
5746 error = 0;
5747
5748 /*
5749 * Documented get behavior is that we always return a value,
5750 * possibly truncated to fit in the user's buffer.
5751 * Traditional behavior is that we always tell the user
5752 * precisely how much we copied, rather than something useful
5753 * like the total amount we had available for her.
5754 * Note that this interface is not idempotent; the entire answer must
5755 * be generated ahead of time.
5756 */
5757 valsize = min(len, sopt->sopt_valsize);
5758 sopt->sopt_valsize = valsize;
5759 if (sopt->sopt_val != USER_ADDR_NULL) {
5760 if (sopt->sopt_p != kernproc) {
5761 error = copyout(buf, sopt->sopt_val, valsize);
5762 } else {
5763 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5764 }
5765 }
5766 return error;
5767 }
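/*
 * Typical in-kernel usage (sketch, matching the "integer:" pattern in
 * sogetoptlock() below): a getsockopt handler materializes the full
 * answer first, then lets sooptcopyout() truncate it to the caller's
 * buffer.
 *
 *	int optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof(optval));
 */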
5768
5769 static int
5770 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5771 {
5772 int error;
5773 size_t len;
5774 struct user64_timeval tv64 = {};
5775 struct user32_timeval tv32 = {};
5776 const void * val;
5777 size_t valsize;
5778
5779 error = 0;
5780 if (proc_is64bit(sopt->sopt_p)) {
5781 len = sizeof(tv64);
5782 tv64.tv_sec = tv_p->tv_sec;
5783 tv64.tv_usec = tv_p->tv_usec;
5784 val = &tv64;
5785 } else {
5786 len = sizeof(tv32);
5787 tv32.tv_sec = tv_p->tv_sec;
5788 tv32.tv_usec = tv_p->tv_usec;
5789 val = &tv32;
5790 }
5791 valsize = min(len, sopt->sopt_valsize);
5792 sopt->sopt_valsize = valsize;
5793 if (sopt->sopt_val != USER_ADDR_NULL) {
5794 if (sopt->sopt_p != kernproc) {
5795 error = copyout(val, sopt->sopt_val, valsize);
5796 } else {
5797 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5798 }
5799 }
5800 return error;
5801 }
5802
5803 /*
5804 * Return: 0 Success
5805 * ENOPROTOOPT
5806 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5807 * <pr_ctloutput>:???
5808 * <sf_getoption>:???
5809 */
5810 int
5811 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5812 {
5813 int error, optval;
5814 struct linger l;
5815 struct timeval tv;
5816 #if CONFIG_MACF_SOCKET
5817 struct mac extmac;
5818 #endif /* CONFIG_MACF_SOCKET */
5819
5820 if (sopt->sopt_dir != SOPT_GET) {
5821 sopt->sopt_dir = SOPT_GET;
5822 }
5823
5824 if (dolock) {
5825 socket_lock(so, 1);
5826 }
5827
5828 error = sflt_getsockopt(so, sopt);
5829 if (error != 0) {
5830 if (error == EJUSTRETURN) {
5831 error = 0;
5832 }
5833 goto out;
5834 }
5835
5836 if (sopt->sopt_level != SOL_SOCKET) {
5837 if (so->so_proto != NULL &&
5838 so->so_proto->pr_ctloutput != NULL) {
5839 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5840 goto out;
5841 }
5842 error = ENOPROTOOPT;
5843 } else {
5844 /*
5845 * Allow socket-level (SOL_SOCKET) options to be filtered by
5846 * the protocol layer, if needed. A zero value returned from
5847 * the handler means use default socket-level processing as
5848 * done by the rest of this routine. Otherwise, any other
5849 * return value indicates that the option is unsupported.
5850 */
5851 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5852 pru_socheckopt(so, sopt)) != 0) {
5853 goto out;
5854 }
5855
5856 error = 0;
5857 switch (sopt->sopt_name) {
5858 case SO_LINGER:
5859 case SO_LINGER_SEC:
5860 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5861 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5862 so->so_linger : so->so_linger / hz;
5863 error = sooptcopyout(sopt, &l, sizeof(l));
5864 break;
5865
5866 case SO_USELOOPBACK:
5867 case SO_DONTROUTE:
5868 case SO_DEBUG:
5869 case SO_KEEPALIVE:
5870 case SO_REUSEADDR:
5871 case SO_REUSEPORT:
5872 case SO_BROADCAST:
5873 case SO_OOBINLINE:
5874 case SO_TIMESTAMP:
5875 case SO_TIMESTAMP_MONOTONIC:
5876 case SO_TIMESTAMP_CONTINUOUS:
5877 case SO_DONTTRUNC:
5878 case SO_WANTMORE:
5879 case SO_WANTOOBFLAG:
5880 case SO_NOWAKEFROMSLEEP:
5881 case SO_NOAPNFALLBK:
5882 optval = so->so_options & sopt->sopt_name;
5883 integer:
5884 error = sooptcopyout(sopt, &optval, sizeof(optval));
5885 break;
5886
5887 case SO_TYPE:
5888 optval = so->so_type;
5889 goto integer;
5890
5891 case SO_NREAD:
5892 if (so->so_proto->pr_flags & PR_ATOMIC) {
5893 int pkt_total;
5894 struct mbuf *m1;
5895
5896 pkt_total = 0;
5897 m1 = so->so_rcv.sb_mb;
5898 while (m1 != NULL) {
5899 if (m1->m_type == MT_DATA ||
5900 m1->m_type == MT_HEADER ||
5901 m1->m_type == MT_OOBDATA) {
5902 pkt_total += m1->m_len;
5903 }
5904 m1 = m1->m_next;
5905 }
5906 optval = pkt_total;
5907 } else {
5908 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5909 }
5910 goto integer;
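/*
 * Userland sketch: per the record walk above, for atomic (datagram)
 * protocols SO_NREAD reports the size of the next record rather than
 * the total number of bytes buffered.
 *
 *	int nread = 0;
 *	socklen_t len = sizeof(nread);
 *	getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len);
 */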
5911
5912 case SO_NUMRCVPKT:
5913 if (so->so_proto->pr_flags & PR_ATOMIC) {
5914 int cnt = 0;
5915 struct mbuf *m1;
5916
5917 m1 = so->so_rcv.sb_mb;
5918 while (m1 != NULL) {
5919 cnt += 1;
5920 m1 = m1->m_nextpkt;
5921 }
5922 optval = cnt;
5923 goto integer;
5924 } else {
5925 error = ENOPROTOOPT;
5926 break;
5927 }
5928
5929 case SO_NWRITE:
5930 optval = so->so_snd.sb_cc;
5931 goto integer;
5932
5933 case SO_ERROR:
5934 optval = so->so_error;
5935 so->so_error = 0;
5936 goto integer;
5937
5938 case SO_SNDBUF: {
5939 u_int32_t hiwat = so->so_snd.sb_hiwat;
5940
5941 if (so->so_snd.sb_flags & SB_UNIX) {
5942 struct unpcb *unp =
5943 (struct unpcb *)(so->so_pcb);
5944 if (unp != NULL && unp->unp_conn != NULL) {
5945 hiwat += unp->unp_conn->unp_cc;
5946 }
5947 }
5948
5949 optval = hiwat;
5950 goto integer;
5951 }
5952 case SO_RCVBUF:
5953 optval = so->so_rcv.sb_hiwat;
5954 goto integer;
5955
5956 case SO_SNDLOWAT:
5957 optval = so->so_snd.sb_lowat;
5958 goto integer;
5959
5960 case SO_RCVLOWAT:
5961 optval = so->so_rcv.sb_lowat;
5962 goto integer;
5963
5964 case SO_SNDTIMEO:
5965 case SO_RCVTIMEO:
5966 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5967 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5968
5969 error = sooptcopyout_timeval(sopt, &tv);
5970 break;
5971
5972 case SO_NOSIGPIPE:
5973 optval = (so->so_flags & SOF_NOSIGPIPE);
5974 goto integer;
5975
5976 case SO_NOADDRERR:
5977 optval = (so->so_flags & SOF_NOADDRAVAIL);
5978 goto integer;
5979
5980 case SO_REUSESHAREUID:
5981 optval = (so->so_flags & SOF_REUSESHAREUID);
5982 goto integer;
5983
5984
5985 case SO_NOTIFYCONFLICT:
5986 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5987 goto integer;
5988
5989 case SO_RESTRICTIONS:
5990 optval = so_get_restrictions(so);
5991 goto integer;
5992
5993 case SO_AWDL_UNRESTRICTED:
5994 if (SOCK_DOM(so) == PF_INET ||
5995 SOCK_DOM(so) == PF_INET6) {
5996 optval = inp_get_awdl_unrestricted(
5997 sotoinpcb(so));
5998 goto integer;
5999 } else {
6000 error = EOPNOTSUPP;
6001 }
6002 break;
6003
6004 case SO_INTCOPROC_ALLOW:
6005 if (SOCK_DOM(so) == PF_INET6) {
6006 optval = inp_get_intcoproc_allowed(
6007 sotoinpcb(so));
6008 goto integer;
6009 } else {
6010 error = EOPNOTSUPP;
6011 }
6012 break;
6013
6014 case SO_LABEL:
6015 #if CONFIG_MACF_SOCKET
6016 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6017 sizeof(extmac))) != 0 ||
6018 (error = mac_socket_label_get(proc_ucred(
6019 sopt->sopt_p), so, &extmac)) != 0) {
6020 break;
6021 }
6022
6023 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6024 #else
6025 error = EOPNOTSUPP;
6026 #endif /* CONFIG_MACF_SOCKET */
6027 break;
6028
6029 case SO_PEERLABEL:
6030 #if CONFIG_MACF_SOCKET
6031 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6032 sizeof(extmac))) != 0 ||
6033 (error = mac_socketpeer_label_get(proc_ucred(
6034 sopt->sopt_p), so, &extmac)) != 0) {
6035 break;
6036 }
6037
6038 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6039 #else
6040 error = EOPNOTSUPP;
6041 #endif /* CONFIG_MACF_SOCKET */
6042 break;
6043
6044 #ifdef __APPLE_API_PRIVATE
6045 case SO_UPCALLCLOSEWAIT:
6046 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6047 goto integer;
6048 #endif
6049 case SO_RANDOMPORT:
6050 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6051 goto integer;
6052
6053 case SO_NP_EXTENSIONS: {
6054 struct so_np_extensions sonpx = {};
6055
6056 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6057 SONPX_SETOPTSHUT : 0;
6058 sonpx.npx_mask = SONPX_MASK_VALID;
6059
6060 error = sooptcopyout(sopt, &sonpx,
6061 sizeof(struct so_np_extensions));
6062 break;
6063 }
6064
6065 case SO_TRAFFIC_CLASS:
6066 optval = so->so_traffic_class;
6067 goto integer;
6068
6069 case SO_RECV_TRAFFIC_CLASS:
6070 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6071 goto integer;
6072
6073 case SO_TRAFFIC_CLASS_STATS:
6074 error = sooptcopyout(sopt, &so->so_tc_stats,
6075 sizeof(so->so_tc_stats));
6076 break;
6077
6078 #if (DEVELOPMENT || DEBUG)
6079 case SO_TRAFFIC_CLASS_DBG:
6080 error = sogetopt_tcdbg(so, sopt);
6081 break;
6082 #endif /* (DEVELOPMENT || DEBUG) */
6083
6084 case SO_PRIVILEGED_TRAFFIC_CLASS:
6085 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6086 goto integer;
6087
6088 case SO_DEFUNCTOK:
6089 optval = !(so->so_flags & SOF_NODEFUNCT);
6090 goto integer;
6091
6092 case SO_ISDEFUNCT:
6093 optval = (so->so_flags & SOF_DEFUNCT);
6094 goto integer;
6095
6096 case SO_OPPORTUNISTIC:
6097 optval = so_get_opportunistic(so);
6098 goto integer;
6099
6100 case SO_FLUSH:
6101 /* This option is not gettable */
6102 error = EINVAL;
6103 break;
6104
6105 case SO_RECV_ANYIF:
6106 optval = so_get_recv_anyif(so);
6107 goto integer;
6108
6109 case SO_TRAFFIC_MGT_BACKGROUND:
6110 /* This option is handled by lower layer(s) */
6111 if (so->so_proto != NULL &&
6112 so->so_proto->pr_ctloutput != NULL) {
6113 (void) so->so_proto->pr_ctloutput(so, sopt);
6114 }
6115 break;
6116
6117 #if FLOW_DIVERT
6118 case SO_FLOW_DIVERT_TOKEN:
6119 error = flow_divert_token_get(so, sopt);
6120 break;
6121 #endif /* FLOW_DIVERT */
6122
6123 #if NECP
6124 case SO_NECP_ATTRIBUTES:
6125 error = necp_get_socket_attributes(so, sopt);
6126 break;
6127
6128 case SO_NECP_CLIENTUUID: {
6129 uuid_t *ncu;
6130
6131 if (SOCK_DOM(so) == PF_MULTIPATH) {
6132 ncu = &mpsotomppcb(so)->necp_client_uuid;
6133 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6134 ncu = &sotoinpcb(so)->necp_client_uuid;
6135 } else {
6136 error = EINVAL;
6137 goto out;
6138 }
6139
6140 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6141 break;
6142 }
6143
6144 case SO_NECP_LISTENUUID: {
6145 uuid_t *nlu;
6146
6147 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6148 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6149 nlu = &sotoinpcb(so)->necp_client_uuid;
6150 } else {
6151 error = ENOENT;
6152 goto out;
6153 }
6154 } else {
6155 error = EINVAL;
6156 goto out;
6157 }
6158
6159 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6160 break;
6161 }
6162 #endif /* NECP */
6163
6164 #if CONTENT_FILTER
6165 case SO_CFIL_SOCK_ID: {
6166 cfil_sock_id_t sock_id;
6167
6168 sock_id = cfil_sock_id_from_socket(so);
6169
6170 error = sooptcopyout(sopt, &sock_id,
6171 sizeof(cfil_sock_id_t));
6172 break;
6173 }
6174 #endif /* CONTENT_FILTER */
6175
6176 case SO_EXTENDED_BK_IDLE:
6177 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6178 goto integer;
6179 case SO_MARK_CELLFALLBACK:
6180 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6181 ? 1 : 0;
6182 goto integer;
6183 case SO_NET_SERVICE_TYPE: {
6184 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6185 optval = so->so_netsvctype;
6186 } else {
6187 optval = NET_SERVICE_TYPE_BE;
6188 }
6189 goto integer;
6190 }
6191 case SO_NETSVC_MARKING_LEVEL:
6192 optval = so_get_netsvc_marking_level(so);
6193 goto integer;
6194
6195 case SO_MPKL_SEND_INFO: {
6196 struct so_mpkl_send_info so_mpkl_send_info;
6197
6198 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6199 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6200 error = sooptcopyout(sopt, &so_mpkl_send_info,
6201 sizeof(struct so_mpkl_send_info));
6202 break;
6203 }
6204 default:
6205 error = ENOPROTOOPT;
6206 break;
6207 }
6208 }
6209 out:
6210 if (dolock) {
6211 socket_unlock(so, 1);
6212 }
6213 return error;
6214 }
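
/*
 * A minimal user-space sketch of reading one of the integer-valued
 * options handled above, using the public SO_NET_SERVICE_TYPE option
 * (which the switch maps to so->so_netsvctype, defaulting to
 * NET_SERVICE_TYPE_BE). Illustrative only.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static int
get_net_service_type(int fd)
{
	int nst = 0;
	socklen_t len = sizeof(nst);

	if (getsockopt(fd, SOL_SOCKET, SO_NET_SERVICE_TYPE, &nst, &len) == -1) {
		perror("getsockopt(SO_NET_SERVICE_TYPE)");
		return -1;
	}
	return nst;	/* NET_SERVICE_TYPE_BE unless explicitly set */
}
#endif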
6215
6216 /*
6217 * The size limit on our soopt_getm is different from that on FreeBSD.
6218 * We limit the size of options to MCLBYTES. This will have to change
6219 * if we need to define options that need more space than MCLBYTES.
6220 */
6221 int
6222 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6223 {
6224 struct mbuf *m, *m_prev;
6225 int sopt_size = sopt->sopt_valsize;
6226 int how;
6227
6228 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6229 return EMSGSIZE;
6230 }
6231
6232 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6233 MGET(m, how, MT_DATA);
6234 if (m == NULL) {
6235 return ENOBUFS;
6236 }
6237 if (sopt_size > MLEN) {
6238 MCLGET(m, how);
6239 if ((m->m_flags & M_EXT) == 0) {
6240 m_free(m);
6241 return ENOBUFS;
6242 }
6243 m->m_len = min(MCLBYTES, sopt_size);
6244 } else {
6245 m->m_len = min(MLEN, sopt_size);
6246 }
6247 sopt_size -= m->m_len;
6248 *mp = m;
6249 m_prev = m;
6250
6251 while (sopt_size > 0) {
6252 MGET(m, how, MT_DATA);
6253 if (m == NULL) {
6254 m_freem(*mp);
6255 return ENOBUFS;
6256 }
6257 if (sopt_size > MLEN) {
6258 MCLGET(m, how);
6259 if ((m->m_flags & M_EXT) == 0) {
6260 m_freem(*mp);
6261 m_freem(m);
6262 return ENOBUFS;
6263 }
6264 m->m_len = min(MCLBYTES, sopt_size);
6265 } else {
6266 m->m_len = min(MLEN, sopt_size);
6267 }
6268 sopt_size -= m->m_len;
6269 m_prev->m_next = m;
6270 m_prev = m;
6271 }
6272 return 0;
6273 }
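
/*
 * The usual pairing, sketched: callers first size an mbuf chain to
 * sopt->sopt_valsize with soopt_getm(), then fill it with
 * soopt_mcopyin() below, which frees the chain if the copyin fails.
 */
#if 0
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* chain of at most MCLBYTES */
	if (error == 0) {
		error = soopt_mcopyin(sopt, m);	/* fill from sopt->sopt_val */
	}
#endif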
6274
6275 /* copyin sopt data into mbuf chain */
6276 int
6277 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6278 {
6279 struct mbuf *m0 = m;
6280
6281 if (sopt->sopt_val == USER_ADDR_NULL) {
6282 return 0;
6283 }
6284 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6285 if (sopt->sopt_p != kernproc) {
6286 int error;
6287
6288 error = copyin(sopt->sopt_val, mtod(m, char *),
6289 m->m_len);
6290 if (error != 0) {
6291 m_freem(m0);
6292 return error;
6293 }
6294 } else {
6295 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6296 mtod(m, char *), m->m_len);
6297 }
6298 sopt->sopt_valsize -= m->m_len;
6299 sopt->sopt_val += m->m_len;
6300 m = m->m_next;
6301 }
6302 /* the chain should have been allocated with enough space by ip6_sooptmcopyin() */
6303 if (m != NULL) {
6304 panic("soopt_mcopyin");
6305 /* NOTREACHED */
6306 }
6307 return 0;
6308 }
6309
6310 /* copyout mbuf chain data into soopt */
6311 int
6312 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6313 {
6314 struct mbuf *m0 = m;
6315 size_t valsize = 0;
6316
6317 if (sopt->sopt_val == USER_ADDR_NULL) {
6318 return 0;
6319 }
6320 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6321 if (sopt->sopt_p != kernproc) {
6322 int error;
6323
6324 error = copyout(mtod(m, char *), sopt->sopt_val,
6325 m->m_len);
6326 if (error != 0) {
6327 m_freem(m0);
6328 return error;
6329 }
6330 } else {
6331 bcopy(mtod(m, char *),
6332 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6333 }
6334 sopt->sopt_valsize -= m->m_len;
6335 sopt->sopt_val += m->m_len;
6336 valsize += m->m_len;
6337 m = m->m_next;
6338 }
6339 if (m != NULL) {
6340 /* a large enough sockopt buffer should have been supplied by user-land */
6341 m_freem(m0);
6342 return EINVAL;
6343 }
6344 sopt->sopt_valsize = valsize;
6345 return 0;
6346 }
6347
6348 void
6349 sohasoutofband(struct socket *so)
6350 {
6351 if (so->so_pgid < 0) {
6352 gsignal(-so->so_pgid, SIGURG);
6353 } else if (so->so_pgid > 0) {
6354 proc_signal(so->so_pgid, SIGURG);
6355 }
6356 selwakeup(&so->so_rcv.sb_sel);
6357 if (so->so_rcv.sb_flags & SB_KNOTE) {
6358 KNOTE(&so->so_rcv.sb_sel.si_note,
6359 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6360 }
6361 }
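
/*
 * so_pgid is normally set from user space with fcntl(F_SETOWN) (or
 * ioctl(SIOCSPGRP)). A sketch of a client arranging to receive SIGURG
 * and pulling the out-of-band byte with MSG_OOB:
 */
#if 0
#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static volatile sig_atomic_t got_urg;

static void
on_sigurg(int sig)
{
	got_urg = 1;
}

static void
arm_oob(int fd)
{
	char oob;

	signal(SIGURG, on_sigurg);
	fcntl(fd, F_SETOWN, getpid());	/* direct SIGURG at this process */
	while (!got_urg) {
		pause();
	}
	recv(fd, &oob, 1, MSG_OOB);	/* fetch the out-of-band byte */
}
#endif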
6362
6363 int
6364 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6365 {
6366 #pragma unused(cred)
6367 struct proc *p = current_proc();
6368 int revents = 0;
6369
6370 socket_lock(so, 1);
6371 so_update_last_owner_locked(so, PROC_NULL);
6372 so_update_policy(so);
6373
6374 if (events & (POLLIN | POLLRDNORM)) {
6375 if (soreadable(so)) {
6376 revents |= events & (POLLIN | POLLRDNORM);
6377 }
6378 }
6379
6380 if (events & (POLLOUT | POLLWRNORM)) {
6381 if (sowriteable(so)) {
6382 revents |= events & (POLLOUT | POLLWRNORM);
6383 }
6384 }
6385
6386 if (events & (POLLPRI | POLLRDBAND)) {
6387 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6388 revents |= events & (POLLPRI | POLLRDBAND);
6389 }
6390 }
6391
6392 if (revents == 0) {
6393 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6394 /*
6395 * Darwin sets the flag first,
6396 * BSD calls selrecord first
6397 */
6398 so->so_rcv.sb_flags |= SB_SEL;
6399 selrecord(p, &so->so_rcv.sb_sel, wql);
6400 }
6401
6402 if (events & (POLLOUT | POLLWRNORM)) {
6403 /*
6404 * Darwin sets the flag first,
6405 * BSD calls selrecord first
6406 */
6407 so->so_snd.sb_flags |= SB_SEL;
6408 selrecord(p, &so->so_snd.sb_sel, wql);
6409 }
6410 }
6411
6412 socket_unlock(so, 1);
6413 return revents;
6414 }
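
/*
 * From user space this path is reached through select(2)/poll(2);
 * POLLPRI and POLLRDBAND report the out-of-band mark tested above.
 * A minimal sketch:
 */
#if 0
#include <poll.h>

static int
wait_readable_or_oob(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	if (poll(&pfd, 1, -1 /* no timeout */) == -1) {
		return -1;
	}
	return pfd.revents;	/* POLLIN and/or POLLPRI as computed above */
}
#endif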
6415
6416 int
6417 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6418 {
6419 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6420 int result;
6421
6422 socket_lock(so, 1);
6423 so_update_last_owner_locked(so, PROC_NULL);
6424 so_update_policy(so);
6425
6426 #if CONFIG_MACF_SOCKET
6427 proc_t p = knote_get_kq(kn)->kq_p;
6428 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6429 socket_unlock(so, 1);
6430 knote_set_error(kn, EPERM);
6431 return 0;
6432 }
6433 #endif /* CONFIG_MACF_SOCKET */
6434
6435 switch (kn->kn_filter) {
6436 case EVFILT_READ:
6437 kn->kn_filtid = EVFILTID_SOREAD;
6438 break;
6439 case EVFILT_WRITE:
6440 kn->kn_filtid = EVFILTID_SOWRITE;
6441 break;
6442 case EVFILT_SOCK:
6443 kn->kn_filtid = EVFILTID_SCK;
6444 break;
6445 case EVFILT_EXCEPT:
6446 kn->kn_filtid = EVFILTID_SOEXCEPT;
6447 break;
6448 default:
6449 socket_unlock(so, 1);
6450 knote_set_error(kn, EINVAL);
6451 return 0;
6452 }
6453
6454 /*
6455 * call the appropriate sub-filter attach
6456 * with the socket still locked
6457 */
6458 result = knote_fops(kn)->f_attach(kn, kev);
6459
6460 socket_unlock(so, 1);
6461
6462 return result;
6463 }
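
/*
 * Every kevent filter registered on a socket lands here and is routed
 * to the matching sub-filter. A minimal user-space sketch registering
 * EVFILT_READ (dispatched to EVFILTID_SOREAD above):
 */
#if 0
#include <sys/event.h>

static int
register_read_filter(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif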
6464
6465 static int
6466 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6467 {
6468 int retval = 0;
6469 int64_t data = 0;
6470
6471 if (so->so_options & SO_ACCEPTCONN) {
6472 /*
6473 * Radar 6615193: handle the listen case dynamically
6474 * for the kqueue read filter. This allows calling listen()
6475 * after registering the kqueue EVFILT_READ.
6476 */
6477
6478 retval = !TAILQ_EMPTY(&so->so_comp);
6479 data = so->so_qlen;
6480 goto out;
6481 }
6482
6483 /* socket isn't a listener */
6484 /*
6485 * NOTE_LOWAT specifies new low water mark in data, i.e.
6486 * the bytes of protocol data. We therefore exclude any
6487 * control bytes.
6488 */
6489 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6490
6491 if (kn->kn_sfflags & NOTE_OOB) {
6492 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6493 kn->kn_fflags |= NOTE_OOB;
6494 data -= so->so_oobmark;
6495 retval = 1;
6496 goto out;
6497 }
6498 }
6499
6500 if ((so->so_state & SS_CANTRCVMORE)
6501 #if CONTENT_FILTER
6502 && cfil_sock_data_pending(&so->so_rcv) == 0
6503 #endif /* CONTENT_FILTER */
6504 ) {
6505 kn->kn_flags |= EV_EOF;
6506 kn->kn_fflags = so->so_error;
6507 retval = 1;
6508 goto out;
6509 }
6510
6511 if (so->so_error) { /* temporary udp error */
6512 retval = 1;
6513 goto out;
6514 }
6515
6516 int64_t lowwat = so->so_rcv.sb_lowat;
6517 /*
6518 * Ensure that when NOTE_LOWAT is used, the derived
6519 * low water mark is bounded by socket's rcv buf's
6520 * high and low water mark values.
6521 */
6522 if (kn->kn_sfflags & NOTE_LOWAT) {
6523 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6524 lowwat = so->so_rcv.sb_hiwat;
6525 } else if (kn->kn_sdata > lowwat) {
6526 lowwat = kn->kn_sdata;
6527 }
6528 }
6529
6530 retval = (data >= lowwat);
6531
6532 out:
6533 if (retval && kev) {
6534 knote_fill_kevent(kn, kev, data);
6535 }
6536 return retval;
6537 }
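
/*
 * The clamping above means a NOTE_LOWAT request is only honored within
 * the receive buffer's [sb_lowat, sb_hiwat] range. A sketch of asking
 * for readability only once at least 16KB is queued:
 */
#if 0
#include <sys/event.h>

static int
register_read_lowat(int kq, int fd)
{
	struct kevent kev;

	/* the data argument (16KB) becomes kn_sdata above */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 16 * 1024, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif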
6538
6539 static int
6540 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6541 {
6542 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6543
6544 /* socket locked */
6545
6546 /*
6547 * If the caller explicitly asked for OOB results (e.g. poll())
6548 * from EVFILT_READ, then save that off in the kn_hook32 field
6549 * and reserve the kn_flags EV_OOBAND bit for output only.
6550 */
6551 if (kn->kn_filter == EVFILT_READ &&
6552 kn->kn_flags & EV_OOBAND) {
6553 kn->kn_flags &= ~EV_OOBAND;
6554 kn->kn_hook32 = EV_OOBAND;
6555 } else {
6556 kn->kn_hook32 = 0;
6557 }
6558 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6559 so->so_rcv.sb_flags |= SB_KNOTE;
6560 }
6561
6562 /* indicate if event is already fired */
6563 return filt_soread_common(kn, NULL, so);
6564 }
6565
6566 static void
6567 filt_sordetach(struct knote *kn)
6568 {
6569 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6570
6571 socket_lock(so, 1);
6572 if (so->so_rcv.sb_flags & SB_KNOTE) {
6573 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6574 so->so_rcv.sb_flags &= ~SB_KNOTE;
6575 }
6576 }
6577 socket_unlock(so, 1);
6578 }
6579
6580 /*ARGSUSED*/
6581 static int
6582 filt_soread(struct knote *kn, long hint)
6583 {
6584 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6585 int retval;
6586
6587 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6588 socket_lock(so, 1);
6589 }
6590
6591 retval = filt_soread_common(kn, NULL, so);
6592
6593 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6594 socket_unlock(so, 1);
6595 }
6596
6597 return retval;
6598 }
6599
6600 static int
6601 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6602 {
6603 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6604 int retval;
6605
6606 socket_lock(so, 1);
6607
6608 /* save off the new input fflags and data */
6609 kn->kn_sfflags = kev->fflags;
6610 kn->kn_sdata = kev->data;
6611
6612 /* determine if changes result in fired events */
6613 retval = filt_soread_common(kn, NULL, so);
6614
6615 socket_unlock(so, 1);
6616
6617 return retval;
6618 }
6619
6620 static int
6621 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6622 {
6623 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6624 int retval;
6625
6626 socket_lock(so, 1);
6627 retval = filt_soread_common(kn, kev, so);
6628 socket_unlock(so, 1);
6629
6630 return retval;
6631 }
6632
6633 int
6634 so_wait_for_if_feedback(struct socket *so)
6635 {
6636 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6637 (so->so_state & SS_ISCONNECTED)) {
6638 struct inpcb *inp = sotoinpcb(so);
6639 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6640 return 1;
6641 }
6642 }
6643 return 0;
6644 }
6645
6646 static int
6647 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6648 {
6649 int ret = 0;
6650 int64_t data = sbspace(&so->so_snd);
6651
6652 if (so->so_state & SS_CANTSENDMORE) {
6653 kn->kn_flags |= EV_EOF;
6654 kn->kn_fflags = so->so_error;
6655 ret = 1;
6656 goto out;
6657 }
6658
6659 if (so->so_error) { /* temporary udp error */
6660 ret = 1;
6661 goto out;
6662 }
6663
6664 if (!socanwrite(so)) {
6665 ret = 0;
6666 goto out;
6667 }
6668
6669 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6670 ret = 1;
6671 goto out;
6672 }
6673
6674 int64_t lowwat = so->so_snd.sb_lowat;
6675
6676 if (kn->kn_sfflags & NOTE_LOWAT) {
6677 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6678 lowwat = so->so_snd.sb_hiwat;
6679 } else if (kn->kn_sdata > lowwat) {
6680 lowwat = kn->kn_sdata;
6681 }
6682 }
6683
6684 if (data >= lowwat) {
6685 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6686 #if (DEBUG || DEVELOPMENT)
6687 && so_notsent_lowat_check == 1
6688 #endif /* DEBUG || DEVELOPMENT */
6689 ) {
6690 if ((SOCK_DOM(so) == PF_INET ||
6691 SOCK_DOM(so) == PF_INET6) &&
6692 so->so_type == SOCK_STREAM) {
6693 ret = tcp_notsent_lowat_check(so);
6694 }
6695 #if MPTCP
6696 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6697 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6698 ret = mptcp_notsent_lowat_check(so);
6699 }
6700 #endif
6701 else {
6702 ret = 1;
6703 goto out;
6704 }
6705 } else {
6706 ret = 1;
6707 }
6708 }
6709 if (so_wait_for_if_feedback(so)) {
6710 ret = 0;
6711 }
6712
6713 out:
6714 if (ret && kev) {
6715 knote_fill_kevent(kn, kev, data);
6716 }
6717 return ret;
6718 }
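
/*
 * SOF_NOTSENT_LOWAT is set through the public TCP_NOTSENT_LOWAT
 * option; with it, writability (tcp_notsent_lowat_check() above)
 * means "fewer than this many unsent bytes", not merely send-buffer
 * space. A user-space sketch:
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
set_notsent_lowat(int fd)
{
	int lowat = 8 * 1024;	/* report writable below 8KB unsent */

	return setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &lowat, sizeof(lowat));
}
#endif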
6719
6720 static int
6721 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6722 {
6723 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6724
6725 /* socket locked */
6726 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6727 so->so_snd.sb_flags |= SB_KNOTE;
6728 }
6729
6730 /* determine if it has already fired */
6731 return filt_sowrite_common(kn, NULL, so);
6732 }
6733
6734 static void
6735 filt_sowdetach(struct knote *kn)
6736 {
6737 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6738 socket_lock(so, 1);
6739
6740 if (so->so_snd.sb_flags & SB_KNOTE) {
6741 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6742 so->so_snd.sb_flags &= ~SB_KNOTE;
6743 }
6744 }
6745 socket_unlock(so, 1);
6746 }
6747
6748 /*ARGSUSED*/
6749 static int
6750 filt_sowrite(struct knote *kn, long hint)
6751 {
6752 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6753 int ret;
6754
6755 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6756 socket_lock(so, 1);
6757 }
6758
6759 ret = filt_sowrite_common(kn, NULL, so);
6760
6761 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6762 socket_unlock(so, 1);
6763 }
6764
6765 return ret;
6766 }
6767
6768 static int
6769 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6770 {
6771 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6772 int ret;
6773
6774 socket_lock(so, 1);
6775
6776 /* save off the new input fflags and data */
6777 kn->kn_sfflags = kev->fflags;
6778 kn->kn_sdata = kev->data;
6779
6780 /* determine if these changes result in a triggered event */
6781 ret = filt_sowrite_common(kn, NULL, so);
6782
6783 socket_unlock(so, 1);
6784
6785 return ret;
6786 }
6787
6788 static int
6789 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6790 {
6791 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6792 int ret;
6793
6794 socket_lock(so, 1);
6795 ret = filt_sowrite_common(kn, kev, so);
6796 socket_unlock(so, 1);
6797
6798 return ret;
6799 }
6800
6801 static int
6802 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6803 struct socket *so, long ev_hint)
6804 {
6805 int ret = 0;
6806 int64_t data = 0;
6807 uint32_t level_trigger = 0;
6808
6809 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6810 kn->kn_fflags |= NOTE_CONNRESET;
6811 }
6812 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6813 kn->kn_fflags |= NOTE_TIMEOUT;
6814 }
6815 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6816 kn->kn_fflags |= NOTE_NOSRCADDR;
6817 }
6818 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6819 kn->kn_fflags |= NOTE_IFDENIED;
6820 }
6821 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6822 kn->kn_fflags |= NOTE_KEEPALIVE;
6823 }
6824 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6825 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6826 }
6827 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6828 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6829 }
6830 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6831 (so->so_state & SS_ISCONNECTED)) {
6832 kn->kn_fflags |= NOTE_CONNECTED;
6833 level_trigger |= NOTE_CONNECTED;
6834 }
6835 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6836 (so->so_state & SS_ISDISCONNECTED)) {
6837 kn->kn_fflags |= NOTE_DISCONNECTED;
6838 level_trigger |= NOTE_DISCONNECTED;
6839 }
6840 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6841 if (so->so_proto != NULL &&
6842 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6843 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6844 }
6845 }
6846
6847 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6848 tcp_notify_ack_active(so)) {
6849 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6850 }
6851
6852 if ((so->so_state & SS_CANTRCVMORE)
6853 #if CONTENT_FILTER
6854 && cfil_sock_data_pending(&so->so_rcv) == 0
6855 #endif /* CONTENT_FILTER */
6856 ) {
6857 kn->kn_fflags |= NOTE_READCLOSED;
6858 level_trigger |= NOTE_READCLOSED;
6859 }
6860
6861 if (so->so_state & SS_CANTSENDMORE) {
6862 kn->kn_fflags |= NOTE_WRITECLOSED;
6863 level_trigger |= NOTE_WRITECLOSED;
6864 }
6865
6866 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6867 (so->so_flags & SOF_SUSPENDED)) {
6868 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6869
6870 /* If resume event was delivered before, reset it */
6871 kn->kn_hook32 &= ~NOTE_RESUME;
6872
6873 kn->kn_fflags |= NOTE_SUSPEND;
6874 level_trigger |= NOTE_SUSPEND;
6875 }
6876
6877 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6878 (so->so_flags & SOF_SUSPENDED) == 0) {
6879 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6880
6881 /* If suspend event was delivered before, reset it */
6882 kn->kn_hook32 &= ~NOTE_SUSPEND;
6883
6884 kn->kn_fflags |= NOTE_RESUME;
6885 level_trigger |= NOTE_RESUME;
6886 }
6887
6888 if (so->so_error != 0) {
6889 ret = 1;
6890 data = so->so_error;
6891 kn->kn_flags |= EV_EOF;
6892 } else {
6893 u_int32_t data32;
6894 get_sockev_state(so, &data32);
6895 data = data32;
6896 }
6897
6898 /* Reset any events that are not requested on this knote */
6899 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6900 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6901
6902 /* Find the level triggered events that are already delivered */
6903 level_trigger &= kn->kn_hook32;
6904 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6905
6906 /* Do not deliver level triggered events more than once */
6907 if ((kn->kn_fflags & ~level_trigger) != 0) {
6908 ret = 1;
6909 }
6910
6911 if (ret && kev) {
6912 /*
6913 * Store the state of the events being delivered. This
6914 * state can be used to deliver level triggered events
6915 * at least once and still avoid waking up the application
6916 * multiple times as long as the event is active.
6917 */
6918 if (kn->kn_fflags != 0) {
6919 kn->kn_hook32 |= (kn->kn_fflags &
6920 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6921 }
6922
6923 /*
6924 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6925 * only one of them, and remember which one was
6926 * delivered last
6927 */
6928 if (kn->kn_fflags & NOTE_SUSPEND) {
6929 kn->kn_hook32 &= ~NOTE_RESUME;
6930 }
6931 if (kn->kn_fflags & NOTE_RESUME) {
6932 kn->kn_hook32 &= ~NOTE_SUSPEND;
6933 }
6934
6935 knote_fill_kevent(kn, kev, data);
6936 }
6937 return ret;
6938 }
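
/*
 * A sketch of watching connection state from user space, assuming the
 * caller has access to the private EVFILT_SOCK filter and its NOTE_*
 * constants from xnu's <sys/event.h>:
 */
#if 0
#include <sys/event.h>

static int
register_sock_filter(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD,
	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_READCLOSED, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif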
6939
6940 static int
6941 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6942 {
6943 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6944
6945 /* socket locked */
6946 kn->kn_hook32 = 0;
6947 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6948 so->so_flags |= SOF_KNOTE;
6949 }
6950
6951 /* determine if event already fired */
6952 return filt_sockev_common(kn, NULL, so, 0);
6953 }
6954
6955 static void
6956 filt_sockdetach(struct knote *kn)
6957 {
6958 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6959 socket_lock(so, 1);
6960
6961 if ((so->so_flags & SOF_KNOTE) != 0) {
6962 if (KNOTE_DETACH(&so->so_klist, kn)) {
6963 so->so_flags &= ~SOF_KNOTE;
6964 }
6965 }
6966 socket_unlock(so, 1);
6967 }
6968
6969 static int
6970 filt_sockev(struct knote *kn, long hint)
6971 {
6972 int ret = 0, locked = 0;
6973 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6974 long ev_hint = (hint & SO_FILT_HINT_EV);
6975
6976 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6977 socket_lock(so, 1);
6978 locked = 1;
6979 }
6980
6981 ret = filt_sockev_common(kn, NULL, so, ev_hint);
6982
6983 if (locked) {
6984 socket_unlock(so, 1);
6985 }
6986
6987 return ret;
6988 }
6989
6990
6991
6992 /*
6993 * filt_socktouch - update event state
6994 */
6995 static int
6996 filt_socktouch(
6997 struct knote *kn,
6998 struct kevent_qos_s *kev)
6999 {
7000 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7001 uint32_t changed_flags;
7002 int ret;
7003
7004 socket_lock(so, 1);
7005
7006 /* save off the [result] data and fflags */
7007 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7008
7009 /* save off the new input fflags and data */
7010 kn->kn_sfflags = kev->fflags;
7011 kn->kn_sdata = kev->data;
7012
7013 /* restrict the current results to the (smaller?) set of new interest */
7014 /*
7015 * For compatibility with previous implementations, we leave kn_fflags
7016 * as they were before.
7017 */
7018 //kn->kn_fflags &= kev->fflags;
7019
7020 /*
7021 * Since we keep track of events that are already
7022 * delivered, if any of those events are not requested
7023 * anymore the state related to them can be reset
7024 */
7025 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7026
7027 /* determine if we have events to deliver */
7028 ret = filt_sockev_common(kn, NULL, so, 0);
7029
7030 socket_unlock(so, 1);
7031
7032 return ret;
7033 }
7034
7035 /*
7036 * filt_sockprocess - query event fired state and return data
7037 */
7038 static int
7039 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7040 {
7041 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7042 int ret = 0;
7043
7044 socket_lock(so, 1);
7045
7046 ret = filt_sockev_common(kn, kev, so, 0);
7047
7048 socket_unlock(so, 1);
7049
7050 return ret;
7051 }
7052
7053 void
7054 get_sockev_state(struct socket *so, u_int32_t *statep)
7055 {
7056 u_int32_t state = *(statep);
7057
7058 /*
7059 * If the state variable has already been set by a previous
7060 * event, leave it as is.
7061 */
7062 if (state != 0) {
7063 return;
7064 }
7065
7066 if (so->so_state & SS_ISCONNECTED) {
7067 state |= SOCKEV_CONNECTED;
7068 } else {
7069 state &= ~(SOCKEV_CONNECTED);
7070 }
7071 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7072 *(statep) = state;
7073 }
7074
7075 #define SO_LOCK_HISTORY_STR_LEN \
7076 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7077
7078 __private_extern__ const char *
7079 solockhistory_nr(struct socket *so)
7080 {
7081 size_t n = 0;
7082 int i;
7083 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7084
7085 bzero(lock_history_str, sizeof(lock_history_str));
7086 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7087 n += snprintf(lock_history_str + n,
7088 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7089 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7090 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7091 }
7092 return lock_history_str;
7093 }
7094
7095 lck_mtx_t *
7096 socket_getlock(struct socket *so, int flags)
7097 {
7098 if (so->so_proto->pr_getlock != NULL) {
7099 return (*so->so_proto->pr_getlock)(so, flags);
7100 } else {
7101 return so->so_proto->pr_domain->dom_mtx;
7102 }
7103 }
7104
7105 void
7106 socket_lock(struct socket *so, int refcount)
7107 {
7108 void *lr_saved;
7109
7110 lr_saved = __builtin_return_address(0);
7111
7112 if (so->so_proto->pr_lock) {
7113 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7114 } else {
7115 #ifdef MORE_LOCKING_DEBUG
7116 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7117 LCK_MTX_ASSERT_NOTOWNED);
7118 #endif
7119 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7120 if (refcount) {
7121 so->so_usecount++;
7122 }
7123 so->lock_lr[so->next_lock_lr] = lr_saved;
7124 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7125 }
7126 }
7127
7128 void
7129 socket_lock_assert_owned(struct socket *so)
7130 {
7131 lck_mtx_t *mutex_held;
7132
7133 if (so->so_proto->pr_getlock != NULL) {
7134 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7135 } else {
7136 mutex_held = so->so_proto->pr_domain->dom_mtx;
7137 }
7138
7139 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7140 }
7141
7142 int
7143 socket_try_lock(struct socket *so)
7144 {
7145 lck_mtx_t *mtx;
7146
7147 if (so->so_proto->pr_getlock != NULL) {
7148 mtx = (*so->so_proto->pr_getlock)(so, 0);
7149 } else {
7150 mtx = so->so_proto->pr_domain->dom_mtx;
7151 }
7152
7153 return lck_mtx_try_lock(mtx);
7154 }
7155
7156 void
7157 socket_unlock(struct socket *so, int refcount)
7158 {
7159 void *lr_saved;
7160 lck_mtx_t *mutex_held;
7161
7162 lr_saved = __builtin_return_address(0);
7163
7164 if (so == NULL || so->so_proto == NULL) {
7165 panic("%s: null so_proto so=%p\n", __func__, so);
7166 /* NOTREACHED */
7167 }
7168
7169 if (so->so_proto->pr_unlock) {
7170 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7171 } else {
7172 mutex_held = so->so_proto->pr_domain->dom_mtx;
7173 #ifdef MORE_LOCKING_DEBUG
7174 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7175 #endif
7176 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7177 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7178
7179 if (refcount) {
7180 if (so->so_usecount <= 0) {
7181 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7182 "lrh=%s", __func__, so->so_usecount, so,
7183 SOCK_DOM(so), so->so_type,
7184 SOCK_PROTO(so), solockhistory_nr(so));
7185 /* NOTREACHED */
7186 }
7187
7188 so->so_usecount--;
7189 if (so->so_usecount == 0) {
7190 sofreelastref(so, 1);
7191 }
7192 }
7193 lck_mtx_unlock(mutex_held);
7194 }
7195 }
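
/*
 * The usual discipline, sketched: a nonzero refcount argument pairs
 * the lock with a use count, so the socket cannot be freed while it
 * is being examined, and the matching unlock may drop the last
 * reference via sofreelastref().
 */
#if 0
	socket_lock(so, 1);	/* lock and take a use count */
	/* ... inspect or modify socket state ... */
	socket_unlock(so, 1);	/* drop the use count; may free on last ref */
#endif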
7196
7197 /* Called with socket locked, will unlock socket */
7198 void
7199 sofree(struct socket *so)
7200 {
7201 lck_mtx_t *mutex_held;
7202
7203 if (so->so_proto->pr_getlock != NULL) {
7204 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7205 } else {
7206 mutex_held = so->so_proto->pr_domain->dom_mtx;
7207 }
7208 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7209
7210 sofreelastref(so, 0);
7211 }
7212
7213 void
7214 soreference(struct socket *so)
7215 {
7216 socket_lock(so, 1); /* lock and take one reference on the socket */
7217 socket_unlock(so, 0); /* unlock only */
7218 }
7219
7220 void
7221 sodereference(struct socket *so)
7222 {
7223 socket_lock(so, 0);
7224 socket_unlock(so, 1);
7225 }
7226
7227 /*
7228 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7229 * possibility of using jumbo clusters. The caller must hold
7230 * the socket lock.
7231 */
7232 void
7233 somultipages(struct socket *so, boolean_t set)
7234 {
7235 if (set) {
7236 so->so_flags |= SOF_MULTIPAGES;
7237 } else {
7238 so->so_flags &= ~SOF_MULTIPAGES;
7239 }
7240 }
7241
7242 void
7243 soif2kcl(struct socket *so, boolean_t set)
7244 {
7245 if (set) {
7246 so->so_flags1 |= SOF1_IF_2KCL;
7247 } else {
7248 so->so_flags1 &= ~SOF1_IF_2KCL;
7249 }
7250 }
7251
7252 int
7253 so_isdstlocal(struct socket *so)
7254 {
7255 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7256
7257 if (SOCK_DOM(so) == PF_INET) {
7258 return inaddr_local(inp->inp_faddr);
7259 } else if (SOCK_DOM(so) == PF_INET6) {
7260 return in6addr_local(&inp->in6p_faddr);
7261 }
7262
7263 return 0;
7264 }
7265
7266 int
7267 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7268 {
7269 struct sockbuf *rcv, *snd;
7270 int err = 0, defunct;
7271
7272 rcv = &so->so_rcv;
7273 snd = &so->so_snd;
7274
7275 defunct = (so->so_flags & SOF_DEFUNCT);
7276 if (defunct) {
7277 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7278 panic("%s: SB_DROP not set", __func__);
7279 /* NOTREACHED */
7280 }
7281 goto done;
7282 }
7283
7284 if (so->so_flags & SOF_NODEFUNCT) {
7285 if (noforce) {
7286 err = EOPNOTSUPP;
7287 if (p != PROC_NULL) {
7288 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7289 "name %s level %d) so 0x%llx [%d,%d] "
7290 "is not eligible for defunct "
7291 "(%d)\n", __func__, proc_selfpid(),
7292 proc_best_name(current_proc()), proc_pid(p),
7293 proc_best_name(p), level,
7294 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7295 SOCK_DOM(so), SOCK_TYPE(so), err);
7296 }
7297 return err;
7298 }
7299 so->so_flags &= ~SOF_NODEFUNCT;
7300 if (p != PROC_NULL) {
7301 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7302 "name %s level %d) so 0x%llx [%d,%d] "
7303 "defunct by force "
7304 "(%d)\n", __func__, proc_selfpid(),
7305 proc_best_name(current_proc()), proc_pid(p),
7306 proc_best_name(p), level,
7307 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7308 SOCK_DOM(so), SOCK_TYPE(so), err);
7309 }
7310 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7311 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7312 struct ifnet *ifp = inp->inp_last_outifp;
7313
7314 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7315 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7316 } else if (so->so_flags & SOF_DELEGATED) {
7317 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7318 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7319 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7320 } else if (noforce && p != PROC_NULL) {
7321 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7322
7323 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7324 so->so_extended_bk_start = net_uptime();
7325 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7326
7327 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7328
7329 err = EOPNOTSUPP;
7330 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7331 "name %s level %d) so 0x%llx [%d,%d] "
7332 "extend bk idle "
7333 "(%d)\n", __func__, proc_selfpid(),
7334 proc_best_name(current_proc()), proc_pid(p),
7335 proc_best_name(p), level,
7336 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7337 SOCK_DOM(so), SOCK_TYPE(so), err);
7338 return err;
7339 } else {
7340 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7341 }
7342 }
7343
7344 so->so_flags |= SOF_DEFUNCT;
7345
7346 /* Prevent further data from being appended to the socket buffers */
7347 snd->sb_flags |= SB_DROP;
7348 rcv->sb_flags |= SB_DROP;
7349
7350 /* Flush any existing data in the socket buffers */
7351 if (rcv->sb_cc != 0) {
7352 rcv->sb_flags &= ~SB_SEL;
7353 selthreadclear(&rcv->sb_sel);
7354 sbrelease(rcv);
7355 }
7356 if (snd->sb_cc != 0) {
7357 snd->sb_flags &= ~SB_SEL;
7358 selthreadclear(&snd->sb_sel);
7359 sbrelease(snd);
7360 }
7361
7362 done:
7363 if (p != PROC_NULL) {
7364 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7365 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7366 proc_selfpid(), proc_best_name(current_proc()),
7367 proc_pid(p), proc_best_name(p), level,
7368 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7369 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7370 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7371 " extbkidle" : "");
7372 }
7373 return err;
7374 }
7375
7376 int
7377 sodefunct(struct proc *p, struct socket *so, int level)
7378 {
7379 struct sockbuf *rcv, *snd;
7380
7381 if (!(so->so_flags & SOF_DEFUNCT)) {
7382 panic("%s improperly called", __func__);
7383 /* NOTREACHED */
7384 }
7385 if (so->so_state & SS_DEFUNCT) {
7386 goto done;
7387 }
7388
7389 rcv = &so->so_rcv;
7390 snd = &so->so_snd;
7391
7392 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7393 char s[MAX_IPv6_STR_LEN];
7394 char d[MAX_IPv6_STR_LEN];
7395 struct inpcb *inp = sotoinpcb(so);
7396
7397 if (p != PROC_NULL) {
7398 SODEFUNCTLOG(
7399 "%s[%d, %s]: (target pid %d name %s level %d) "
7400 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7401 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7402 " snd_fl 0x%x]\n", __func__,
7403 proc_selfpid(), proc_best_name(current_proc()),
7404 proc_pid(p), proc_best_name(p), level,
7405 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7406 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7407 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7408 (void *)&inp->inp_laddr.s_addr :
7409 (void *)&inp->in6p_laddr),
7410 s, sizeof(s)), ntohs(inp->in6p_lport),
7411 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7412 (void *)&inp->inp_faddr.s_addr :
7413 (void *)&inp->in6p_faddr,
7414 d, sizeof(d)), ntohs(inp->in6p_fport),
7415 (uint32_t)rcv->sb_sel.si_flags,
7416 (uint32_t)snd->sb_sel.si_flags,
7417 rcv->sb_flags, snd->sb_flags);
7418 }
7419 } else if (p != PROC_NULL) {
7420 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7421 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7422 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7423 proc_selfpid(), proc_best_name(current_proc()),
7424 proc_pid(p), proc_best_name(p), level,
7425 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7426 SOCK_DOM(so), SOCK_TYPE(so),
7427 (uint32_t)rcv->sb_sel.si_flags,
7428 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7429 snd->sb_flags);
7430 }
7431
7432 /*
7433 * Unwedge threads blocked on sbwait() and sb_lock().
7434 */
7435 sbwakeup(rcv);
7436 sbwakeup(snd);
7437
7438 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7439 if (rcv->sb_flags & SB_LOCK) {
7440 sbunlock(rcv, TRUE); /* keep socket locked */
7441 }
7442 if (snd->sb_flags & SB_LOCK) {
7443 sbunlock(snd, TRUE); /* keep socket locked */
7444 }
7445 /*
7446 * Flush the buffers and disconnect. We explicitly call shutdown
7447 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7448 * states are set for the socket. This would also flush out data
7449 * hanging off the receive list of this socket.
7450 */
7451 (void) soshutdownlock_final(so, SHUT_RD);
7452 (void) soshutdownlock_final(so, SHUT_WR);
7453 (void) sodisconnectlocked(so);
7454
7455 /*
7456 * Explicitly handle connectionless-protocol disconnection
7457 * and release any remaining data in the socket buffers.
7458 */
7459 if (!(so->so_state & SS_ISDISCONNECTED)) {
7460 (void) soisdisconnected(so);
7461 }
7462
7463 if (so->so_error == 0) {
7464 so->so_error = EBADF;
7465 }
7466
7467 if (rcv->sb_cc != 0) {
7468 rcv->sb_flags &= ~SB_SEL;
7469 selthreadclear(&rcv->sb_sel);
7470 sbrelease(rcv);
7471 }
7472 if (snd->sb_cc != 0) {
7473 snd->sb_flags &= ~SB_SEL;
7474 selthreadclear(&snd->sb_sel);
7475 sbrelease(snd);
7476 }
7477 so->so_state |= SS_DEFUNCT;
7478 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7479
7480 done:
7481 return 0;
7482 }
7483
7484 int
7485 soresume(struct proc *p, struct socket *so, int locked)
7486 {
7487 if (locked == 0) {
7488 socket_lock(so, 1);
7489 }
7490
7491 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7492 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7493 "[%d,%d] resumed from bk idle\n",
7494 __func__, proc_selfpid(), proc_best_name(current_proc()),
7495 proc_pid(p), proc_best_name(p),
7496 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7497 SOCK_DOM(so), SOCK_TYPE(so));
7498
7499 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7500 so->so_extended_bk_start = 0;
7501 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7502
7503 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7504 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7505 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7506 }
7507 if (locked == 0) {
7508 socket_unlock(so, 1);
7509 }
7510
7511 return 0;
7512 }
7513
7514 /*
7515 * Does not attempt to account for sockets that are delegated from
7516 * the current process
7517 */
7518 int
7519 so_set_extended_bk_idle(struct socket *so, int optval)
7520 {
7521 int error = 0;
7522
7523 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7524 SOCK_PROTO(so) != IPPROTO_TCP) {
7525 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7526 error = EOPNOTSUPP;
7527 } else if (optval == 0) {
7528 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7529
7530 soresume(current_proc(), so, 1);
7531 } else {
7532 struct proc *p = current_proc();
7533 int i;
7534 struct filedesc *fdp;
7535 int count = 0;
7536
7537 /*
7538 * Unlock socket to avoid lock ordering issue with
7539 * the proc fd table lock
7540 */
7541 socket_unlock(so, 0);
7542
7543 proc_fdlock(p);
7544
7545 fdp = p->p_fd;
7546 for (i = 0; i < fdp->fd_nfiles; i++) {
7547 struct fileproc *fp = fdp->fd_ofiles[i];
7548 struct socket *so2;
7549
7550 if (fp == NULL ||
7551 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7552 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7553 continue;
7554 }
7555
7556 so2 = (struct socket *)fp->f_fglob->fg_data;
7557 if (so != so2 &&
7558 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7559 count++;
7560 }
7561 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7562 break;
7563 }
7564 }
7565 proc_fdunlock(p);
7566
7567 socket_lock(so, 0);
7568
7569 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7570 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7571 error = EBUSY;
7572 } else if (so->so_flags & SOF_DELEGATED) {
7573 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7574 error = EBUSY;
7575 } else {
7576 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7577 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7578 }
7579 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7580 "%s marked for extended bk idle\n",
7581 __func__, proc_selfpid(), proc_best_name(current_proc()),
7582 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7583 SOCK_DOM(so), SOCK_TYPE(so),
7584 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7585 "is" : "not");
7586 }
7587
7588 return error;
7589 }
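
/*
 * A sketch of the user-space side, assuming the caller has access to
 * the private SO_EXTENDED_BK_IDLE option constant:
 */
#if 0
#include <sys/socket.h>

static int
want_extended_bk_idle(int fd, int on)
{
	/* on == 0 clears the request and also resumes the socket */
	return setsockopt(fd, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
	    &on, sizeof(on));
}
#endif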
7590
7591 static void
7592 so_stop_extended_bk_idle(struct socket *so)
7593 {
7594 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7595 so->so_extended_bk_start = 0;
7596
7597 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7598 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7599 /*
7600 * Force defunct
7601 */
7602 sosetdefunct(current_proc(), so,
7603 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7604 if (so->so_flags & SOF_DEFUNCT) {
7605 sodefunct(current_proc(), so,
7606 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7607 }
7608 }
7609
7610 void
7611 so_drain_extended_bk_idle(struct socket *so)
7612 {
7613 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7614 /*
7615 * Only penalize sockets that have outstanding data
7616 */
7617 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7618 so_stop_extended_bk_idle(so);
7619
7620 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7621 }
7622 }
7623 }
7624
7625 /*
7626 * The return value tells whether the socket is still in extended background idle
7627 */
7628 int
7629 so_check_extended_bk_idle_time(struct socket *so)
7630 {
7631 int ret = 1;
7632
7633 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7634 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7635 __func__, proc_selfpid(), proc_best_name(current_proc()),
7636 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7637 SOCK_DOM(so), SOCK_TYPE(so));
7638 if (net_uptime() - so->so_extended_bk_start >
7639 soextbkidlestat.so_xbkidle_time) {
7640 so_stop_extended_bk_idle(so);
7641
7642 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7643
7644 ret = 0;
7645 } else {
7646 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7647
7648 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7649 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7650 }
7651 }
7652
7653 return ret;
7654 }
7655
7656 void
7657 resume_proc_sockets(proc_t p)
7658 {
7659 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7660 struct filedesc *fdp;
7661 int i;
7662
7663 proc_fdlock(p);
7664 fdp = p->p_fd;
7665 for (i = 0; i < fdp->fd_nfiles; i++) {
7666 struct fileproc *fp;
7667 struct socket *so;
7668
7669 fp = fdp->fd_ofiles[i];
7670 if (fp == NULL ||
7671 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7672 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7673 continue;
7674 }
7675
7676 so = (struct socket *)fp->f_fglob->fg_data;
7677 (void) soresume(p, so, 0);
7678 }
7679 proc_fdunlock(p);
7680
7681 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7682 }
7683 }
7684
7685 __private_extern__ int
7686 so_set_recv_anyif(struct socket *so, int optval)
7687 {
7688 int ret = 0;
7689
7690 #if INET6
7691 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7692 #else
7693 if (SOCK_DOM(so) == PF_INET) {
7694 #endif /* !INET6 */
7695 if (optval) {
7696 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7697 } else {
7698 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7699 }
7700 }
7701
7702
7703 return ret;
7704 }
7705
7706 __private_extern__ int
7707 so_get_recv_anyif(struct socket *so)
7708 {
7709 int ret = 0;
7710
7711 #if INET6
7712 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7713 #else
7714 if (SOCK_DOM(so) == PF_INET) {
7715 #endif /* !INET6 */
7716 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7717 }
7718
7719 return ret;
7720 }
7721
7722 int
7723 so_set_restrictions(struct socket *so, uint32_t vals)
7724 {
7725 int nocell_old, nocell_new;
7726 int noexpensive_old, noexpensive_new;
7727 int noconstrained_old, noconstrained_new;
7728
7729 /*
7730 * Deny-type restrictions are trapdoors; once set they cannot be
7731 * unset for the lifetime of the socket. This allows them to be
7732 * issued by a framework on behalf of the application without
7733 * having to worry that they can be undone.
7734 *
7735 * Note here that socket-level restrictions override any protocol-
7736 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7737 * restriction issued on the socket has a higher precedence
7738 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7739 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7740 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7741 */
7742 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7743 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7744 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7745 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7746 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7747 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7748 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7749 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7750 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7751
7752 /* we can only set, not clear restrictions */
7753 if ((nocell_new - nocell_old) == 0 &&
7754 (noexpensive_new - noexpensive_old) == 0 &&
7755 (noconstrained_new - noconstrained_old) == 0) {
7756 return 0;
7757 }
7758 #if INET6
7759 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7760 #else
7761 if (SOCK_DOM(so) == PF_INET) {
7762 #endif /* !INET6 */
7763 if (nocell_new - nocell_old != 0) {
7764 /*
7765 * if deny cellular is now set, do what's needed
7766 * for INPCB
7767 */
7768 inp_set_nocellular(sotoinpcb(so));
7769 }
7770 if (noexpensive_new - noexpensive_old != 0) {
7771 inp_set_noexpensive(sotoinpcb(so));
7772 }
7773 if (noconstrained_new - noconstrained_old != 0) {
7774 inp_set_noconstrained(sotoinpcb(so));
7775 }
7776 }
7777
7778 if (SOCK_DOM(so) == PF_MULTIPATH) {
7779 mptcp_set_restrictions(so);
7780 }
7781
7782 return 0;
7783 }
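
/*
 * A sketch of a framework applying a deny-cellular trapdoor on behalf
 * of an application, assuming the caller has access to the private
 * SO_RESTRICTIONS option and SO_RESTRICT_* flags:
 */
#if 0
#include <sys/socket.h>
#include <stdint.h>

static int
deny_cellular(int fd)
{
	uint32_t restrictions = SO_RESTRICT_DENY_CELLULAR;

	/* once set, this cannot be cleared for the socket's lifetime */
	return setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS,
	    &restrictions, sizeof(restrictions));
}
#endif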
7784
7785 uint32_t
7786 so_get_restrictions(struct socket *so)
7787 {
7788 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7789 SO_RESTRICT_DENY_OUT |
7790 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7791 }
7792
7793 int
7794 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7795 {
7796 struct proc *ep = PROC_NULL;
7797 int error = 0;
7798
7799 /* pid 0 is reserved for kernel */
7800 if (epid == 0) {
7801 error = EINVAL;
7802 goto done;
7803 }
7804
7805 /*
7806 * If this is an in-kernel socket, prevent its delegate
7807 * association from changing unless the socket option is
7808 * coming from within the kernel itself.
7809 */
7810 if (so->last_pid == 0 && p != kernproc) {
7811 error = EACCES;
7812 goto done;
7813 }
7814
7815 /*
7816 * If this is issued by a process that's recorded as the
7817 * real owner of the socket, or if the pid is the same as
7818 * the process's own pid, then proceed. Otherwise ensure
7819 * that the issuing process has the necessary privileges.
7820 */
7821 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7822 if ((error = priv_check_cred(kauth_cred_get(),
7823 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7824 error = EACCES;
7825 goto done;
7826 }
7827 }
7828
7829 /* Find the process that corresponds to the effective pid */
7830 if ((ep = proc_find(epid)) == PROC_NULL) {
7831 error = ESRCH;
7832 goto done;
7833 }
7834
7835 /*
7836 * If a process tries to delegate the socket to itself, then
7837 * there's really nothing to do; treat it as a way for the
7838 * delegate association to be cleared. Note that we check
7839 * the passed-in proc rather than calling proc_selfpid(),
7840 * as we need to check the process issuing the socket option
7841 * which could be kernproc. Given that we don't allow 0 for
7842 * effective pid, it means that a delegated in-kernel socket
7843 * stays delegated during its lifetime (which is probably OK.)
7844 */
7845 if (epid == proc_pid(p)) {
7846 so->so_flags &= ~SOF_DELEGATED;
7847 so->e_upid = 0;
7848 so->e_pid = 0;
7849 uuid_clear(so->e_uuid);
7850 } else {
7851 so->so_flags |= SOF_DELEGATED;
7852 so->e_upid = proc_uniqueid(ep);
7853 so->e_pid = proc_pid(ep);
7854 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7855 }
7856 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7857 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7858 }
7859 done:
7860 if (error == 0 && net_io_policy_log) {
7861 uuid_string_t buf;
7862
7863 uuid_unparse(so->e_uuid, buf);
7864 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7865 "euuid %s%s\n", __func__, proc_name_address(p),
7866 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7867 SOCK_DOM(so), SOCK_TYPE(so),
7868 so->e_pid, proc_name_address(ep), buf,
7869 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7870 } else if (error != 0 && net_io_policy_log) {
7871 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7872 "ERROR (%d)\n", __func__, proc_name_address(p),
7873 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7874 SOCK_DOM(so), SOCK_TYPE(so),
7875 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7876 proc_name_address(ep), error);
7877 }
7878
7879 /* Update this socket's policy upon success */
7880 if (error == 0) {
7881 so->so_policy_gencnt *= -1;
7882 so_update_policy(so);
7883 #if NECP
7884 so_update_necp_policy(so, NULL, NULL);
7885 #endif /* NECP */
7886 }
7887
7888 if (ep != PROC_NULL) {
7889 proc_rele(ep);
7890 }
7891
7892 return error;
7893 }
7894
7895 int
7896 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7897 {
7898 uuid_string_t buf;
7899 uuid_t uuid;
7900 int error = 0;
7901
7902 /* UUID must not be all-zeroes (reserved for kernel) */
7903 if (uuid_is_null(euuid)) {
7904 error = EINVAL;
7905 goto done;
7906 }
7907
7908 /*
7909 * If this is an in-kernel socket, prevent its delegate
7910 * association from changing unless the socket option is
7911 * coming from within the kernel itself.
7912 */
7913 if (so->last_pid == 0 && p != kernproc) {
7914 error = EACCES;
7915 goto done;
7916 }
7917
7918 /* Get the UUID of the issuing process */
7919 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7920
7921 /*
7922 * If this is issued by a process that's recorded as the
7923 * real owner of the socket, or if the uuid is the same as
7924 * the process's own uuid, then proceed. Otherwise ensure
7925 * that the issuing process has the necessary privileges.
7926 */
7927 if (check_cred &&
7928 (uuid_compare(euuid, so->last_uuid) != 0 ||
7929 uuid_compare(euuid, uuid) != 0)) {
7930 if ((error = priv_check_cred(kauth_cred_get(),
7931 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7932 error = EACCES;
7933 goto done;
7934 }
7935 }
7936
7937 /*
7938 * If a process tries to delegate the socket to itself, then
7939 * there's really nothing to do; treat it as a way for the
7940 * delegate association to be cleared. Note that we check
7941 * the uuid of the passed-in proc rather than that of the
7942 * current process, as we need to check the process issuing
7943 * the socket option which could be kernproc itself. Given
7944 * that we don't allow 0 for effective uuid, it means that
7945 * a delegated in-kernel socket stays delegated during its
7946 * lifetime (which is okay.)
7947 */
7948 if (uuid_compare(euuid, uuid) == 0) {
7949 so->so_flags &= ~SOF_DELEGATED;
7950 so->e_upid = 0;
7951 so->e_pid = 0;
7952 uuid_clear(so->e_uuid);
7953 } else {
7954 so->so_flags |= SOF_DELEGATED;
7955 /*
7956 * Unlike so_set_effective_pid(), we only have the UUID
7957 * here and the process ID is not known. Inherit the
7958 * real {pid,upid} of the socket.
7959 */
7960 so->e_upid = so->last_upid;
7961 so->e_pid = so->last_pid;
7962 uuid_copy(so->e_uuid, euuid);
7963 }
7964 /*
7965 * The following will clear the effective process name, as it is
7966 * the same as that of the real process
7967 */
7968 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7969 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7970 }
7971 done:
7972 if (error == 0 && net_io_policy_log) {
7973 uuid_unparse(so->e_uuid, buf);
7974 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7975 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7976 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7977 SOCK_TYPE(so), so->e_pid, buf,
7978 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7979 } else if (error != 0 && net_io_policy_log) {
7980 uuid_unparse(euuid, buf);
7981 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7982 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7983 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7984 SOCK_TYPE(so), buf, error);
7985 }
7986
7987 /* Update this socket's policy upon success */
7988 if (error == 0) {
7989 so->so_policy_gencnt *= -1;
7990 so_update_policy(so);
7991 #if NECP
7992 so_update_necp_policy(so, NULL, NULL);
7993 #endif /* NECP */
7994 }
7995
7996 return error;
7997 }
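
/*
 * A sketch of delegating a socket from user space, assuming the caller
 * has access to the private SO_DELEGATED / SO_DELEGATED_UUID option
 * constants; a caller that is not the socket's owner needs the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege checked above.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <uuid/uuid.h>

static int
delegate_socket_to_pid(int fd, pid_t epid)
{
	/* pid 0 is reserved for the kernel and is rejected above */
	return setsockopt(fd, SOL_SOCKET, SO_DELEGATED,
	    &epid, sizeof(epid));
}

static int
delegate_socket_to_uuid(int fd, const uuid_t euuid)
{
	return setsockopt(fd, SOL_SOCKET, SO_DELEGATED_UUID,
	    euuid, sizeof(uuid_t));
}
#endif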
7998
7999 void
8000 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8001 uint32_t ev_datalen)
8002 {
8003 struct kev_msg ev_msg;
8004
8005 /*
8006 * A netpolicy event always starts with a netpolicy_event_data
8007 * structure, but the caller can provide for a longer event
8008 * structure to post, depending on the event code.
8009 */
8010 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8011
8012 bzero(&ev_msg, sizeof(ev_msg));
8013 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8014 ev_msg.kev_class = KEV_NETWORK_CLASS;
8015 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8016 ev_msg.event_code = ev_code;
8017
8018 ev_msg.dv[0].data_ptr = ev_data;
8019 ev_msg.dv[0].data_length = ev_datalen;
8020
8021 kev_post_msg(&ev_msg);
8022 }
8023
8024 void
8025 socket_post_kev_msg(uint32_t ev_code,
8026 struct kev_socket_event_data *ev_data,
8027 uint32_t ev_datalen)
8028 {
8029 struct kev_msg ev_msg;
8030
8031 bzero(&ev_msg, sizeof(ev_msg));
8032 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8033 ev_msg.kev_class = KEV_NETWORK_CLASS;
8034 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8035 ev_msg.event_code = ev_code;
8036
8037 ev_msg.dv[0].data_ptr = ev_data;
8038 ev_msg.dv[0].data_length = ev_datalen;
8039
8040 kev_post_msg(&ev_msg);
8041 }
8042
8043 void
8044 socket_post_kev_msg_closed(struct socket *so)
8045 {
8046 struct kev_socket_closed ev;
8047 struct sockaddr *socksa = NULL, *peersa = NULL;
8048 int err;
8049 bzero(&ev, sizeof(ev));
8050 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8051 if (err == 0) {
8052 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8053 &peersa);
8054 if (err == 0) {
8055 memcpy(&ev.ev_data.kev_sockname, socksa,
8056 min(socksa->sa_len,
8057 sizeof(ev.ev_data.kev_sockname)));
8058 memcpy(&ev.ev_data.kev_peername, peersa,
8059 min(peersa->sa_len,
8060 sizeof(ev.ev_data.kev_peername)));
8061 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8062 &ev.ev_data, sizeof(ev));
8063 }
8064 }
8065 if (socksa != NULL) {
8066 FREE(socksa, M_SONAME);
8067 }
8068 if (peersa != NULL) {
8069 FREE(peersa, M_SONAME);
8070 }
8071 }