1 /*
2 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif /* MAC */
129
130 #if MULTIPATH
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
134
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
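/*
 * For example, ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16; the
 * add-and-mask trick assumes (b) is a power of two.
 */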
136
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
139 #else
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
141 #endif
142
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
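
/*
 * A minimal sketch (illustrative only; the authoritative check lives in
 * sosend()) of how the two knobs above are expected to combine when
 * deciding whether a large write may use jumbo clusters:
 *
 *	if (sosendjcl &&
 *	    (sosendjcl_ignore_capab || (so->so_flags & SOF_MULTIPAGES))) {
 *		// the write may be copied into jumbo clusters
 *	}
 *
 * With sosendjcl_ignore_capab left at 0, only sockets whose outgoing
 * interface is IFNET_MULTIPAGES capable (and were therefore marked
 * SOF_MULTIPAGES) take the jumbo-cluster path.
 */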
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in a header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof(socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof(struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477 lck_mtx_unlock(so_cache_mtx);
478
479 if (waitok) {
480 *so = (struct socket *)zalloc(so_cache_zone);
481 } else {
482 *so = (struct socket *)zalloc_noblock(so_cache_zone);
483 }
484
485 if (*so == NULL) {
486 return;
487 }
488
489 bzero((caddr_t)*so, sizeof(struct socket));
490
491 /*
492 * Define offsets for extra structures into our
493 * single block of memory. Align extra structures
494 * on longword boundaries.
495 */
496
497 offset = (uintptr_t)*so;
498 offset += sizeof(struct socket);
499
500 offset = ALIGN(offset);
501
502 (*so)->so_saved_pcb = (caddr_t)offset;
503 offset += get_inpcb_str_size();
504
505 offset = ALIGN(offset);
506
507 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
508 (caddr_t)offset;
509 }
510
511 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
512 }
513
514 static void
515 cached_sock_free(struct socket *so)
516 {
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count) {
526 so_cache_hw = cached_sock_count;
527 }
528
529 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
530
531 so->cache_timestamp = so_cache_time;
532 lck_mtx_unlock(so_cache_mtx);
533 }
534 }
535
536 void
537 so_update_last_owner_locked(struct socket *so, proc_t self)
538 {
539 if (so->last_pid != 0) {
540 /*
541 * last_pid and last_upid should remain zero for sockets
542 * created using sock_socket. The check above achieves that
543 */
544 if (self == PROC_NULL) {
545 self = current_proc();
546 }
547
548 if (so->last_upid != proc_uniqueid(self) ||
549 so->last_pid != proc_pid(self)) {
550 so->last_upid = proc_uniqueid(self);
551 so->last_pid = proc_pid(self);
552 proc_getexecutableuuid(self, so->last_uuid,
553 sizeof(so->last_uuid));
554 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
555 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
556 }
557 }
558 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
559 }
560 }
561
562 void
563 so_update_policy(struct socket *so)
564 {
565 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
566 (void) inp_update_policy(sotoinpcb(so));
567 }
568 }
569
570 #if NECP
571 static void
572 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
573 struct sockaddr *override_remote_addr)
574 {
575 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
576 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
577 override_remote_addr, 0);
578 }
579 }
580 #endif /* NECP */
581
582 boolean_t
583 so_cache_timer(void)
584 {
585 struct socket *p;
586 int n_freed = 0;
587 boolean_t rc = FALSE;
588
589 lck_mtx_lock(so_cache_mtx);
590 so_cache_timeouts++;
591 so_cache_time = net_uptime();
592
593 while (!STAILQ_EMPTY(&so_cache_head)) {
594 VERIFY(cached_sock_count > 0);
595 p = STAILQ_FIRST(&so_cache_head);
596 if ((so_cache_time - p->cache_timestamp) <
597 SO_CACHE_TIME_LIMIT) {
598 break;
599 }
600
601 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
602 --cached_sock_count;
603
604 zfree(so_cache_zone, p);
605
606 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
607 so_cache_max_freed++;
608 break;
609 }
610 }
611
612 /* Schedule again if there is more to cleanup */
613 if (!STAILQ_EMPTY(&so_cache_head)) {
614 rc = TRUE;
615 }
616
617 lck_mtx_unlock(so_cache_mtx);
618 return rc;
619 }
620
621 /*
622 * Get a socket structure from our zone, and initialize it.
623 * We don't implement `waitok' yet (see comments in uipc_domain.c).
624 * Note that it would probably be better to allocate socket
625 * and PCB at the same time, but I'm not convinced that all
626 * the protocols can be easily modified to do this.
627 */
628 struct socket *
629 soalloc(int waitok, int dom, int type)
630 {
631 struct socket *so;
632
633 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
634 cached_sock_alloc(&so, waitok);
635 } else {
636 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
637 M_WAITOK);
638 if (so != NULL) {
639 bzero(so, sizeof(*so));
640 }
641 }
642 if (so != NULL) {
643 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
644 so->so_zone = socket_zone;
645
646 /*
647 * Increment the socket allocation statistics
648 */
649 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
650
651 #if CONFIG_MACF_SOCKET
652 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
653 if (mac_socket_label_init(so, !waitok) != 0) {
654 sodealloc(so);
655 return NULL;
656 }
657 #endif /* MAC_SOCKET */
658 }
659
660 return so;
661 }
662
663 int
664 socreate_internal(int dom, struct socket **aso, int type, int proto,
665 struct proc *p, uint32_t flags, struct proc *ep)
666 {
667 struct protosw *prp;
668 struct socket *so;
669 int error = 0;
670 #if defined(XNU_TARGET_OS_OSX)
671 pid_t rpid = -1;
672 #endif
673
674 #if TCPDEBUG
675 extern int tcpconsdebug;
676 #endif
677
678 VERIFY(aso != NULL);
679 *aso = NULL;
680
681 if (proto != 0) {
682 prp = pffindproto(dom, proto, type);
683 } else {
684 prp = pffindtype(dom, type);
685 }
686
687 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
688 if (pffinddomain(dom) == NULL) {
689 return EAFNOSUPPORT;
690 }
691 if (proto != 0) {
692 if (pffindprotonotype(dom, proto) != NULL) {
693 return EPROTOTYPE;
694 }
695 }
696 return EPROTONOSUPPORT;
697 }
698 if (prp->pr_type != type) {
699 return EPROTOTYPE;
700 }
701 so = soalloc(1, dom, type);
702 if (so == NULL) {
703 return ENOBUFS;
704 }
705
706 switch (dom) {
707 case PF_LOCAL:
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
709 break;
710 case PF_INET:
711 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
712 if (type == SOCK_STREAM) {
713 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
714 } else {
715 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
716 }
717 break;
718 case PF_ROUTE:
719 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
720 break;
721 case PF_NDRV:
722 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
723 break;
724 case PF_KEY:
725 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
726 break;
727 case PF_INET6:
728 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
729 if (type == SOCK_STREAM) {
730 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
731 } else {
732 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
733 }
734 break;
735 case PF_SYSTEM:
736 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
737 break;
738 case PF_MULTIPATH:
739 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
740 break;
741 default:
742 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
743 break;
744 }
745
746 if (flags & SOCF_MPTCP) {
747 so->so_state |= SS_NBIO;
748 }
749
750 TAILQ_INIT(&so->so_incomp);
751 TAILQ_INIT(&so->so_comp);
752 so->so_type = type;
753 so->last_upid = proc_uniqueid(p);
754 so->last_pid = proc_pid(p);
755 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
756 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
757
758 if (ep != PROC_NULL && ep != p) {
759 so->e_upid = proc_uniqueid(ep);
760 so->e_pid = proc_pid(ep);
761 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
762 so->so_flags |= SOF_DELEGATED;
763 #if defined(XNU_TARGET_OS_OSX)
764 if (ep->p_responsible_pid != so->e_pid) {
765 rpid = ep->p_responsible_pid;
766 }
767 #endif
768 }
769
770 #if defined(XNU_TARGET_OS_OSX)
771 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
772 rpid = p->p_responsible_pid;
773 }
774
775 so->so_rpid = -1;
776 uuid_clear(so->so_ruuid);
777 if (rpid >= 0) {
778 proc_t rp = proc_find(rpid);
779 if (rp != PROC_NULL) {
780 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
781 so->so_rpid = rpid;
782 proc_rele(rp);
783 }
784 }
785 #endif
786
787 so->so_cred = kauth_cred_proc_ref(p);
788 if (!suser(kauth_cred_get(), NULL)) {
789 so->so_state |= SS_PRIV;
790 }
791
792 so->so_proto = prp;
793 so->so_rcv.sb_flags |= SB_RECV;
794 so->so_rcv.sb_so = so->so_snd.sb_so = so;
795 so->next_lock_lr = 0;
796 so->next_unlock_lr = 0;
797
798 #if CONFIG_MACF_SOCKET
799 mac_socket_label_associate(kauth_cred_get(), so);
800 #endif /* MAC_SOCKET */
801
802 /*
803 * Attachment will create the per-pcb lock if necessary and
804 * increase the refcount for creation; make sure this is done before
805 * the socket is inserted in any lists.
806 */
807 so->so_usecount++;
808
809 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
810 if (error != 0) {
811 /*
812 * Warning:
813 * If so_pcb is not zero, the socket will be leaked,
814 * so protocol attachment handler must be coded carefuly
815 */
816 so->so_state |= SS_NOFDREF;
817 VERIFY(so->so_usecount > 0);
818 so->so_usecount--;
819 sofreelastref(so, 1); /* will deallocate the socket */
820 return error;
821 }
822
823 /*
824 * Note: needs so_pcb to be set after pru_attach
825 */
826 if (prp->pr_update_last_owner != NULL) {
827 (*prp->pr_update_last_owner)(so, p, ep);
828 }
829
830 atomic_add_32(&prp->pr_domain->dom_refs, 1);
831 TAILQ_INIT(&so->so_evlist);
832
833 /* Attach socket filters for this protocol */
834 sflt_initsock(so);
835 #if TCPDEBUG
836 if (tcpconsdebug == 2) {
837 so->so_options |= SO_DEBUG;
838 }
839 #endif
840 so_set_default_traffic_class(so);
841
842 /*
843 * If this thread or task is marked to create backgrounded sockets,
844 * mark the socket as background.
845 */
846 if (!(flags & SOCF_MPTCP) &&
847 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
848 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
849 so->so_background_thread = current_thread();
850 }
851
852 switch (dom) {
853 /*
854 * Don't mark Unix domain or system
855 * eligible for defunct by default.
856 */
857 case PF_LOCAL:
858 case PF_SYSTEM:
859 so->so_flags |= SOF_NODEFUNCT;
860 break;
861 default:
862 break;
863 }
864
865 /*
866 * Entitlements can't be checked at socket creation time except if the
867 * application requested a feature guarded by a privilege (c.f., socket
868 * delegation).
869 * The priv(9) and the Sandboxing APIs are designed with the idea that
870 * a privilege check should only be triggered by a userland request.
871 * A privilege check at socket creation time is time-consuming and
872 * could trigger many authorisation error messages from the security
873 * APIs.
874 */
875
876 *aso = so;
877
878 return 0;
879 }
880
881 /*
882 * Returns: 0 Success
883 * EAFNOSUPPORT
884 * EPROTOTYPE
885 * EPROTONOSUPPORT
886 * ENOBUFS
887 * <pru_attach>:ENOBUFS[AF_UNIX]
888 * <pru_attach>:ENOBUFS[TCP]
889 * <pru_attach>:ENOMEM[TCP]
890 * <pru_attach>:??? [other protocol families, IPSEC]
891 */
892 int
893 socreate(int dom, struct socket **aso, int type, int proto)
894 {
895 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
896 PROC_NULL);
897 }
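
/*
 * Illustrative usage (a sketch, not taken from a real caller): an
 * in-kernel client would typically pair socreate() with soclose():
 *
 *	struct socket *so = NULL;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket (sobindlock(), soconnect(), ...) ...
 *		soclose(so);
 *	}
 *
 * Most kernel subsystems go through the sock_socket()/sock_close() KPIs
 * instead of calling these routines directly.
 */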
898
899 int
900 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
901 {
902 int error = 0;
903 struct proc *ep = PROC_NULL;
904
905 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
906 error = ESRCH;
907 goto done;
908 }
909
910 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
911
912 /*
913 * It might not be wise to hold the proc reference when calling
914 * socreate_internal since it calls soalloc with M_WAITOK
915 */
916 done:
917 if (ep != PROC_NULL) {
918 proc_rele(ep);
919 }
920
921 return error;
922 }
923
924 /*
925 * Returns: 0 Success
926 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
927 * <pru_bind>:EAFNOSUPPORT Address family not supported
928 * <pru_bind>:EADDRNOTAVAIL Address not available.
929 * <pru_bind>:EINVAL Invalid argument
930 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
931 * <pru_bind>:EACCES Permission denied
932 * <pru_bind>:EADDRINUSE Address in use
933 * <pru_bind>:EAGAIN Resource unavailable, try again
934 * <pru_bind>:EPERM Operation not permitted
935 * <pru_bind>:???
936 * <sf_bind>:???
937 *
938 * Notes: It's not possible to fully enumerate the return codes above,
939 * since socket filter authors and protocol family authors may
940 * not choose to limit their error returns to those listed, even
941 * though this may result in some software operating incorrectly.
942 *
943 * The error codes which are enumerated above are those known to
944 * be returned by the tcp_usr_bind function supplied.
945 */
946 int
947 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
948 {
949 struct proc *p = current_proc();
950 int error = 0;
951
952 if (dolock) {
953 socket_lock(so, 1);
954 }
955
956 so_update_last_owner_locked(so, p);
957 so_update_policy(so);
958
959 #if NECP
960 so_update_necp_policy(so, nam, NULL);
961 #endif /* NECP */
962
963 /*
964 * If this is a bind request on a socket that has been marked
965 * as inactive, reject it now before we go any further.
966 */
967 if (so->so_flags & SOF_DEFUNCT) {
968 error = EINVAL;
969 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
970 __func__, proc_pid(p), proc_best_name(p),
971 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
972 SOCK_DOM(so), SOCK_TYPE(so), error);
973 goto out;
974 }
975
976 /* Socket filter */
977 error = sflt_bind(so, nam);
978
979 if (error == 0) {
980 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
981 }
982 out:
983 if (dolock) {
984 socket_unlock(so, 1);
985 }
986
987 if (error == EJUSTRETURN) {
988 error = 0;
989 }
990
991 return error;
992 }
993
994 void
995 sodealloc(struct socket *so)
996 {
997 kauth_cred_unref(&so->so_cred);
998
999 /* Remove any filters */
1000 sflt_termsock(so);
1001
1002 #if CONTENT_FILTER
1003 cfil_sock_detach(so);
1004 #endif /* CONTENT_FILTER */
1005
1006 /* Delete the state allocated for msg queues on a socket */
1007 if (so->so_flags & SOF_ENABLE_MSGS) {
1008 FREE(so->so_msg_state, M_TEMP);
1009 so->so_msg_state = NULL;
1010 }
1011 VERIFY(so->so_msg_state == NULL);
1012
1013 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
1014
1015 #if CONFIG_MACF_SOCKET
1016 mac_socket_label_destroy(so);
1017 #endif /* MAC_SOCKET */
1018
1019 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1020 cached_sock_free(so);
1021 } else {
1022 FREE_ZONE(so, sizeof(*so), so->so_zone);
1023 }
1024 }
1025
1026 /*
1027 * Returns: 0 Success
1028 * EINVAL
1029 * EOPNOTSUPP
1030 * <pru_listen>:EINVAL[AF_UNIX]
1031 * <pru_listen>:EINVAL[TCP]
1032 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
1033 * <pru_listen>:EINVAL[TCP] Invalid argument
1034 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
1035 * <pru_listen>:EACCES[TCP] Permission denied
1036 * <pru_listen>:EADDRINUSE[TCP] Address in use
1037 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
1038 * <pru_listen>:EPERM[TCP] Operation not permitted
1039 * <sf_listen>:???
1040 *
1041 * Notes: Other <pru_listen> returns depend on the protocol family; all
1042 * <sf_listen> returns depend on what the filter author causes
1043 * their filter to return.
1044 */
1045 int
1046 solisten(struct socket *so, int backlog)
1047 {
1048 struct proc *p = current_proc();
1049 int error = 0;
1050
1051 socket_lock(so, 1);
1052
1053 so_update_last_owner_locked(so, p);
1054 so_update_policy(so);
1055
1056 #if NECP
1057 so_update_necp_policy(so, NULL, NULL);
1058 #endif /* NECP */
1059
1060 if (so->so_proto == NULL) {
1061 error = EINVAL;
1062 goto out;
1063 }
1064 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1065 error = EOPNOTSUPP;
1066 goto out;
1067 }
1068
1069 /*
1070 * If the listen request is made on a socket that is not fully
1071 * disconnected, or on a socket that has been marked as inactive,
1072 * reject the request now.
1073 */
1074 if ((so->so_state &
1075 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1076 (so->so_flags & SOF_DEFUNCT)) {
1077 error = EINVAL;
1078 if (so->so_flags & SOF_DEFUNCT) {
1079 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1080 "(%d)\n", __func__, proc_pid(p),
1081 proc_best_name(p),
1082 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1083 SOCK_DOM(so), SOCK_TYPE(so), error);
1084 }
1085 goto out;
1086 }
1087
1088 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1089 error = EPERM;
1090 goto out;
1091 }
1092
1093 error = sflt_listen(so);
1094 if (error == 0) {
1095 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1096 }
1097
1098 if (error) {
1099 if (error == EJUSTRETURN) {
1100 error = 0;
1101 }
1102 goto out;
1103 }
1104
1105 if (TAILQ_EMPTY(&so->so_comp)) {
1106 so->so_options |= SO_ACCEPTCONN;
1107 }
1108 /*
1109 * POSIX: The implementation may have an upper limit on the length of
1110 * the listen queue, either global or per accepting socket. If backlog
1111 * exceeds this limit, the length of the listen queue is set to the
1112 * limit.
1113 *
1114 * If listen() is called with a backlog argument value that is less
1115 * than 0, the function behaves as if it had been called with a backlog
1116 * argument value of 0.
1117 *
1118 * A backlog argument of 0 may allow the socket to accept connections,
1119 * in which case the length of the listen queue may be set to an
1120 * implementation-defined minimum value.
1121 */
1122 if (backlog <= 0 || backlog > somaxconn) {
1123 backlog = somaxconn;
1124 }
1125
1126 so->so_qlimit = backlog;
1127 out:
1128 socket_unlock(so, 1);
1129 return error;
1130 }
1131
1132 /*
1133 * The "accept list lock" protects the fields related to the listener queues
1134 * because we can unlock a socket to respect the lock ordering between
1135 * the listener socket and its client sockets. The lock ordering is to
1136 * acquire the client socket first, before the listener socket.
1137 *
1138 * The accept list lock serializes access to the following fields:
1139 * - of the listener socket:
1140 * - so_comp
1141 * - so_incomp
1142 * - so_qlen
1143 * - so_inqlen
1144 * - of client sockets that are in so_comp or so_incomp:
1145 * - so_head
1146 * - so_list
1147 *
1148 * As one can see, the accept list lock protects the consistency of the
1149 * linkage of the client sockets.
1150 *
1151 * Note that those fields may be read without holding the accept list lock
1152 * for a preflight provided the accept list lock is taken when committing
1153 * to take an action based on the result of the preflight. The preflight
1154 * saves the cost of doing the unlock/lock dance.
1155 */
1156 void
1157 so_acquire_accept_list(struct socket *head, struct socket *so)
1158 {
1159 lck_mtx_t *mutex_held;
1160
1161 if (head->so_proto->pr_getlock == NULL) {
1162 return;
1163 }
1164 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1165 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1166
1167 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1168 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1169 return;
1170 }
1171 if (so != NULL) {
1172 socket_unlock(so, 0);
1173 }
1174 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1175 so_accept_list_waits += 1;
1176 msleep((caddr_t)&head->so_incomp, mutex_held,
1177 PSOCK | PCATCH, __func__, NULL);
1178 }
1179 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1180 if (so != NULL) {
1181 socket_unlock(head, 0);
1182 socket_lock(so, 0);
1183 socket_lock(head, 0);
1184 }
1185 }
1186
1187 void
1188 so_release_accept_list(struct socket *head)
1189 {
1190 if (head->so_proto->pr_getlock != NULL) {
1191 lck_mtx_t *mutex_held;
1192
1193 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1194 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1195
1196 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1197 wakeup((caddr_t)&head->so_incomp);
1198 }
1199 }
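
/*
 * A minimal sketch of the intended acquire/release pattern (the real
 * callers, e.g. sofreelastref() and soclose_locked() below, follow it):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);	// may drop and retake locks
 *	... examine or edit head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */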
1200
1201 void
1202 sofreelastref(struct socket *so, int dealloc)
1203 {
1204 struct socket *head = so->so_head;
1205
1206 /* Assume socket is locked */
1207
1208 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1209 selthreadclear(&so->so_snd.sb_sel);
1210 selthreadclear(&so->so_rcv.sb_sel);
1211 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1212 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1213 so->so_event = sonullevent;
1214 return;
1215 }
1216 if (head != NULL) {
1217 /*
1218 * Need to lock the listener when the protocol has
1219 * per socket locks
1220 */
1221 if (head->so_proto->pr_getlock != NULL) {
1222 socket_lock(head, 1);
1223 so_acquire_accept_list(head, so);
1224 }
1225 if (so->so_state & SS_INCOMP) {
1226 so->so_state &= ~SS_INCOMP;
1227 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1228 head->so_incqlen--;
1229 head->so_qlen--;
1230 so->so_head = NULL;
1231
1232 if (head->so_proto->pr_getlock != NULL) {
1233 so_release_accept_list(head);
1234 socket_unlock(head, 1);
1235 }
1236 } else if (so->so_state & SS_COMP) {
1237 if (head->so_proto->pr_getlock != NULL) {
1238 so_release_accept_list(head);
1239 socket_unlock(head, 1);
1240 }
1241 /*
1242 * We must not decommission a socket that's
1243 * on the accept(2) queue. If we do, then
1244 * accept(2) may hang after select(2) indicated
1245 * that the listening socket was ready.
1246 */
1247 selthreadclear(&so->so_snd.sb_sel);
1248 selthreadclear(&so->so_rcv.sb_sel);
1249 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1250 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1251 so->so_event = sonullevent;
1252 return;
1253 } else {
1254 if (head->so_proto->pr_getlock != NULL) {
1255 so_release_accept_list(head);
1256 socket_unlock(head, 1);
1257 }
1258 printf("sofree: not queued\n");
1259 }
1260 }
1261 sowflush(so);
1262 sorflush(so);
1263
1264 #if FLOW_DIVERT
1265 if (so->so_flags & SOF_FLOW_DIVERT) {
1266 flow_divert_detach(so);
1267 }
1268 #endif /* FLOW_DIVERT */
1269
1270 /* 3932268: disable upcall */
1271 so->so_rcv.sb_flags &= ~SB_UPCALL;
1272 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1273 so->so_event = sonullevent;
1274
1275 if (dealloc) {
1276 sodealloc(so);
1277 }
1278 }
1279
1280 void
1281 soclose_wait_locked(struct socket *so)
1282 {
1283 lck_mtx_t *mutex_held;
1284
1285 if (so->so_proto->pr_getlock != NULL) {
1286 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1287 } else {
1288 mutex_held = so->so_proto->pr_domain->dom_mtx;
1289 }
1290 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1291
1292 /*
1293 * Double check here and return if there's no outstanding upcall;
1294 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1295 */
1296 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1297 return;
1298 }
1299 so->so_rcv.sb_flags &= ~SB_UPCALL;
1300 so->so_snd.sb_flags &= ~SB_UPCALL;
1301 so->so_flags |= SOF_CLOSEWAIT;
1302
1303 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1304 "soclose_wait_locked", NULL);
1305 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1306 so->so_flags &= ~SOF_CLOSEWAIT;
1307 }
1308
1309 /*
1310 * Close a socket on last file table reference removal.
1311 * Initiate disconnect if connected.
1312 * Free socket when disconnect complete.
1313 */
1314 int
1315 soclose_locked(struct socket *so)
1316 {
1317 int error = 0;
1318 struct timespec ts;
1319
1320 if (so->so_usecount == 0) {
1321 panic("soclose: so=%p refcount=0\n", so);
1322 /* NOTREACHED */
1323 }
1324
1325 sflt_notify(so, sock_evt_closing, NULL);
1326
1327 if (so->so_upcallusecount) {
1328 soclose_wait_locked(so);
1329 }
1330
1331 #if CONTENT_FILTER
1332 /*
1333 * We have to wait until the content filters are done
1334 */
1335 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1336 cfil_sock_close_wait(so);
1337 cfil_sock_is_closed(so);
1338 cfil_sock_detach(so);
1339 }
1340 #endif /* CONTENT_FILTER */
1341
1342 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1343 soresume(current_proc(), so, 1);
1344 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1345 }
1346
1347 if ((so->so_options & SO_ACCEPTCONN)) {
1348 struct socket *sp, *sonext;
1349 int persocklock = 0;
1350 int incomp_overflow_only;
1351
1352 /*
1353 * We do not want new connections to be added
1354 * to the connection queues.
1355 */
1356 so->so_options &= ~SO_ACCEPTCONN;
1357
1358 /*
1359 * We can drop the lock on the listener once
1360 * we've acquired the incoming list
1361 */
1362 if (so->so_proto->pr_getlock != NULL) {
1363 persocklock = 1;
1364 so_acquire_accept_list(so, NULL);
1365 socket_unlock(so, 0);
1366 }
1367 again:
1368 incomp_overflow_only = 1;
1369
1370 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1371 /*
1372 * Radar 5350314
1373 * Skip sockets thrown away by tcpdropdropblreq;
1374 * they will get cleaned up by garbage collection.
1375 * Otherwise, remove the incomp socket from the queue
1376 * and let soabort trigger the appropriate cleanup.
1377 */
1378 if (sp->so_flags & SOF_OVERFLOW) {
1379 continue;
1380 }
1381
1382 if (persocklock != 0) {
1383 socket_lock(sp, 1);
1384 }
1385
1386 /*
1387 * Radar 27945981
1388 * The extra reference for the list ensures the
1389 * validity of the socket pointer when we perform the
1390 * unlock of the head above.
1391 */
1392 if (sp->so_state & SS_INCOMP) {
1393 sp->so_state &= ~SS_INCOMP;
1394 sp->so_head = NULL;
1395 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1396 so->so_incqlen--;
1397 so->so_qlen--;
1398
1399 (void) soabort(sp);
1400 } else {
1401 panic("%s sp %p in so_incomp but !SS_INCOMP",
1402 __func__, sp);
1403 }
1404
1405 if (persocklock != 0) {
1406 socket_unlock(sp, 1);
1407 }
1408 }
1409
1410 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1411 /* Dequeue from so_comp since sofree() won't do it */
1412 if (persocklock != 0) {
1413 socket_lock(sp, 1);
1414 }
1415
1416 if (sp->so_state & SS_COMP) {
1417 sp->so_state &= ~SS_COMP;
1418 sp->so_head = NULL;
1419 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1420 so->so_qlen--;
1421
1422 (void) soabort(sp);
1423 } else {
1424 panic("%s sp %p in so_comp but !SS_COMP",
1425 __func__, sp);
1426 }
1427
1428 if (persocklock) {
1429 socket_unlock(sp, 1);
1430 }
1431 }
1432
1433 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1434 #if (DEBUG || DEVELOPMENT)
1435 panic("%s head %p so_incomp not empty\n", __func__, so);
1436 #endif /* (DEBUG || DEVELOPMENT) */
1437
1438 goto again;
1439 }
1440
1441 if (!TAILQ_EMPTY(&so->so_comp)) {
1442 #if (DEBUG || DEVELOPMENT)
1443 panic("%s head %p so_comp not empty\n", __func__, so);
1444 #endif /* (DEBUG || DEVELOPMENT) */
1445
1446 goto again;
1447 }
1448
1449 if (persocklock) {
1450 socket_lock(so, 0);
1451 so_release_accept_list(so);
1452 }
1453 }
1454 if (so->so_pcb == NULL) {
1455 /* 3915887: mark the socket as ready for dealloc */
1456 so->so_flags |= SOF_PCBCLEARING;
1457 goto discard;
1458 }
1459 if (so->so_state & SS_ISCONNECTED) {
1460 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1461 error = sodisconnectlocked(so);
1462 if (error) {
1463 goto drop;
1464 }
1465 }
1466 if (so->so_options & SO_LINGER) {
1467 lck_mtx_t *mutex_held;
1468
1469 if ((so->so_state & SS_ISDISCONNECTING) &&
1470 (so->so_state & SS_NBIO)) {
1471 goto drop;
1472 }
1473 if (so->so_proto->pr_getlock != NULL) {
1474 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1475 } else {
1476 mutex_held = so->so_proto->pr_domain->dom_mtx;
1477 }
1478 while (so->so_state & SS_ISCONNECTED) {
1479 ts.tv_sec = (so->so_linger / 100);
1480 ts.tv_nsec = (so->so_linger % 100) *
1481 NSEC_PER_USEC * 1000 * 10;
1482 error = msleep((caddr_t)&so->so_timeo,
1483 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1484 if (error) {
1485 /*
1486 * It's OK when the timer fires;
1487 * don't report an error.
1488 */
1489 if (error == EWOULDBLOCK) {
1490 error = 0;
1491 }
1492 break;
1493 }
1494 }
1495 }
1496 }
1497 drop:
1498 if (so->so_usecount == 0) {
1499 panic("soclose: usecount is zero so=%p\n", so);
1500 /* NOTREACHED */
1501 }
1502 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1503 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1504 if (error == 0) {
1505 error = error2;
1506 }
1507 }
1508 if (so->so_usecount <= 0) {
1509 panic("soclose: usecount is zero so=%p\n", so);
1510 /* NOTREACHED */
1511 }
1512 discard:
1513 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1514 (so->so_state & SS_NOFDREF)) {
1515 panic("soclose: NOFDREF");
1516 /* NOTREACHED */
1517 }
1518 so->so_state |= SS_NOFDREF;
1519
1520 if ((so->so_flags & SOF_KNOTE) != 0) {
1521 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1522 }
1523
1524 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1525 evsofree(so);
1526
1527 VERIFY(so->so_usecount > 0);
1528 so->so_usecount--;
1529 sofree(so);
1530 return error;
1531 }
1532
1533 int
1534 soclose(struct socket *so)
1535 {
1536 int error = 0;
1537 socket_lock(so, 1);
1538
1539 if (so->so_retaincnt == 0) {
1540 error = soclose_locked(so);
1541 } else {
1542 /*
1543 * If the FD is going away but the socket is
1544 * retained in the kernel, remove its reference.
1545 */
1546 so->so_usecount--;
1547 if (so->so_usecount < 2) {
1548 panic("soclose: retaincnt non null and so=%p "
1549 "usecount=%d\n", so, so->so_usecount);
1550 }
1551 }
1552 socket_unlock(so, 1);
1553 return error;
1554 }
1555
1556 /*
1557 * Must be called at splnet...
1558 */
1559 /* Should already be locked */
1560 int
1561 soabort(struct socket *so)
1562 {
1563 int error;
1564
1565 #ifdef MORE_LOCKING_DEBUG
1566 lck_mtx_t *mutex_held;
1567
1568 if (so->so_proto->pr_getlock != NULL) {
1569 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1570 } else {
1571 mutex_held = so->so_proto->pr_domain->dom_mtx;
1572 }
1573 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1574 #endif
1575
1576 if ((so->so_flags & SOF_ABORTED) == 0) {
1577 so->so_flags |= SOF_ABORTED;
1578 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1579 if (error) {
1580 sofree(so);
1581 return error;
1582 }
1583 }
1584 return 0;
1585 }
1586
1587 int
1588 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1589 {
1590 int error;
1591
1592 if (dolock) {
1593 socket_lock(so, 1);
1594 }
1595
1596 so_update_last_owner_locked(so, PROC_NULL);
1597 so_update_policy(so);
1598 #if NECP
1599 so_update_necp_policy(so, NULL, NULL);
1600 #endif /* NECP */
1601
1602 if ((so->so_state & SS_NOFDREF) == 0) {
1603 panic("soaccept: !NOFDREF");
1604 }
1605 so->so_state &= ~SS_NOFDREF;
1606 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1607
1608 if (dolock) {
1609 socket_unlock(so, 1);
1610 }
1611 return error;
1612 }
1613
1614 int
1615 soaccept(struct socket *so, struct sockaddr **nam)
1616 {
1617 return soacceptlock(so, nam, 1);
1618 }
1619
1620 int
1621 soacceptfilter(struct socket *so, struct socket *head)
1622 {
1623 struct sockaddr *local = NULL, *remote = NULL;
1624 int error = 0;
1625
1626 /*
1627 * Hold the lock even if this socket has not been made visible
1628 * to the filter(s). For sockets with global locks, this protects
1629 * against the head or peer going away
1630 */
1631 socket_lock(so, 1);
1632 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1633 sogetaddr_locked(so, &local, 0) != 0) {
1634 so->so_state &= ~SS_NOFDREF;
1635 socket_unlock(so, 1);
1636 soclose(so);
1637 /* Out of resources; try it again next time */
1638 error = ECONNABORTED;
1639 goto done;
1640 }
1641
1642 error = sflt_accept(head, so, local, remote);
1643
1644 /*
1645 * If we get EJUSTRETURN from one of the filters, mark this socket
1646 * as inactive and return it anyway. This newly accepted socket
1647 * will be disconnected later before we hand it off to the caller.
1648 */
1649 if (error == EJUSTRETURN) {
1650 error = 0;
1651 (void) sosetdefunct(current_proc(), so,
1652 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1653 }
1654
1655 if (error != 0) {
1656 /*
1657 * This may seem like a duplication to the above error
1658 * handling part when we return ECONNABORTED, except
1659 * the following is done while holding the lock since
1660 * the socket has been exposed to the filter(s) earlier.
1661 */
1662 so->so_state &= ~SS_NOFDREF;
1663 socket_unlock(so, 1);
1664 soclose(so);
1665 /* Propagate socket filter's error code to the caller */
1666 } else {
1667 socket_unlock(so, 1);
1668 }
1669 done:
1670 /* Callee checks for NULL pointer */
1671 sock_freeaddr(remote);
1672 sock_freeaddr(local);
1673 return error;
1674 }
1675
1676 /*
1677 * Returns: 0 Success
1678 * EOPNOTSUPP Operation not supported on socket
1679 * EISCONN Socket is connected
1680 * <pru_connect>:EADDRNOTAVAIL Address not available.
1681 * <pru_connect>:EINVAL Invalid argument
1682 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1683 * <pru_connect>:EACCES Permission denied
1684 * <pru_connect>:EADDRINUSE Address in use
1685 * <pru_connect>:EAGAIN Resource unavailable, try again
1686 * <pru_connect>:EPERM Operation not permitted
1687 * <sf_connect_out>:??? [anything a filter writer might set]
1688 */
1689 int
1690 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1691 {
1692 int error;
1693 struct proc *p = current_proc();
1694
1695 if (dolock) {
1696 socket_lock(so, 1);
1697 }
1698
1699 so_update_last_owner_locked(so, p);
1700 so_update_policy(so);
1701
1702 #if NECP
1703 so_update_necp_policy(so, NULL, nam);
1704 #endif /* NECP */
1705
1706 /*
1707 * If this is a listening socket or if this is a previously-accepted
1708 * socket that has been marked as inactive, reject the connect request.
1709 */
1710 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1711 error = EOPNOTSUPP;
1712 if (so->so_flags & SOF_DEFUNCT) {
1713 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1714 "(%d)\n", __func__, proc_pid(p),
1715 proc_best_name(p),
1716 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1717 SOCK_DOM(so), SOCK_TYPE(so), error);
1718 }
1719 if (dolock) {
1720 socket_unlock(so, 1);
1721 }
1722 return error;
1723 }
1724
1725 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1726 if (dolock) {
1727 socket_unlock(so, 1);
1728 }
1729 return EPERM;
1730 }
1731
1732 /*
1733 * If protocol is connection-based, can only connect once.
1734 * Otherwise, if connected, try to disconnect first.
1735 * This allows user to disconnect by connecting to, e.g.,
1736 * a null address.
1737 */
1738 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1739 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1740 (error = sodisconnectlocked(so)))) {
1741 error = EISCONN;
1742 } else {
1743 /*
1744 * Run connect filter before calling protocol:
1745 * - non-blocking connect returns before completion;
1746 */
1747 error = sflt_connectout(so, nam);
1748 if (error != 0) {
1749 if (error == EJUSTRETURN) {
1750 error = 0;
1751 }
1752 } else {
1753 error = (*so->so_proto->pr_usrreqs->pru_connect)
1754 (so, nam, p);
1755 if (error != 0) {
1756 so->so_state &= ~SS_ISCONNECTING;
1757 }
1758 }
1759 }
1760 if (dolock) {
1761 socket_unlock(so, 1);
1762 }
1763 return error;
1764 }
1765
1766 int
1767 soconnect(struct socket *so, struct sockaddr *nam)
1768 {
1769 return soconnectlock(so, nam, 1);
1770 }
1771
1772 /*
1773 * Returns: 0 Success
1774 * <pru_connect2>:EINVAL[AF_UNIX]
1775 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1776 * <pru_connect2>:??? [other protocol families]
1777 *
1778 * Notes: <pru_connect2> is not supported by [TCP].
1779 */
1780 int
1781 soconnect2(struct socket *so1, struct socket *so2)
1782 {
1783 int error;
1784
1785 socket_lock(so1, 1);
1786 if (so2->so_proto->pr_lock) {
1787 socket_lock(so2, 1);
1788 }
1789
1790 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1791
1792 socket_unlock(so1, 1);
1793 if (so2->so_proto->pr_lock) {
1794 socket_unlock(so2, 1);
1795 }
1796 return error;
1797 }
1798
1799 int
1800 soconnectxlocked(struct socket *so, struct sockaddr *src,
1801 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1802 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1803 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1804 {
1805 int error;
1806
1807 so_update_last_owner_locked(so, p);
1808 so_update_policy(so);
1809
1810 /*
1811 * If this is a listening socket or if this is a previously-accepted
1812 * socket that has been marked as inactive, reject the connect request.
1813 */
1814 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1815 error = EOPNOTSUPP;
1816 if (so->so_flags & SOF_DEFUNCT) {
1817 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1818 "(%d)\n", __func__, proc_pid(p),
1819 proc_best_name(p),
1820 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1821 SOCK_DOM(so), SOCK_TYPE(so), error);
1822 }
1823 return error;
1824 }
1825
1826 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1827 return EPERM;
1828 }
1829
1830 /*
1831 * If protocol is connection-based, can only connect once
1832 * unless PR_MULTICONN is set. Otherwise, if connected,
1833 * try to disconnect first. This allows user to disconnect
1834 * by connecting to, e.g., a null address.
1835 */
1836 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1837 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1838 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1839 (error = sodisconnectlocked(so)) != 0)) {
1840 error = EISCONN;
1841 } else {
1842 /*
1843 * Run connect filter before calling protocol:
1844 * - non-blocking connect returns before completion;
1845 */
1846 error = sflt_connectout(so, dst);
1847 if (error != 0) {
1848 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1849 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1850 if (error == EJUSTRETURN) {
1851 error = 0;
1852 }
1853 } else {
1854 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1855 (so, src, dst, p, ifscope, aid, pcid,
1856 flags, arg, arglen, auio, bytes_written);
1857 if (error != 0) {
1858 so->so_state &= ~SS_ISCONNECTING;
1859 }
1860 }
1861 }
1862
1863 return error;
1864 }
1865
1866 int
1867 sodisconnectlocked(struct socket *so)
1868 {
1869 int error;
1870
1871 if ((so->so_state & SS_ISCONNECTED) == 0) {
1872 error = ENOTCONN;
1873 goto bad;
1874 }
1875 if (so->so_state & SS_ISDISCONNECTING) {
1876 error = EALREADY;
1877 goto bad;
1878 }
1879
1880 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1881 if (error == 0) {
1882 sflt_notify(so, sock_evt_disconnected, NULL);
1883 }
1884
1885 bad:
1886 return error;
1887 }
1888
1889 /* Locking version */
1890 int
1891 sodisconnect(struct socket *so)
1892 {
1893 int error;
1894
1895 socket_lock(so, 1);
1896 error = sodisconnectlocked(so);
1897 socket_unlock(so, 1);
1898 return error;
1899 }
1900
1901 int
1902 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1903 {
1904 int error;
1905
1906 /*
1907 * Call the protocol disconnectx handler; let it handle all
1908 * matters related to the connection state of this session.
1909 */
1910 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1911 if (error == 0) {
1912 /*
1913 * The event applies only for the session, not for
1914 * the disconnection of individual subflows.
1915 */
1916 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1917 sflt_notify(so, sock_evt_disconnected, NULL);
1918 }
1919 }
1920 return error;
1921 }
1922
1923 int
1924 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1925 {
1926 int error;
1927
1928 socket_lock(so, 1);
1929 error = sodisconnectxlocked(so, aid, cid);
1930 socket_unlock(so, 1);
1931 return error;
1932 }
1933
1934 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
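/*
 * SBLOCKWAIT() maps MSG_DONTWAIT onto the sblock() wait flag:
 * SBLOCKWAIT(MSG_DONTWAIT) == 0 (fail rather than sleep for the lock),
 * while SBLOCKWAIT(0) == SBL_WAIT (sleep until the lock is acquired).
 */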
1935
1936 /*
1937 * sosendcheck will lock the socket buffer if it isn't locked and
1938 * verify that there is space for the data being inserted.
1939 *
1940 * Returns: 0 Success
1941 * EPIPE
1942 * sblock:EWOULDBLOCK
1943 * sblock:EINTR
1944 * sbwait:EBADF
1945 * sbwait:EINTR
1946 * [so_error]:???
1947 */
1948 int
1949 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1950 int32_t clen, int32_t atomic, int flags, int *sblocked,
1951 struct mbuf *control)
1952 {
1953 int error = 0;
1954 int32_t space;
1955 int assumelock = 0;
1956
1957 restart:
1958 if (*sblocked == 0) {
1959 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1960 so->so_send_filt_thread != 0 &&
1961 so->so_send_filt_thread == current_thread()) {
1962 /*
1963 * We're being called recursively from a filter,
1964 * allow this to continue. Radar 4150520.
1965 * Don't set sblocked because we don't want
1966 * to perform an unlock later.
1967 */
1968 assumelock = 1;
1969 } else {
1970 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1971 if (error) {
1972 if (so->so_flags & SOF_DEFUNCT) {
1973 goto defunct;
1974 }
1975 return error;
1976 }
1977 *sblocked = 1;
1978 }
1979 }
1980
1981 /*
1982 * If a send attempt is made on a socket that has been marked
1983 * as inactive (disconnected), reject the request.
1984 */
1985 if (so->so_flags & SOF_DEFUNCT) {
1986 defunct:
1987 error = EPIPE;
1988 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1989 __func__, proc_selfpid(), proc_best_name(current_proc()),
1990 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1991 SOCK_DOM(so), SOCK_TYPE(so), error);
1992 return error;
1993 }
1994
1995 if (so->so_state & SS_CANTSENDMORE) {
1996 #if CONTENT_FILTER
1997 /*
1998 * Can re-inject data of half closed connections
1999 */
2000 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2001 so->so_snd.sb_cfil_thread == current_thread() &&
2002 cfil_sock_data_pending(&so->so_snd) != 0) {
2003 CFIL_LOG(LOG_INFO,
2004 "so %llx ignore SS_CANTSENDMORE",
2005 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2006 } else
2007 #endif /* CONTENT_FILTER */
2008 return EPIPE;
2009 }
2010 if (so->so_error) {
2011 error = so->so_error;
2012 so->so_error = 0;
2013 return error;
2014 }
2015
2016 if ((so->so_state & SS_ISCONNECTED) == 0) {
2017 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2018 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2019 (resid != 0 || clen == 0) &&
2020 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2021 return ENOTCONN;
2022 }
2023 } else if (addr == 0) {
2024 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2025 ENOTCONN : EDESTADDRREQ;
2026 }
2027 }
2028
2029 if (so->so_flags & SOF_ENABLE_MSGS) {
2030 space = msgq_sbspace(so, control);
2031 } else {
2032 space = sbspace(&so->so_snd);
2033 }
2034
2035 if (flags & MSG_OOB) {
2036 space += 1024;
2037 }
2038 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2039 clen > so->so_snd.sb_hiwat) {
2040 return EMSGSIZE;
2041 }
2042
2043 if ((space < resid + clen &&
2044 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2045 space < clen)) ||
2046 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2047 /*
2048 * don't block the connectx call when there's more data
2049 * than can be copied.
2050 */
2051 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2052 if (space == 0) {
2053 return EWOULDBLOCK;
2054 }
2055 if (space < (int32_t)so->so_snd.sb_lowat) {
2056 return 0;
2057 }
2058 }
2059 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2060 assumelock) {
2061 return EWOULDBLOCK;
2062 }
2063 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2064 *sblocked = 0;
2065 error = sbwait(&so->so_snd);
2066 if (error) {
2067 if (so->so_flags & SOF_DEFUNCT) {
2068 goto defunct;
2069 }
2070 return error;
2071 }
2072 goto restart;
2073 }
2074 return 0;
2075 }
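
/*
 * A minimal sketch of the sosendcheck() caller contract, mirroring what
 * sosend() below does: on success the send buffer is locked unless the
 * recursive-filter case applied, and *sblocked selects the unlock path.
 * The snippet is simplified; see sosend() for the real sequence.
 */
#if 0	/* example only -- not compiled */
	int sblocked = 0;
	int error;

	socket_lock(so, 1);
	error = sosendcheck(so, NULL, resid, 0, sosendallatonce(so),
	    flags, &sblocked, NULL);
	if (error == 0) {
		/* ... build the mbuf chain and call pru_send ... */
	}
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* also unlocks the socket */
	} else {
		socket_unlock(so, 1);
	}
#endif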
2076
2077 /*
2078 * Send on a socket.
2079 * If send must go all at once and message is larger than
2080 * send buffering, then hard error.
2081 * Lock against other senders.
2082 * If must go all at once and not enough room now, then
2083 * inform user that this would block and do nothing.
2084 * Otherwise, if nonblocking, send as much as possible.
2085 * The data to be sent is described by "uio" if nonzero,
2086 * otherwise by the mbuf chain "top" (which must be null
2087 * if uio is not). Data provided in mbuf chain must be small
2088 * enough to send all at once.
2089 *
2090 * Returns nonzero on error, timeout or signal; callers
2091 * must check for short counts if EINTR/ERESTART are returned.
2092 * Data and control buffers are freed on return.
2093 *
2094 * Returns: 0 Success
2095 * EOPNOTSUPP
2096 * EINVAL
2097 * ENOBUFS
2098 * uiomove:EFAULT
2099 * sosendcheck:EPIPE
2100 * sosendcheck:EWOULDBLOCK
2101 * sosendcheck:EINTR
2102 * sosendcheck:EBADF
2103 * sosendcheck:EINTR
2104 * sosendcheck:??? [value from so_error]
2105 * <pru_send>:ECONNRESET[TCP]
2106 * <pru_send>:EINVAL[TCP]
2107 * <pru_send>:ENOBUFS[TCP]
2108 * <pru_send>:EADDRINUSE[TCP]
2109 * <pru_send>:EADDRNOTAVAIL[TCP]
2110 * <pru_send>:EAFNOSUPPORT[TCP]
2111 * <pru_send>:EACCES[TCP]
2112 * <pru_send>:EAGAIN[TCP]
2113 * <pru_send>:EPERM[TCP]
2114 * <pru_send>:EMSGSIZE[TCP]
2115 * <pru_send>:EHOSTUNREACH[TCP]
2116 * <pru_send>:ENETUNREACH[TCP]
2117 * <pru_send>:ENETDOWN[TCP]
2118 * <pru_send>:ENOMEM[TCP]
2119 * <pru_send>:ENOBUFS[TCP]
2120 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2121 * <pru_send>:EINVAL[AF_UNIX]
2122 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2123 * <pru_send>:EPIPE[AF_UNIX]
2124 * <pru_send>:ENOTCONN[AF_UNIX]
2125 * <pru_send>:EISCONN[AF_UNIX]
2126 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2127 * <sf_data_out>:??? [whatever a filter author chooses]
2128 *
2129 * Notes: Other <pru_send> returns depend on the protocol family; all
2130 * <sf_data_out> returns depend on what the filter author causes
2131 * their filter to return.
2132 */
2133 int
2134 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2135 struct mbuf *top, struct mbuf *control, int flags)
2136 {
2137 struct mbuf **mp;
2138 struct mbuf *m, *freelist = NULL;
2139 user_ssize_t space, len, resid, orig_resid;
2140 int clen = 0, error, dontroute, mlen, sendflags;
2141 int atomic = sosendallatonce(so) || top;
2142 int sblocked = 0;
2143 struct proc *p = current_proc();
2144 struct mbuf *control_copy = NULL;
2145 uint16_t headroom = 0;
2146 boolean_t en_tracing = FALSE;
2147
2148 if (uio != NULL) {
2149 resid = uio_resid(uio);
2150 } else {
2151 resid = top->m_pkthdr.len;
2152 }
2153
2154 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2155 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2156
2157 socket_lock(so, 1);
2158
2159 /*
2160 * Trace only if tracing is enabled, this is a network (vs. unix)
2161 * socket, and it is non-loopback.
2162 */
2163 if (ENTR_SHOULDTRACE &&
2164 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2165 struct inpcb *inp = sotoinpcb(so);
2166 if (inp->inp_last_outifp != NULL &&
2167 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2168 en_tracing = TRUE;
2169 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2170 VM_KERNEL_ADDRPERM(so),
2171 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2172 (int64_t)resid);
2173 orig_resid = resid;
2174 }
2175 }
2176
2177 /*
2178 * Re-injection should not affect process accounting
2179 */
2180 if ((flags & MSG_SKIPCFIL) == 0) {
2181 so_update_last_owner_locked(so, p);
2182 so_update_policy(so);
2183
2184 #if NECP
2185 so_update_necp_policy(so, NULL, addr);
2186 #endif /* NECP */
2187 }
2188
2189 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2190 error = EOPNOTSUPP;
2191 goto out_locked;
2192 }
2193
2194 /*
2195 * In theory resid should be unsigned.
2196 * However, space must be signed, as it might be less than 0
2197 * if we over-committed, and we must use a signed comparison
2198 * of space and resid. On the other hand, a negative resid
2199 * causes us to loop sending 0-length segments to the protocol.
2200 *
2201 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2202 * But it will be used by sockets doing message delivery.
2203 *
2204 * Note: We limit resid to be a positive int value as we use
2205 * imin() to set bytes_to_copy -- radr://14558484
2206 */
2207 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2208 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2209 error = EINVAL;
2210 goto out_locked;
2211 }
2212
2213 dontroute = (flags & MSG_DONTROUTE) &&
2214 (so->so_options & SO_DONTROUTE) == 0 &&
2215 (so->so_proto->pr_flags & PR_ATOMIC);
2216 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2217
2218 if (control != NULL) {
2219 clen = control->m_len;
2220 }
2221
2222 if (soreserveheadroom != 0) {
2223 headroom = so->so_pktheadroom;
2224 }
2225
2226 do {
2227 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2228 &sblocked, control);
2229 if (error) {
2230 goto out_locked;
2231 }
2232
2233 mp = &top;
2234 if (so->so_flags & SOF_ENABLE_MSGS) {
2235 space = msgq_sbspace(so, control);
2236 } else {
2237 space = sbspace(&so->so_snd) - clen;
2238 }
2239 space += ((flags & MSG_OOB) ? 1024 : 0);
2240
2241 do {
2242 if (uio == NULL) {
2243 /*
2244 * Data is prepackaged in "top".
2245 */
2246 resid = 0;
2247 if (flags & MSG_EOR) {
2248 top->m_flags |= M_EOR;
2249 }
2250 } else {
2251 int chainlength;
2252 int bytes_to_copy;
2253 boolean_t jumbocl;
2254 boolean_t bigcl;
2255 int bytes_to_alloc;
2256
2257 bytes_to_copy = imin(resid, space);
2258
2259 bytes_to_alloc = bytes_to_copy;
2260 if (top == NULL) {
2261 bytes_to_alloc += headroom;
2262 }
2263
2264 if (sosendminchain > 0) {
2265 chainlength = 0;
2266 } else {
2267 chainlength = sosendmaxchain;
2268 }
2269
2270 /*
2271 * Use big 4 KB clusters when the outgoing interface
2272 * does not prefer 2 KB clusters
2273 */
2274 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2275 sosendbigcl_ignore_capab;
2276
2277 /*
2278 * Attempt to use larger than system page-size
2279 * clusters for large writes only if there is
2280 * a jumbo cluster pool and if the socket is
2281 * marked accordingly.
2282 */
2283 jumbocl = sosendjcl && njcl > 0 &&
2284 ((so->so_flags & SOF_MULTIPAGES) ||
2285 sosendjcl_ignore_capab) &&
2286 bigcl;
2287
2288 socket_unlock(so, 0);
2289
2290 do {
2291 int num_needed;
2292 int hdrs_needed = (top == NULL) ? 1 : 0;
2293
2294 /*
2295 * Try to maintain a local cache of the mbuf
2296 * clusters needed to complete this
2297 * write; the list is further limited to
2298 * the number that are currently needed
2299 * to fill the socket.  This mechanism
2300 * allows a large number of mbufs/
2301 * clusters to be grabbed under a single
2302 * mbuf lock... if we can't get any
2303 * clusters, then fall back to trying
2304 * for mbufs.  If we fail early (or
2305 * miscalculate the number needed), make
2306 * sure to release any clusters we
2307 * haven't yet consumed.
2308 */
2309 if (freelist == NULL &&
2310 bytes_to_alloc > MBIGCLBYTES &&
2311 jumbocl) {
2312 num_needed =
2313 bytes_to_alloc / M16KCLBYTES;
2314
2315 if ((bytes_to_alloc -
2316 (num_needed * M16KCLBYTES))
2317 >= MINCLSIZE) {
2318 num_needed++;
2319 }
2320
2321 freelist =
2322 m_getpackets_internal(
2323 (unsigned int *)&num_needed,
2324 hdrs_needed, M_WAIT, 0,
2325 M16KCLBYTES);
2326 /*
2327 * Fall back to 4K cluster size
2328 * if allocation failed
2329 */
2330 }
2331
2332 if (freelist == NULL &&
2333 bytes_to_alloc > MCLBYTES &&
2334 bigcl) {
2335 num_needed =
2336 bytes_to_alloc / MBIGCLBYTES;
2337
2338 if ((bytes_to_alloc -
2339 (num_needed * MBIGCLBYTES)) >=
2340 MINCLSIZE) {
2341 num_needed++;
2342 }
2343
2344 freelist =
2345 m_getpackets_internal(
2346 (unsigned int *)&num_needed,
2347 hdrs_needed, M_WAIT, 0,
2348 MBIGCLBYTES);
2349 /*
2350 * Fall back to cluster size
2351 * if allocation failed
2352 */
2353 }
2354
2355 /*
2356 * Allocate a cluster as we want to
2357 * avoid splitting the data into more
2358 * than one segment; using MINCLSIZE
2359 * would lead us to allocate two mbufs
2360 */
2361 if (soreserveheadroom != 0 &&
2362 freelist == NULL &&
2363 ((top == NULL &&
2364 bytes_to_alloc > _MHLEN) ||
2365 bytes_to_alloc > _MLEN)) {
2366 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2367 MCLBYTES;
2368 freelist =
2369 m_getpackets_internal(
2370 (unsigned int *)&num_needed,
2371 hdrs_needed, M_WAIT, 0,
2372 MCLBYTES);
2373 /*
2374 * Fall back to a single mbuf
2375 * if allocation failed
2376 */
2377 } else if (freelist == NULL &&
2378 bytes_to_alloc > MINCLSIZE) {
2379 num_needed =
2380 bytes_to_alloc / MCLBYTES;
2381
2382 if ((bytes_to_alloc -
2383 (num_needed * MCLBYTES)) >=
2384 MINCLSIZE) {
2385 num_needed++;
2386 }
2387
2388 freelist =
2389 m_getpackets_internal(
2390 (unsigned int *)&num_needed,
2391 hdrs_needed, M_WAIT, 0,
2392 MCLBYTES);
2393 /*
2394 * Fall back to a single mbuf
2395 * if allocation failed
2396 */
2397 }
2398 /*
2399 * For datagram protocols, leave
2400 * headroom for protocol headers
2401 * in the first cluster of the chain
2402 */
2403 if (freelist != NULL && atomic &&
2404 top == NULL && headroom > 0) {
2405 freelist->m_data += headroom;
2406 }
2407
2408 /*
2409 * Fall back to regular mbufs without
2410 * reserving the socket headroom
2411 */
2412 if (freelist == NULL) {
2413 if (top == NULL) {
2414 MGETHDR(freelist,
2415 M_WAIT, MT_DATA);
2416 } else {
2417 MGET(freelist,
2418 M_WAIT, MT_DATA);
2419 }
2420
2421 if (freelist == NULL) {
2422 error = ENOBUFS;
2423 socket_lock(so, 0);
2424 goto out_locked;
2425 }
2426 /*
2427 * For datagram protocols,
2428 * leave room for protocol
2429 * headers in first mbuf.
2430 */
2431 if (atomic && top == NULL &&
2432 bytes_to_copy < MHLEN) {
2433 MH_ALIGN(freelist,
2434 bytes_to_copy);
2435 }
2436 }
2437 m = freelist;
2438 freelist = m->m_next;
2439 m->m_next = NULL;
2440
2441 if ((m->m_flags & M_EXT)) {
2442 mlen = m->m_ext.ext_size -
2443 M_LEADINGSPACE(m);
2444 } else if ((m->m_flags & M_PKTHDR)) {
2445 mlen =
2446 MHLEN - M_LEADINGSPACE(m);
2447 } else {
2448 mlen = MLEN - M_LEADINGSPACE(m);
2449 }
2450 len = imin(mlen, bytes_to_copy);
2451
2452 chainlength += len;
2453
2454 space -= len;
2455
2456 error = uiomove(mtod(m, caddr_t),
2457 len, uio);
2458
2459 resid = uio_resid(uio);
2460
2461 m->m_len = len;
2462 *mp = m;
2463 top->m_pkthdr.len += len;
2464 if (error) {
2465 break;
2466 }
2467 mp = &m->m_next;
2468 if (resid <= 0) {
2469 if (flags & MSG_EOR) {
2470 top->m_flags |= M_EOR;
2471 }
2472 break;
2473 }
2474 bytes_to_copy = min(resid, space);
2475 } while (space > 0 &&
2476 (chainlength < sosendmaxchain || atomic ||
2477 resid < MINCLSIZE));
2478
2479 socket_lock(so, 0);
2480
2481 if (error) {
2482 goto out_locked;
2483 }
2484 }
2485
2486 if (dontroute) {
2487 so->so_options |= SO_DONTROUTE;
2488 }
2489
2490 /*
2491 * Compute flags here, for pru_send and NKEs
2492 *
2493 * If the user set MSG_EOF, the protocol
2494 * understands this flag, and there is nothing left
2495 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2496 */
2497 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2498 ((flags & MSG_EOF) &&
2499 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2500 (resid <= 0)) ? PRUS_EOF :
2501 /* If there is more to send set PRUS_MORETOCOME */
2502 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2503
2504 if ((flags & MSG_SKIPCFIL) == 0) {
2505 /*
2506 * Socket filter processing
2507 */
2508 error = sflt_data_out(so, addr, &top,
2509 &control, (sendflags & MSG_OOB) ?
2510 sock_data_filt_flag_oob : 0);
2511 if (error) {
2512 if (error == EJUSTRETURN) {
2513 error = 0;
2514 clen = 0;
2515 control = NULL;
2516 top = NULL;
2517 }
2518 goto out_locked;
2519 }
2520 #if CONTENT_FILTER
2521 /*
2522 * Content filter processing
2523 */
2524 error = cfil_sock_data_out(so, addr, top,
2525 control, sendflags);
2526 if (error) {
2527 if (error == EJUSTRETURN) {
2528 error = 0;
2529 clen = 0;
2530 control = NULL;
2531 top = NULL;
2532 }
2533 goto out_locked;
2534 }
2535 #endif /* CONTENT_FILTER */
2536 }
2537 if (so->so_flags & SOF_ENABLE_MSGS) {
2538 /*
2539 * Make a copy of control mbuf,
2540 * so that msg priority can be
2541 * passed to subsequent mbufs.
2542 */
2543 control_copy = m_dup(control, M_NOWAIT);
2544 }
2545 error = (*so->so_proto->pr_usrreqs->pru_send)
2546 (so, sendflags, top, addr, control, p);
2547
2548 if (dontroute) {
2549 so->so_options &= ~SO_DONTROUTE;
2550 }
2551
2552 clen = 0;
2553 control = control_copy;
2554 control_copy = NULL;
2555 top = NULL;
2556 mp = &top;
2557 if (error) {
2558 goto out_locked;
2559 }
2560 } while (resid && space > 0);
2561 } while (resid);
2562
2563 out_locked:
2564 if (sblocked) {
2565 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2566 } else {
2567 socket_unlock(so, 1);
2568 }
2569 if (top != NULL) {
2570 m_freem(top);
2571 }
2572 if (control != NULL) {
2573 m_freem(control);
2574 }
2575 if (freelist != NULL) {
2576 m_freem_list(freelist);
2577 }
2578 if (control_copy != NULL) {
2579 m_freem(control_copy);
2580 }
2581
2582 soclearfastopen(so);
2583
2584 if (en_tracing) {
2585 /* resid passed here is the bytes left in uio */
2586 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2587 VM_KERNEL_ADDRPERM(so),
2588 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2589 (int64_t)(orig_resid - resid));
2590 }
2591 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2592 so->so_snd.sb_cc, space, error);
2593
2594 return error;
2595 }
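
/*
 * A hedged user-level view of two of the error returns documented
 * above: an atomic (datagram) send larger than the high-water mark
 * fails with EMSGSIZE, while a non-blocking send on a full stream
 * socket fails with EAGAIN/EWOULDBLOCK and must be retried once the
 * socket is writable. Variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	ssize_t n = send(s, buf, len, MSG_DONTWAIT);	/* s, buf, len: hypothetical */
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
		/* wait for writability (select/poll/kqueue) and retry */
	} else if (n >= 0 && (size_t)n < len) {
		/* short write on a stream socket: send the remainder later */
	}
#endif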
2596
2597 int
2598 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2599 {
2600 struct mbuf *m0 = NULL, *control_end = NULL;
2601
2602 socket_lock_assert_owned(so);
2603
2604 /*
2605 * top must point to the mbuf chain to be sent.
2606 * If control is not NULL, top must be a packet header.
2607 */
2608 VERIFY(top != NULL &&
2609 (control == NULL || top->m_flags & M_PKTHDR));
2610
2611 /*
2612 * If control is not passed in, see if we can get it
2613 * from top.
2614 */
2615 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2616 // Locate start of control if present and start of data
2617 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2618 if (m0->m_flags & M_PKTHDR) {
2619 top = m0;
2620 break;
2621 } else if (m0->m_type == MT_CONTROL) {
2622 if (control == NULL) {
2623 // Found start of control
2624 control = m0;
2625 }
2626 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2627 // Found end of control
2628 control_end = m0;
2629 }
2630 }
2631 }
2632 if (control_end != NULL) {
2633 control_end->m_next = NULL;
2634 }
2635 }
2636
2637 int error = (*so->so_proto->pr_usrreqs->pru_send)
2638 (so, sendflags, top, addr, control, current_proc());
2639
2640 return error;
2641 }
2642
2643 /*
2644 * Supports only connected sockets (no address) without ancillary data
2645 * (control mbuf), for atomic protocols
2646 */
2647 int
2648 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2649 {
2650 struct mbuf *m, *freelist = NULL;
2651 user_ssize_t len, resid;
2652 int error, dontroute, mlen;
2653 int atomic = sosendallatonce(so);
2654 int sblocked = 0;
2655 struct proc *p = current_proc();
2656 u_int uiofirst = 0;
2657 u_int uiolast = 0;
2658 struct mbuf *top = NULL;
2659 uint16_t headroom = 0;
2660 boolean_t bigcl;
2661
2662 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2663 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2664
2665 if (so->so_type != SOCK_DGRAM) {
2666 error = EINVAL;
2667 goto out;
2668 }
2669 if (atomic == 0) {
2670 error = EINVAL;
2671 goto out;
2672 }
2673 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2674 error = EPROTONOSUPPORT;
2675 goto out;
2676 }
2677 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2678 error = EINVAL;
2679 goto out;
2680 }
2681 resid = uio_array_resid(uioarray, uiocnt);
2682
2683 /*
2684 * In theory resid should be unsigned.
2685 * However, space must be signed, as it might be less than 0
2686 * if we over-committed, and we must use a signed comparison
2687 * of space and resid. On the other hand, a negative resid
2688 * causes us to loop sending 0-length segments to the protocol.
2689 *
2690 * Note: We limit resid to be a positive int value as we use
2691 * imin() to set bytes_to_copy -- radr://14558484
2692 */
2693 if (resid < 0 || resid > INT_MAX) {
2694 error = EINVAL;
2695 goto out;
2696 }
2697
2698 socket_lock(so, 1);
2699 so_update_last_owner_locked(so, p);
2700 so_update_policy(so);
2701
2702 #if NECP
2703 so_update_necp_policy(so, NULL, NULL);
2704 #endif /* NECP */
2705
2706 dontroute = (flags & MSG_DONTROUTE) &&
2707 (so->so_options & SO_DONTROUTE) == 0 &&
2708 (so->so_proto->pr_flags & PR_ATOMIC);
2709 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2710
2711 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2712 &sblocked, NULL);
2713 if (error) {
2714 goto release;
2715 }
2716
2717 /*
2718 * Use big 4 KB clusters when the outgoing interface does not prefer
2719 * 2 KB clusters
2720 */
2721 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2722
2723 if (soreserveheadroom != 0) {
2724 headroom = so->so_pktheadroom;
2725 }
2726
2727 do {
2728 int i;
2729 int num_needed = 0;
2730 int chainlength;
2731 size_t maxpktlen = 0;
2732 int bytes_to_alloc;
2733
2734 if (sosendminchain > 0) {
2735 chainlength = 0;
2736 } else {
2737 chainlength = sosendmaxchain;
2738 }
2739
2740 socket_unlock(so, 0);
2741
2742 /*
2743 * Find a set of uios that fits in a reasonable number
2744 * of mbuf packets
2745 */
2746 for (i = uiofirst; i < uiocnt; i++) {
2747 struct uio *auio = uioarray[i];
2748
2749 len = uio_resid(auio);
2750
2751 /* Do nothing for empty messages */
2752 if (len == 0) {
2753 continue;
2754 }
2755
2756 num_needed += 1;
2757 uiolast += 1;
2758
2759 if (len > maxpktlen) {
2760 maxpktlen = len;
2761 }
2762
2763 chainlength += len;
2764 if (chainlength > sosendmaxchain) {
2765 break;
2766 }
2767 }
2768 /*
2769 * Nothing left to send
2770 */
2771 if (num_needed == 0) {
2772 socket_lock(so, 0);
2773 break;
2774 }
2775 /*
2776 * Allocate a buffer large enough to include headroom space for
2777 * the network and link headers
2778 *
2779 */
2780 bytes_to_alloc = maxpktlen + headroom;
2781
2782 /*
2783 * Allocate a single contiguous buffer of the smallest available
2784 * size when possible
2785 */
2786 if (bytes_to_alloc > MCLBYTES &&
2787 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2788 freelist = m_getpackets_internal(
2789 (unsigned int *)&num_needed,
2790 num_needed, M_WAIT, 1,
2791 MBIGCLBYTES);
2792 } else if (bytes_to_alloc > _MHLEN &&
2793 bytes_to_alloc <= MCLBYTES) {
2794 freelist = m_getpackets_internal(
2795 (unsigned int *)&num_needed,
2796 num_needed, M_WAIT, 1,
2797 MCLBYTES);
2798 } else {
2799 freelist = m_allocpacket_internal(
2800 (unsigned int *)&num_needed,
2801 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2802 }
2803
2804 if (freelist == NULL) {
2805 socket_lock(so, 0);
2806 error = ENOMEM;
2807 goto release;
2808 }
2809 /*
2810 * Copy each uio of the set into its own mbuf packet
2811 */
2812 for (i = uiofirst, m = freelist;
2813 i < uiolast && m != NULL;
2814 i++) {
2815 int bytes_to_copy;
2816 struct mbuf *n;
2817 struct uio *auio = uioarray[i];
2818
2819 bytes_to_copy = uio_resid(auio);
2820
2821 /* Do nothing for empty messages */
2822 if (bytes_to_copy == 0) {
2823 continue;
2824 }
2825 /*
2826 * Leave headroom for protocol headers
2827 * in the first mbuf of the chain
2828 */
2829 m->m_data += headroom;
2830
2831 for (n = m; n != NULL; n = n->m_next) {
2832 if ((m->m_flags & M_EXT)) {
2833 mlen = m->m_ext.ext_size -
2834 M_LEADINGSPACE(m);
2835 } else if ((m->m_flags & M_PKTHDR)) {
2836 mlen =
2837 MHLEN - M_LEADINGSPACE(m);
2838 } else {
2839 mlen = MLEN - M_LEADINGSPACE(m);
2840 }
2841 len = imin(mlen, bytes_to_copy);
2842
2843 /*
2844 * Note: uiomove() decrements the iovec
2845 * length
2846 */
2847 error = uiomove(mtod(n, caddr_t),
2848 len, auio);
2849 if (error != 0) {
2850 break;
2851 }
2852 n->m_len = len;
2853 m->m_pkthdr.len += len;
2854
2855 VERIFY(m->m_pkthdr.len <= maxpktlen);
2856
2857 bytes_to_copy -= len;
2858 resid -= len;
2859 }
2860 if (m->m_pkthdr.len == 0) {
2861 printf(
2862 "%s:%d so %llx pkt %llx type %u len null\n",
2863 __func__, __LINE__,
2864 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2865 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2866 m->m_type);
2867 }
2868 if (error != 0) {
2869 break;
2870 }
2871 m = m->m_nextpkt;
2872 }
2873
2874 socket_lock(so, 0);
2875
2876 if (error) {
2877 goto release;
2878 }
2879 top = freelist;
2880 freelist = NULL;
2881
2882 if (dontroute) {
2883 so->so_options |= SO_DONTROUTE;
2884 }
2885
2886 if ((flags & MSG_SKIPCFIL) == 0) {
2887 struct mbuf **prevnextp = NULL;
2888
2889 for (i = uiofirst, m = top;
2890 i < uiolast && m != NULL;
2891 i++) {
2892 struct mbuf *nextpkt = m->m_nextpkt;
2893
2894 /*
2895 * Socket filter processing
2896 */
2897 error = sflt_data_out(so, NULL, &m,
2898 NULL, 0);
2899 if (error != 0 && error != EJUSTRETURN) {
2900 goto release;
2901 }
2902
2903 #if CONTENT_FILTER
2904 if (error == 0) {
2905 /*
2906 * Content filter processing
2907 */
2908 error = cfil_sock_data_out(so, NULL, m,
2909 NULL, 0);
2910 if (error != 0 && error != EJUSTRETURN) {
2911 goto release;
2912 }
2913 }
2914 #endif /* CONTENT_FILTER */
2915 /*
2916 * Remove packet from the list when
2917 * swallowed by a filter
2918 */
2919 if (error == EJUSTRETURN) {
2920 error = 0;
2921 if (prevnextp != NULL) {
2922 *prevnextp = nextpkt;
2923 } else {
2924 top = nextpkt;
2925 }
2926 }
2927
2928 m = nextpkt;
2929 if (m != NULL) {
2930 prevnextp = &m->m_nextpkt;
2931 }
2932 }
2933 }
2934 if (top != NULL) {
2935 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2936 (so, 0, top, NULL, NULL, p);
2937 }
2938
2939 if (dontroute) {
2940 so->so_options &= ~SO_DONTROUTE;
2941 }
2942
2943 top = NULL;
2944 uiofirst = uiolast;
2945 } while (resid > 0 && error == 0);
2946 release:
2947 if (sblocked) {
2948 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2949 } else {
2950 socket_unlock(so, 1);
2951 }
2952 out:
2953 if (top != NULL) {
2954 m_freem(top);
2955 }
2956 if (freelist != NULL) {
2957 m_freem_list(freelist);
2958 }
2959
2960 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2961 so->so_snd.sb_cc, 0, error);
2962
2963 return error;
2964 }
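
/*
 * A minimal sketch of a hypothetical in-kernel caller of sosend_list():
 * each uio in the array describes exactly one datagram, the socket must
 * already be connected, and the protocol must provide pru_send_list.
 */
#if 0	/* example only -- not compiled */
	uio_t uios[2];	/* pkt0/pkt1 and their lengths are hypothetical */
	int error;

	uios[0] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(uios[0], (user_addr_t)(uintptr_t)pkt0, pkt0_len);
	uios[1] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(uios[1], (user_addr_t)(uintptr_t)pkt1, pkt1_len);

	error = sosend_list(so, uios, 2, MSG_DONTWAIT);

	uio_free(uios[0]);
	uio_free(uios[1]);
#endif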
2965
2966 /*
2967 * May return ERESTART when a packet is dropped by the MAC policy check
2968 */
2969 static int
2970 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2971 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2972 {
2973 int error = 0;
2974 struct mbuf *m = *mp;
2975 struct mbuf *nextrecord = *nextrecordp;
2976
2977 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2978 #if CONFIG_MACF_SOCKET_SUBSET
2979 /*
2980 * Call the MAC framework for policy checking if we're in
2981 * the user process context and the socket isn't connected.
2982 */
2983 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2984 struct mbuf *m0 = m;
2985 /*
2986 * Dequeue this record (temporarily) from the receive
2987 * list since we're about to drop the socket's lock
2988 * where a new record may arrive and be appended to
2989 * the list. Upon MAC policy failure, the record
2990 * will be freed. Otherwise, we'll add it back to
2991 * the head of the list. We cannot rely on SB_LOCK
2992 * because append operation uses the socket's lock.
2993 */
2994 do {
2995 m->m_nextpkt = NULL;
2996 sbfree(&so->so_rcv, m);
2997 m = m->m_next;
2998 } while (m != NULL);
2999 m = m0;
3000 so->so_rcv.sb_mb = nextrecord;
3001 SB_EMPTY_FIXUP(&so->so_rcv);
3002 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3003 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3004 socket_unlock(so, 0);
3005
3006 if (mac_socket_check_received(proc_ucred(p), so,
3007 mtod(m, struct sockaddr *)) != 0) {
3008 /*
3009 * MAC policy failure; free this record and
3010 * process the next record (or block until
3011 * one is available). We have adjusted sb_cc
3012 * and sb_mbcnt above so there is no need to
3013 * call sbfree() again.
3014 */
3015 m_freem(m);
3016 /*
3017 * Clear SB_LOCK but don't unlock the socket.
3018 * Process the next record or wait for one.
3019 */
3020 socket_lock(so, 0);
3021 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3022 error = ERESTART;
3023 goto done;
3024 }
3025 socket_lock(so, 0);
3026 /*
3027 * If the socket has been defunct'd, drop it.
3028 */
3029 if (so->so_flags & SOF_DEFUNCT) {
3030 m_freem(m);
3031 error = ENOTCONN;
3032 goto done;
3033 }
3034 /*
3035 * Re-adjust the socket receive list and re-enqueue
3036 * the record in front of any packets which may have
3037 * been appended while we dropped the lock.
3038 */
3039 for (m = m0; m->m_next != NULL; m = m->m_next) {
3040 sballoc(&so->so_rcv, m);
3041 }
3042 sballoc(&so->so_rcv, m);
3043 if (so->so_rcv.sb_mb == NULL) {
3044 so->so_rcv.sb_lastrecord = m0;
3045 so->so_rcv.sb_mbtail = m;
3046 }
3047 m = m0;
3048 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3049 so->so_rcv.sb_mb = m;
3050 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3051 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3052 }
3053 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3054 if (psa != NULL) {
3055 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3056 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3057 error = EWOULDBLOCK;
3058 goto done;
3059 }
3060 }
3061 if (flags & MSG_PEEK) {
3062 m = m->m_next;
3063 } else {
3064 sbfree(&so->so_rcv, m);
3065 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3066 panic("%s: about to create invalid socketbuf",
3067 __func__);
3068 /* NOTREACHED */
3069 }
3070 MFREE(m, so->so_rcv.sb_mb);
3071 m = so->so_rcv.sb_mb;
3072 if (m != NULL) {
3073 m->m_nextpkt = nextrecord;
3074 } else {
3075 so->so_rcv.sb_mb = nextrecord;
3076 SB_EMPTY_FIXUP(&so->so_rcv);
3077 }
3078 }
3079 done:
3080 *mp = m;
3081 *nextrecordp = nextrecord;
3082
3083 return error;
3084 }
3085
3086 /*
3087 * Process one or more MT_CONTROL mbufs present before any data mbufs
3088 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3089 * just copy the data; if !MSG_PEEK, we call into the protocol to
3090 * perform externalization.
3091 */
3092 static int
3093 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3094 struct mbuf **mp, struct mbuf **nextrecordp)
3095 {
3096 int error = 0;
3097 struct mbuf *cm = NULL, *cmn;
3098 struct mbuf **cme = &cm;
3099 struct sockbuf *sb_rcv = &so->so_rcv;
3100 struct mbuf **msgpcm = NULL;
3101 struct mbuf *m = *mp;
3102 struct mbuf *nextrecord = *nextrecordp;
3103 struct protosw *pr = so->so_proto;
3104
3105 /*
3106 * Externalizing the control messages would require us to
3107 * drop the socket's lock below. Once we re-acquire the
3108 * lock, the mbuf chain might change. In order to preserve
3109 * consistency, we unlink all control messages from the
3110 * first mbuf chain in one shot and link them separately
3111 * onto a different chain.
3112 */
3113 do {
3114 if (flags & MSG_PEEK) {
3115 if (controlp != NULL) {
3116 if (*controlp == NULL) {
3117 msgpcm = controlp;
3118 }
3119 *controlp = m_copy(m, 0, m->m_len);
3120
3121 /*
3122 * If we failed to allocate an mbuf,
3123 * release any previously allocated
3124 * mbufs for control data. Return
3125 * an error. Keep the mbufs in the
3126 * socket as this is using
3127 * MSG_PEEK flag.
3128 */
3129 if (*controlp == NULL) {
3130 m_freem(*msgpcm);
3131 error = ENOBUFS;
3132 goto done;
3133 }
3134 controlp = &(*controlp)->m_next;
3135 }
3136 m = m->m_next;
3137 } else {
3138 m->m_nextpkt = NULL;
3139 sbfree(sb_rcv, m);
3140 sb_rcv->sb_mb = m->m_next;
3141 m->m_next = NULL;
3142 *cme = m;
3143 cme = &(*cme)->m_next;
3144 m = sb_rcv->sb_mb;
3145 }
3146 } while (m != NULL && m->m_type == MT_CONTROL);
3147
3148 if (!(flags & MSG_PEEK)) {
3149 if (sb_rcv->sb_mb != NULL) {
3150 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3151 } else {
3152 sb_rcv->sb_mb = nextrecord;
3153 SB_EMPTY_FIXUP(sb_rcv);
3154 }
3155 if (nextrecord == NULL) {
3156 sb_rcv->sb_lastrecord = m;
3157 }
3158 }
3159
3160 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3161 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3162
3163 while (cm != NULL) {
3164 int cmsg_type;
3165
3166 cmn = cm->m_next;
3167 cm->m_next = NULL;
3168 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3169
3170 /*
3171 * Call the protocol to externalize SCM_RIGHTS message
3172 * and return the modified message to the caller upon
3173 * success. Otherwise, all other control messages are
3174 * returned unmodified to the caller. Note that we
3175 * only get into this loop if MSG_PEEK is not set.
3176 */
3177 if (pr->pr_domain->dom_externalize != NULL &&
3178 cmsg_type == SCM_RIGHTS) {
3179 /*
3180 * Release socket lock: see 3903171. This
3181 * would also allow more records to be appended
3182 * to the socket buffer. We still have SB_LOCK
3183 * set on it, so we can be sure that the head
3184 * of the mbuf chain won't change.
3185 */
3186 socket_unlock(so, 0);
3187 error = (*pr->pr_domain->dom_externalize)(cm);
3188 socket_lock(so, 0);
3189 } else {
3190 error = 0;
3191 }
3192
3193 if (controlp != NULL && error == 0) {
3194 *controlp = cm;
3195 controlp = &(*controlp)->m_next;
3196 } else {
3197 (void) m_free(cm);
3198 }
3199 cm = cmn;
3200 }
3201 /*
3202 * Update the value of nextrecord in case we received new
3203 * records when the socket was unlocked above for
3204 * externalizing SCM_RIGHTS.
3205 */
3206 if (m != NULL) {
3207 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3208 } else {
3209 nextrecord = sb_rcv->sb_mb;
3210 }
3211
3212 done:
3213 *mp = m;
3214 *nextrecordp = nextrecord;
3215
3216 return error;
3217 }
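
/*
 * A hedged user-level view of the SCM_RIGHTS externalization performed
 * above: after dom_externalize runs, the control message delivered by
 * recvmsg(2) carries descriptors valid in the receiving process.
 * Variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	struct msghdr msg;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };	/* buf: hypothetical */
	char cbuf[CMSG_SPACE(sizeof(int))];

	bzero(&msg, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(s, &msg, 0) >= 0) {
		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
		if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS) {
			int fd;
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			/* fd is now a live descriptor in this process */
		}
	}
#endif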
3218
3219 /*
3220 * Implement receive operations on a socket.
3221 * We depend on the way that records are added to the sockbuf
3222 * by sbappend*. In particular, each record (mbufs linked through m_next)
3223 * must begin with an address if the protocol so specifies,
3224 * followed by an optional mbuf or mbufs containing ancillary data,
3225 * and then zero or more mbufs of data.
3226 * In order to avoid blocking network interrupts for the entire time here,
3227 * we splx() while doing the actual copy to user space.
3228 * Although the sockbuf is locked, new data may still be appended,
3229 * and thus we must maintain consistency of the sockbuf during that time.
3230 *
3231 * The caller may receive the data as a single mbuf chain by supplying
3232 * an mbuf **mp0 for use in returning the chain. The uio is then used
3233 * only for the count in uio_resid.
3234 *
3235 * Returns: 0 Success
3236 * ENOBUFS
3237 * ENOTCONN
3238 * EWOULDBLOCK
3239 * uiomove:EFAULT
3240 * sblock:EWOULDBLOCK
3241 * sblock:EINTR
3242 * sbwait:EBADF
3243 * sbwait:EINTR
3244 * sodelayed_copy:EFAULT
3245 * <pru_rcvoob>:EINVAL[TCP]
3246 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3247 * <pru_rcvoob>:???
3248 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3249 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3250 * <pr_domain->dom_externalize>:???
3251 *
3252 * Notes: Additional return values from calls through <pru_rcvoob> and
3253 * <pr_domain->dom_externalize> depend on protocols other than
3254 * TCP or AF_UNIX, which are documented above.
3255 */
3256 int
3257 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3258 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3259 {
3260 struct mbuf *m, **mp, *ml = NULL;
3261 struct mbuf *nextrecord, *free_list;
3262 int flags, error, offset;
3263 user_ssize_t len;
3264 struct protosw *pr = so->so_proto;
3265 int moff, type = 0;
3266 user_ssize_t orig_resid = uio_resid(uio);
3267 user_ssize_t delayed_copy_len;
3268 int can_delay;
3269 int need_event;
3270 struct proc *p = current_proc();
3271 boolean_t en_tracing = FALSE;
3272
3273 /*
3274 * Sanity check on the length passed by caller as we are making 'int'
3275 * comparisons
3276 */
3277 if (orig_resid < 0 || orig_resid > INT_MAX) {
3278 return EINVAL;
3279 }
3280
3281 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3282 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3283 so->so_rcv.sb_hiwat);
3284
3285 socket_lock(so, 1);
3286 so_update_last_owner_locked(so, p);
3287 so_update_policy(so);
3288
3289 #ifdef MORE_LOCKING_DEBUG
3290 if (so->so_usecount == 1) {
3291 panic("%s: so=%x no other reference on socket\n", __func__, so);
3292 /* NOTREACHED */
3293 }
3294 #endif
3295 mp = mp0;
3296 if (psa != NULL) {
3297 *psa = NULL;
3298 }
3299 if (controlp != NULL) {
3300 *controlp = NULL;
3301 }
3302 if (flagsp != NULL) {
3303 flags = *flagsp & ~MSG_EOR;
3304 } else {
3305 flags = 0;
3306 }
3307
3308 /*
3309 * If a recv attempt is made on a previously-accepted socket
3310 * that has been marked as inactive (disconnected), reject
3311 * the request.
3312 */
3313 if (so->so_flags & SOF_DEFUNCT) {
3314 struct sockbuf *sb = &so->so_rcv;
3315
3316 error = ENOTCONN;
3317 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3318 __func__, proc_pid(p), proc_best_name(p),
3319 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3320 SOCK_DOM(so), SOCK_TYPE(so), error);
3321 /*
3322 * This socket should have been disconnected and flushed
3323 * prior to being returned from sodefunct(); there should
3324 * be no data on its receive list, so panic otherwise.
3325 */
3326 if (so->so_state & SS_DEFUNCT) {
3327 sb_empty_assert(sb, __func__);
3328 }
3329 socket_unlock(so, 1);
3330 return error;
3331 }
3332
3333 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3334 pr->pr_usrreqs->pru_preconnect) {
3335 /*
3336 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3337 * call write() right after this. *If* the app calls a read,
3338 * we do not want to block this read indefinitely. Thus,
3339 * we trigger a connect so that the session gets initiated.
3340 */
3341 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3342
3343 if (error) {
3344 socket_unlock(so, 1);
3345 return error;
3346 }
3347 }
3348
3349 if (ENTR_SHOULDTRACE &&
3350 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3351 /*
3352 * enable energy tracing for inet sockets that go over
3353 * non-loopback interfaces only.
3354 */
3355 struct inpcb *inp = sotoinpcb(so);
3356 if (inp->inp_last_outifp != NULL &&
3357 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3358 en_tracing = TRUE;
3359 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3360 VM_KERNEL_ADDRPERM(so),
3361 ((so->so_state & SS_NBIO) ?
3362 kEnTrFlagNonBlocking : 0),
3363 (int64_t)orig_resid);
3364 }
3365 }
3366
3367 /*
3368 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3369 * regardless of the flags argument. Here is the case where
3370 * out-of-band data is not inline.
3371 */
3372 if ((flags & MSG_OOB) ||
3373 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3374 (so->so_options & SO_OOBINLINE) == 0 &&
3375 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3376 m = m_get(M_WAIT, MT_DATA);
3377 if (m == NULL) {
3378 socket_unlock(so, 1);
3379 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3380 ENOBUFS, 0, 0, 0, 0);
3381 return ENOBUFS;
3382 }
3383 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3384 if (error) {
3385 goto bad;
3386 }
3387 socket_unlock(so, 0);
3388 do {
3389 error = uiomove(mtod(m, caddr_t),
3390 imin(uio_resid(uio), m->m_len), uio);
3391 m = m_free(m);
3392 } while (uio_resid(uio) && error == 0 && m != NULL);
3393 socket_lock(so, 0);
3394 bad:
3395 if (m != NULL) {
3396 m_freem(m);
3397 }
3398
3399 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3400 if (error == EWOULDBLOCK || error == EINVAL) {
3401 /*
3402 * Let's try to get normal data:
3403 * EWOULDBLOCK: out-of-band data not
3404 * received yet. EINVAL: out-of-band data
3405 * already read.
3406 */
3407 error = 0;
3408 goto nooob;
3409 } else if (error == 0 && flagsp != NULL) {
3410 *flagsp |= MSG_OOB;
3411 }
3412 }
3413 socket_unlock(so, 1);
3414 if (en_tracing) {
3415 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3416 VM_KERNEL_ADDRPERM(so), 0,
3417 (int64_t)(orig_resid - uio_resid(uio)));
3418 }
3419 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3420 0, 0, 0, 0);
3421
3422 return error;
3423 }
3424 nooob:
3425 if (mp != NULL) {
3426 *mp = NULL;
3427 }
3428
3429 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3430 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3431 }
3432
3433 free_list = NULL;
3434 delayed_copy_len = 0;
3435 restart:
3436 #ifdef MORE_LOCKING_DEBUG
3437 if (so->so_usecount <= 1) {
3438 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3439 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3440 }
3441 #endif
3442 /*
3443 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3444 * and if so just return to the caller. This could happen when
3445 * soreceive() is called by a socket upcall function during the
3446 * time the socket is freed. The socket buffer would have been
3447 * locked across the upcall, therefore we cannot put this thread
3448 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3449 * we may livelock), because the lock on the socket buffer will
3450 * only be released when the upcall routine returns to its caller.
3451 * Because the socket has been officially closed, there can be
3452 * no further read on it.
3453 *
3454 * A multipath subflow socket would have its SS_NOFDREF set by
3455 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3456 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3457 */
3458 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3459 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3460 socket_unlock(so, 1);
3461 return 0;
3462 }
3463
3464 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3465 if (error) {
3466 socket_unlock(so, 1);
3467 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3468 0, 0, 0, 0);
3469 if (en_tracing) {
3470 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3471 VM_KERNEL_ADDRPERM(so), 0,
3472 (int64_t)(orig_resid - uio_resid(uio)));
3473 }
3474 return error;
3475 }
3476
3477 m = so->so_rcv.sb_mb;
3478 /*
3479 * If we have less data than requested, block awaiting more
3480 * (subject to any timeout) if:
3481 * 1. the current count is less than the low water mark, or
3482 * 2. MSG_WAITALL is set, and it is possible to do the entire
3483 * receive operation at once if we block (resid <= hiwat).
3484 * 3. MSG_DONTWAIT is not set
3485 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3486 * we have to do the receive in sections, and thus risk returning
3487 * a short count if a timeout or signal occurs after we start.
3488 */
3489 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3490 so->so_rcv.sb_cc < uio_resid(uio)) &&
3491 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3492 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3493 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3494 /*
3495 * Panic if we notice inconsistencies in the socket's
3496 * receive list; both sb_mb and sb_cc should correctly
3497 * reflect the contents of the list, otherwise we may
3498 * end up with false positives during select() or poll()
3499 * which could put the application in a bad state.
3500 */
3501 SB_MB_CHECK(&so->so_rcv);
3502
3503 if (so->so_error) {
3504 if (m != NULL) {
3505 goto dontblock;
3506 }
3507 error = so->so_error;
3508 if ((flags & MSG_PEEK) == 0) {
3509 so->so_error = 0;
3510 }
3511 goto release;
3512 }
3513 if (so->so_state & SS_CANTRCVMORE) {
3514 #if CONTENT_FILTER
3515 /*
3516 * Deal with half closed connections
3517 */
3518 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3519 cfil_sock_data_pending(&so->so_rcv) != 0) {
3520 CFIL_LOG(LOG_INFO,
3521 "so %llx ignore SS_CANTRCVMORE",
3522 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3523 } else
3524 #endif /* CONTENT_FILTER */
3525 if (m != NULL) {
3526 goto dontblock;
3527 } else {
3528 goto release;
3529 }
3530 }
3531 for (; m != NULL; m = m->m_next) {
3532 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3533 m = so->so_rcv.sb_mb;
3534 goto dontblock;
3535 }
3536 }
3537 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3538 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3539 error = ENOTCONN;
3540 goto release;
3541 }
3542 if (uio_resid(uio) == 0) {
3543 goto release;
3544 }
3545
3546 if ((so->so_state & SS_NBIO) ||
3547 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3548 error = EWOULDBLOCK;
3549 goto release;
3550 }
3551 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3552 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3553 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3554 #if EVEN_MORE_LOCKING_DEBUG
3555 if (socket_debug) {
3556 printf("Waiting for socket data\n");
3557 }
3558 #endif
3559
3560 error = sbwait(&so->so_rcv);
3561 #if EVEN_MORE_LOCKING_DEBUG
3562 if (socket_debug) {
3563 printf("SORECEIVE - sbwait returned %d\n", error);
3564 }
3565 #endif
3566 if (so->so_usecount < 1) {
3567 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3568 __func__, so, so->so_usecount);
3569 /* NOTREACHED */
3570 }
3571 if (error) {
3572 socket_unlock(so, 1);
3573 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3574 0, 0, 0, 0);
3575 if (en_tracing) {
3576 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3577 VM_KERNEL_ADDRPERM(so), 0,
3578 (int64_t)(orig_resid - uio_resid(uio)));
3579 }
3580 return error;
3581 }
3582 goto restart;
3583 }
3584 dontblock:
3585 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3586 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3587 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3588 nextrecord = m->m_nextpkt;
3589
3590 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3591 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3592 mp0 == NULL);
3593 if (error == ERESTART) {
3594 goto restart;
3595 } else if (error != 0) {
3596 goto release;
3597 }
3598 orig_resid = 0;
3599 }
3600
3601 /*
3602 * Process one or more MT_CONTROL mbufs present before any data mbufs
3603 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3604 * just copy the data; if !MSG_PEEK, we call into the protocol to
3605 * perform externalization.
3606 */
3607 if (m != NULL && m->m_type == MT_CONTROL) {
3608 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3609 if (error != 0) {
3610 goto release;
3611 }
3612 orig_resid = 0;
3613 }
3614
3615 /*
3616 * If the socket is a TCP socket with message delivery
3617 * enabled, then create a control msg to deliver the
3618 * relative TCP sequence number for this data. Waiting
3619 * until this point will protect against failures to
3620 * allocate an mbuf for control msgs.
3621 */
3622 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3623 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3624 struct mbuf *seq_cm;
3625
3626 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3627 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3628 if (seq_cm == NULL) {
3629 /* unable to allocate a control mbuf */
3630 error = ENOBUFS;
3631 goto release;
3632 }
3633 *controlp = seq_cm;
3634 controlp = &seq_cm->m_next;
3635 }
3636
3637 if (m != NULL) {
3638 if (!(flags & MSG_PEEK)) {
3639 /*
3640 * We get here because m points to an mbuf following
3641 * any MT_SONAME or MT_CONTROL mbufs which have been
3642 * processed above. In any case, m should be pointing
3643 * to the head of the mbuf chain, and the nextrecord
3644 * should be either NULL or equal to m->m_nextpkt.
3645 * See comments above about SB_LOCK.
3646 */
3647 if (m != so->so_rcv.sb_mb ||
3648 m->m_nextpkt != nextrecord) {
3649 panic("%s: post-control !sync so=%p m=%p "
3650 "nextrecord=%p\n", __func__, so, m,
3651 nextrecord);
3652 /* NOTREACHED */
3653 }
3654 if (nextrecord == NULL) {
3655 so->so_rcv.sb_lastrecord = m;
3656 }
3657 }
3658 type = m->m_type;
3659 if (type == MT_OOBDATA) {
3660 flags |= MSG_OOB;
3661 }
3662 } else {
3663 if (!(flags & MSG_PEEK)) {
3664 SB_EMPTY_FIXUP(&so->so_rcv);
3665 }
3666 }
3667 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3668 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3669
3670 moff = 0;
3671 offset = 0;
3672
3673 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3674 can_delay = 1;
3675 } else {
3676 can_delay = 0;
3677 }
3678
3679 need_event = 0;
3680
3681 while (m != NULL &&
3682 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3683 if (m->m_type == MT_OOBDATA) {
3684 if (type != MT_OOBDATA) {
3685 break;
3686 }
3687 } else if (type == MT_OOBDATA) {
3688 break;
3689 }
3690 /*
3691 * Make sure to always set the MSG_OOB flag when getting
3692 * out-of-band data inline.
3693 */
3694 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3695 (so->so_options & SO_OOBINLINE) != 0 &&
3696 (so->so_state & SS_RCVATMARK) != 0) {
3697 flags |= MSG_OOB;
3698 }
3699 so->so_state &= ~SS_RCVATMARK;
3700 len = uio_resid(uio) - delayed_copy_len;
3701 if (so->so_oobmark && len > so->so_oobmark - offset) {
3702 len = so->so_oobmark - offset;
3703 }
3704 if (len > m->m_len - moff) {
3705 len = m->m_len - moff;
3706 }
3707 /*
3708 * If mp is set, just pass back the mbufs.
3709 * Otherwise copy them out via the uio, then free.
3710 * Sockbuf must be consistent here (sb_mb points to the current mbuf,
3711 * and the record's m_nextpkt points to the next record) when we drop priority;
3712 * we must note any additions to the sockbuf when we
3713 * block interrupts again.
3714 */
3715 if (mp == NULL) {
3716 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3717 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3718 if (can_delay && len == m->m_len) {
3719 /*
3720 * Only delay the copy if we're consuming the
3721 * mbuf and we're NOT in MSG_PEEK mode
3722 * and we have enough data to make it worthwhile
3723 * to drop and retake the lock... can_delay
3724 * reflects the state of the 2 latter
3725 * constraints; moff should always be zero
3726 * in these cases.
3727 */
3728 delayed_copy_len += len;
3729 } else {
3730 if (delayed_copy_len) {
3731 error = sodelayed_copy(so, uio,
3732 &free_list, &delayed_copy_len);
3733
3734 if (error) {
3735 goto release;
3736 }
3737 /*
3738 * We can only get here if MSG_PEEK is not
3739 * set; therefore, m should point at the
3740 * head of the rcv queue.  If it doesn't,
3741 * it means something drastically
3742 * changed while we were out from behind
3743 * the lock in sodelayed_copy, perhaps
3744 * a RST on the stream.  In any event,
3745 * the stream has been interrupted.  It's
3746 * probably best just to return whatever
3747 * data we've moved and let the caller
3748 * sort it out...
3749 */
3750 if (m != so->so_rcv.sb_mb) {
3751 break;
3752 }
3753 }
3754 socket_unlock(so, 0);
3755 error = uiomove(mtod(m, caddr_t) + moff,
3756 (int)len, uio);
3757 socket_lock(so, 0);
3758
3759 if (error) {
3760 goto release;
3761 }
3762 }
3763 } else {
3764 uio_setresid(uio, (uio_resid(uio) - len));
3765 }
3766 if (len == m->m_len - moff) {
3767 if (m->m_flags & M_EOR) {
3768 flags |= MSG_EOR;
3769 }
3770 if (flags & MSG_PEEK) {
3771 m = m->m_next;
3772 moff = 0;
3773 } else {
3774 nextrecord = m->m_nextpkt;
3775 sbfree(&so->so_rcv, m);
3776 m->m_nextpkt = NULL;
3777
3778 /*
3779 * If this packet is an unordered packet
3780 * (indicated by M_UNORDERED_DATA flag), remove
3781 * the additional bytes added to the
3782 * receive socket buffer size.
3783 */
3784 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3785 m->m_len &&
3786 (m->m_flags & M_UNORDERED_DATA) &&
3787 sbreserve(&so->so_rcv,
3788 so->so_rcv.sb_hiwat - m->m_len)) {
3789 if (so->so_msg_state->msg_uno_bytes >
3790 m->m_len) {
3791 so->so_msg_state->
3792 msg_uno_bytes -= m->m_len;
3793 } else {
3794 so->so_msg_state->
3795 msg_uno_bytes = 0;
3796 }
3797 m->m_flags &= ~M_UNORDERED_DATA;
3798 }
3799
3800 if (mp != NULL) {
3801 *mp = m;
3802 mp = &m->m_next;
3803 so->so_rcv.sb_mb = m = m->m_next;
3804 *mp = NULL;
3805 } else {
3806 if (free_list == NULL) {
3807 free_list = m;
3808 } else {
3809 ml->m_next = m;
3810 }
3811 ml = m;
3812 so->so_rcv.sb_mb = m = m->m_next;
3813 ml->m_next = NULL;
3814 }
3815 if (m != NULL) {
3816 m->m_nextpkt = nextrecord;
3817 if (nextrecord == NULL) {
3818 so->so_rcv.sb_lastrecord = m;
3819 }
3820 } else {
3821 so->so_rcv.sb_mb = nextrecord;
3822 SB_EMPTY_FIXUP(&so->so_rcv);
3823 }
3824 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3825 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 }
3827 } else {
3828 if (flags & MSG_PEEK) {
3829 moff += len;
3830 } else {
3831 if (mp != NULL) {
3832 int copy_flag;
3833
3834 if (flags & MSG_DONTWAIT) {
3835 copy_flag = M_DONTWAIT;
3836 } else {
3837 copy_flag = M_WAIT;
3838 }
3839 *mp = m_copym(m, 0, len, copy_flag);
3840 /*
3841 * Failed to allocate an mbuf?
3842 * Adjust uio_resid back, it was
3843 * adjusted down by len bytes which
3844 * we didn't copy over.
3845 */
3846 if (*mp == NULL) {
3847 uio_setresid(uio,
3848 (uio_resid(uio) + len));
3849 break;
3850 }
3851 }
3852 m->m_data += len;
3853 m->m_len -= len;
3854 so->so_rcv.sb_cc -= len;
3855 }
3856 }
3857 if (so->so_oobmark) {
3858 if ((flags & MSG_PEEK) == 0) {
3859 so->so_oobmark -= len;
3860 if (so->so_oobmark == 0) {
3861 so->so_state |= SS_RCVATMARK;
3862 /*
3863 * delay posting the actual event until
3864 * after any delayed copy processing
3865 * has finished
3866 */
3867 need_event = 1;
3868 break;
3869 }
3870 } else {
3871 offset += len;
3872 if (offset == so->so_oobmark) {
3873 break;
3874 }
3875 }
3876 }
3877 if (flags & MSG_EOR) {
3878 break;
3879 }
3880 /*
3881 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3882 * (for non-atomic socket), we must not quit until
3883 * "uio->uio_resid == 0" or an error termination.
3884 * If a signal/timeout occurs, return with a short
3885 * count but without error. Keep sockbuf locked
3886 * against other readers.
3887 */
3888 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3889 (uio_resid(uio) - delayed_copy_len) > 0 &&
3890 !sosendallatonce(so) && !nextrecord) {
3891 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3892 #if CONTENT_FILTER
3893 && cfil_sock_data_pending(&so->so_rcv) == 0
3894 #endif /* CONTENT_FILTER */
3895 )) {
3896 goto release;
3897 }
3898
3899 /*
3900 * Depending on the protocol (e.g. TCP), the following
3901 * might cause the socket lock to be dropped and later
3902 * be reacquired, and more data could have arrived and
3903 * have been appended to the receive socket buffer by
3904 * the time it returns. Therefore, we sleep in
3905 * sbwait() below if and only if the socket buffer is
3906 * empty, in order to avoid a false sleep.
3907 */
3908 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3909 (((struct inpcb *)so->so_pcb)->inp_state !=
3910 INPCB_STATE_DEAD)) {
3911 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3912 }
3913
3914 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3915 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3916
3917 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3918 error = 0;
3919 goto release;
3920 }
3921 /*
3922 * We have to wait until after we get back from the sbwait
3923 * to do the copy, because we will drop the lock if we
3924 * have enough data that has been delayed... by dropping
3925 * the lock we open up a window allowing the netisr
3926 * thread to process the incoming packets and to change
3927 * the state of this socket... we're issuing the sbwait
3928 * because the socket is empty and we're expecting the
3929 * netisr thread to wake us up when more packets arrive;
3930 * if we allow that processing to happen and then sbwait,
3931 * we could stall forever with packets sitting in the
3932 * socket if no further packets arrive from the remote
3933 * side.
3934 *
3935 * We want to copy before we've collected all the data
3936 * to satisfy this request, to allow the copy to overlap
3937 * the incoming packet processing on an MP system.
3938 */
3939 if (delayed_copy_len > sorecvmincopy &&
3940 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3941 error = sodelayed_copy(so, uio,
3942 &free_list, &delayed_copy_len);
3943
3944 if (error) {
3945 goto release;
3946 }
3947 }
3948 m = so->so_rcv.sb_mb;
3949 if (m != NULL) {
3950 nextrecord = m->m_nextpkt;
3951 }
3952 SB_MB_CHECK(&so->so_rcv);
3953 }
3954 }
3955 #ifdef MORE_LOCKING_DEBUG
3956 if (so->so_usecount <= 1) {
3957 panic("%s: after big while so=%p ref=%d on socket\n",
3958 __func__, so, so->so_usecount);
3959 /* NOTREACHED */
3960 }
3961 #endif
3962
3963 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3964 if (so->so_options & SO_DONTTRUNC) {
3965 flags |= MSG_RCVMORE;
3966 } else {
3967 flags |= MSG_TRUNC;
3968 if ((flags & MSG_PEEK) == 0) {
3969 (void) sbdroprecord(&so->so_rcv);
3970 }
3971 }
3972 }
3973
3974 /*
3975 * pru_rcvd below (for TCP) may cause more data to be received
3976 * if the socket lock is dropped prior to sending the ACK; some
3977 * legacy OpenTransport applications don't handle this well
3978 * (if they receive less data than requested while MSG_HAVEMORE
3979 * is set), and so we set the flag now based on what we know
3980 * prior to calling pru_rcvd.
3981 */
3982 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3983 flags |= MSG_HAVEMORE;
3984 }
3985
3986 if ((flags & MSG_PEEK) == 0) {
3987 if (m == NULL) {
3988 so->so_rcv.sb_mb = nextrecord;
3989 /*
3990 * First part is an inline SB_EMPTY_FIXUP(). Second
3991 * part makes sure sb_lastrecord is up-to-date if
3992 * there is still data in the socket buffer.
3993 */
3994 if (so->so_rcv.sb_mb == NULL) {
3995 so->so_rcv.sb_mbtail = NULL;
3996 so->so_rcv.sb_lastrecord = NULL;
3997 } else if (nextrecord->m_nextpkt == NULL) {
3998 so->so_rcv.sb_lastrecord = nextrecord;
3999 }
4000 SB_MB_CHECK(&so->so_rcv);
4001 }
4002 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4003 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4004 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4005 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4006 }
4007 }
4008
4009 if (delayed_copy_len) {
4010 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4011 if (error) {
4012 goto release;
4013 }
4014 }
4015 if (free_list != NULL) {
4016 m_freem_list(free_list);
4017 free_list = NULL;
4018 }
4019 if (need_event) {
4020 postevent(so, 0, EV_OOB);
4021 }
4022
4023 if (orig_resid == uio_resid(uio) && orig_resid &&
4024 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4025 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4026 goto restart;
4027 }
4028
4029 if (flagsp != NULL) {
4030 *flagsp |= flags;
4031 }
4032 release:
4033 #ifdef MORE_LOCKING_DEBUG
4034 if (so->so_usecount <= 1) {
4035 panic("%s: release so=%p ref=%d on socket\n", __func__,
4036 so, so->so_usecount);
4037 /* NOTREACHED */
4038 }
4039 #endif
4040 if (delayed_copy_len) {
4041 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4042 }
4043
4044 if (free_list != NULL) {
4045 m_freem_list(free_list);
4046 }
4047
4048 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4049
4050 if (en_tracing) {
4051 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4052 VM_KERNEL_ADDRPERM(so),
4053 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4054 (int64_t)(orig_resid - uio_resid(uio)));
4055 }
4056 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4057 so->so_rcv.sb_cc, 0, error);
4058
4059 return error;
4060 }
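
/*
 * Illustrative sketch, for reference only (userspace, not compiled here):
 * a minimal view of the MSG_WAITALL behavior described in the
 * MSG_WAITALL/MSG_WAITSTREAM comment inside soreceive() above.  With
 * MSG_WAITALL on a stream socket, soreceive() keeps the receive sockbuf
 * locked and loops until the request is satisfied; a signal, a receive
 * timeout, or connection teardown can still return a short count without
 * an error.  The descriptor `fd` is an assumption of the example.
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t
 *	read_exactly(int fd, void *buf, size_t len)
 *	{
 *		ssize_t n = recv(fd, buf, len, MSG_WAITALL);
 *		if (n < 0)
 *			return -1;	// so_error, ENOTCONN, EINTR, ...
 *		// n < len means a signal/timeout or EOF cut the wait short
 *		return n;
 *	}
 */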
4061
4062 /*
4063 * Returns: 0 Success
4064 * uiomove:EFAULT
4065 */
4066 static int
4067 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4068 user_ssize_t *resid)
4069 {
4070 int error = 0;
4071 struct mbuf *m;
4072
4073 m = *free_list;
4074
4075 socket_unlock(so, 0);
4076
4077 while (m != NULL && error == 0) {
4078 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4079 m = m->m_next;
4080 }
4081 m_freem_list(*free_list);
4082
4083 *free_list = NULL;
4084 *resid = 0;
4085
4086 socket_lock(so, 0);
4087
4088 return error;
4089 }
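
/*
 * Illustrative sketch, for reference only: the delayed-copy pattern that
 * soreceive() uses with the helper above.  Fully-consumed mbufs are
 * unlinked onto a local free_list and delayed_copy_len is accumulated;
 * once enough data is pending, the socket lock is dropped exactly once,
 * the whole chain is moved out with uiomove(), freed, and the lock is
 * retaken.  The trigger used by soreceive() above is:
 *
 *	if (delayed_copy_len > sorecvmincopy &&
 *	    delayed_copy_len > (so->so_rcv.sb_hiwat / 2)) {
 *		error = sodelayed_copy(so, uio, &free_list,
 *		    &delayed_copy_len);
 *	}
 */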
4090
4091 static int
4092 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4093 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4094 {
4095 #pragma unused(so)
4096 int error = 0;
4097 struct mbuf *ml, *m;
4098 int i = 0;
4099 struct uio *auio;
4100
4101 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4102 ml = ml->m_nextpkt, i++) {
4103 auio = msgarray[i].uio;
4104 for (m = ml; m != NULL; m = m->m_next) {
4105 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4106 if (error != 0) {
4107 goto out;
4108 }
4109 }
4110 }
4111 out:
4112 m_freem_list(*free_list);
4113
4114 *free_list = NULL;
4115 *resid = 0;
4116
4117 return error;
4118 }
4119
4120 int
4121 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4122 int *flagsp)
4123 {
4124 struct mbuf *m;
4125 struct mbuf *nextrecord;
4126 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4127 int error;
4128 user_ssize_t len, pktlen, delayed_copy_len = 0;
4129 struct protosw *pr = so->so_proto;
4130 user_ssize_t resid;
4131 struct proc *p = current_proc();
4132 struct uio *auio = NULL;
4133 int npkts = 0;
4134 int sblocked = 0;
4135 struct sockaddr **psa = NULL;
4136 struct mbuf **controlp = NULL;
4137 int can_delay;
4138 int flags;
4139 struct mbuf *free_others = NULL;
4140
4141 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4142 so, uiocnt,
4143 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4144
4145 /*
4146 * Sanity checks:
4147 * - Only supports don't-wait flags
4148 * - Only supports datagram sockets (could be extended to raw)
4149 * - Must be atomic
4150 * - Protocol must support packet chains
4151 * - The uio array must not be NULL (should we panic?)
4152 */
4153 if (flagsp != NULL) {
4154 flags = *flagsp;
4155 } else {
4156 flags = 0;
4157 }
4158 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4159 MSG_NBIO)) {
4160 printf("%s invalid flags 0x%x\n", __func__, flags);
4161 error = EINVAL;
4162 goto out;
4163 }
4164 if (so->so_type != SOCK_DGRAM) {
4165 error = EINVAL;
4166 goto out;
4167 }
4168 if (sosendallatonce(so) == 0) {
4169 error = EINVAL;
4170 goto out;
4171 }
4172 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4173 error = EPROTONOSUPPORT;
4174 goto out;
4175 }
4176 if (msgarray == NULL) {
4177 printf("%s uioarray is NULL\n", __func__);
4178 error = EINVAL;
4179 goto out;
4180 }
4181 if (uiocnt == 0) {
4182 printf("%s uiocnt is 0\n", __func__);
4183 error = EINVAL;
4184 goto out;
4185 }
4186 /*
4187 * Sanity check on the length passed by caller as we are making 'int'
4188 * comparisons
4189 */
4190 resid = recv_msg_array_resid(msgarray, uiocnt);
4191 if (resid < 0 || resid > INT_MAX) {
4192 error = EINVAL;
4193 goto out;
4194 }
4195
4196 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4197 can_delay = 1;
4198 } else {
4199 can_delay = 0;
4200 }
4201
4202 socket_lock(so, 1);
4203 so_update_last_owner_locked(so, p);
4204 so_update_policy(so);
4205
4206 #if NECP
4207 so_update_necp_policy(so, NULL, NULL);
4208 #endif /* NECP */
4209
4210 /*
4211 * If a recv attempt is made on a previously-accepted socket
4212 * that has been marked as inactive (disconnected), reject
4213 * the request.
4214 */
4215 if (so->so_flags & SOF_DEFUNCT) {
4216 struct sockbuf *sb = &so->so_rcv;
4217
4218 error = ENOTCONN;
4219 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4220 __func__, proc_pid(p), proc_best_name(p),
4221 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4222 SOCK_DOM(so), SOCK_TYPE(so), error);
4223 /*
4224 * This socket should have been disconnected and flushed
4225 * prior to being returned from sodefunct(); there should
4226 * be no data on its receive list, so panic otherwise.
4227 */
4228 if (so->so_state & SS_DEFUNCT) {
4229 sb_empty_assert(sb, __func__);
4230 }
4231 goto release;
4232 }
4233
4234 next:
4235 /*
4236 * The uio may be empty
4237 */
4238 if (npkts >= uiocnt) {
4239 error = 0;
4240 goto release;
4241 }
4242 restart:
4243 /*
4244 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4245 * and if so just return to the caller. This could happen when
4246 * soreceive() is called by a socket upcall function during the
4247 * time the socket is freed. The socket buffer would have been
4248 * locked across the upcall, therefore we cannot put this thread
4249 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4250 * we may livelock), because the lock on the socket buffer will
4251 * only be released when the upcall routine returns to its caller.
4252 * Because the socket has been officially closed, there can be
4253 * no further read on it.
4254 */
4255 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4256 (SS_NOFDREF | SS_CANTRCVMORE)) {
4257 error = 0;
4258 goto release;
4259 }
4260
4261 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4262 if (error) {
4263 goto release;
4264 }
4265 sblocked = 1;
4266
4267 m = so->so_rcv.sb_mb;
4268 /*
4269 * Block awaiting more datagrams if needed
4270 */
4271 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4272 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4273 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4274 /*
4275 * Panic if we notice inconsistencies in the socket's
4276 * receive list; both sb_mb and sb_cc should correctly
4277 * reflect the contents of the list, otherwise we may
4278 * end up with false positives during select() or poll()
4279 * which could put the application in a bad state.
4280 */
4281 SB_MB_CHECK(&so->so_rcv);
4282
4283 if (so->so_error) {
4284 error = so->so_error;
4285 if ((flags & MSG_PEEK) == 0) {
4286 so->so_error = 0;
4287 }
4288 goto release;
4289 }
4290 if (so->so_state & SS_CANTRCVMORE) {
4291 goto release;
4292 }
4293 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4294 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4295 error = ENOTCONN;
4296 goto release;
4297 }
4298 if ((so->so_state & SS_NBIO) ||
4299 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4300 error = EWOULDBLOCK;
4301 goto release;
4302 }
4303 /*
4304 * Do not block if we got some data
4305 */
4306 if (free_list != NULL) {
4307 error = 0;
4308 goto release;
4309 }
4310
4311 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4312 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4313
4314 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4315 sblocked = 0;
4316
4317 error = sbwait(&so->so_rcv);
4318 if (error) {
4319 goto release;
4320 }
4321 goto restart;
4322 }
4323
4324 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4325 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4326 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4327
4328 /*
4329 * Consume the current uio index as we have a datagram
4330 */
4331 auio = msgarray[npkts].uio;
4332 resid = uio_resid(auio);
4333 msgarray[npkts].which |= SOCK_MSG_DATA;
4334 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4335 &msgarray[npkts].psa : NULL;
4336 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4337 &msgarray[npkts].controlp : NULL;
4338 npkts += 1;
4339 nextrecord = m->m_nextpkt;
4340
4341 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4342 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4343 if (error == ERESTART) {
4344 goto restart;
4345 } else if (error != 0) {
4346 goto release;
4347 }
4348 }
4349
4350 if (m != NULL && m->m_type == MT_CONTROL) {
4351 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4352 if (error != 0) {
4353 goto release;
4354 }
4355 }
4356
4357 if (m->m_pkthdr.len == 0) {
4358 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4359 __func__, __LINE__,
4360 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4361 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4362 m->m_type);
4363 }
4364
4365 /*
4366 * Loop to copy the mbufs of the current record
4367 * Support zero length packets
4368 */
4369 ml = NULL;
4370 pktlen = 0;
4371 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4372 if (m->m_len == 0) {
4373 panic("%p m_len zero", m);
4374 }
4375 if (m->m_type == 0) {
4376 panic("%p m_type zero", m);
4377 }
4378 /*
4379 * Clip to the residual length
4380 */
4381 if (len > m->m_len) {
4382 len = m->m_len;
4383 }
4384 pktlen += len;
4385 /*
4386 * Copy the mbufs via the uio or delay the copy
4387 * The sockbuf must be consistent here (sb_mb points to the current
4388 * mbuf, nextrecord to the next record) when we drop priority;
4389 * we must note any additions to the sockbuf when we
4390 * block interrupts again.
4391 */
4392 if (len > 0 && can_delay == 0) {
4393 socket_unlock(so, 0);
4394 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4395 socket_lock(so, 0);
4396 if (error) {
4397 goto release;
4398 }
4399 } else {
4400 delayed_copy_len += len;
4401 }
4402
4403 if (len == m->m_len) {
4404 /*
4405 * m was entirely copied
4406 */
4407 sbfree(&so->so_rcv, m);
4408 nextrecord = m->m_nextpkt;
4409 m->m_nextpkt = NULL;
4410
4411 /*
4412 * Set the first packet to the head of the free list
4413 */
4414 if (free_list == NULL) {
4415 free_list = m;
4416 }
4417 /*
4418 * Link current packet to tail of free list
4419 */
4420 if (ml == NULL) {
4421 if (free_tail != NULL) {
4422 free_tail->m_nextpkt = m;
4423 }
4424 free_tail = m;
4425 }
4426 /*
4427 * Link current mbuf to last mbuf of current packet
4428 */
4429 if (ml != NULL) {
4430 ml->m_next = m;
4431 }
4432 ml = m;
4433
4434 /*
4435 * Move next buf to head of socket buffer
4436 */
4437 so->so_rcv.sb_mb = m = ml->m_next;
4438 ml->m_next = NULL;
4439
4440 if (m != NULL) {
4441 m->m_nextpkt = nextrecord;
4442 if (nextrecord == NULL) {
4443 so->so_rcv.sb_lastrecord = m;
4444 }
4445 } else {
4446 so->so_rcv.sb_mb = nextrecord;
4447 SB_EMPTY_FIXUP(&so->so_rcv);
4448 }
4449 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4450 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4451 } else {
4452 /*
4453 * Stop the loop on partial copy
4454 */
4455 break;
4456 }
4457 }
4458 #ifdef MORE_LOCKING_DEBUG
4459 if (so->so_usecount <= 1) {
4460 panic("%s: after big while so=%llx ref=%d on socket\n",
4461 __func__,
4462 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4463 /* NOTREACHED */
4464 }
4465 #endif
4466 /*
4467 * Tell the caller we made a partial copy
4468 */
4469 if (m != NULL) {
4470 if (so->so_options & SO_DONTTRUNC) {
4471 /*
4472 * Copy out the free list first, then the partial mbuf
4473 */
4474 socket_unlock(so, 0);
4475 if (delayed_copy_len) {
4476 error = sodelayed_copy_list(so, msgarray,
4477 uiocnt, &free_list, &delayed_copy_len);
4478 }
4479
4480 if (error == 0) {
4481 error = uiomove(mtod(m, caddr_t), (int)len,
4482 auio);
4483 }
4484 socket_lock(so, 0);
4485 if (error) {
4486 goto release;
4487 }
4488
4489 m->m_data += len;
4490 m->m_len -= len;
4491 so->so_rcv.sb_cc -= len;
4492 flags |= MSG_RCVMORE;
4493 } else {
4494 (void) sbdroprecord(&so->so_rcv);
4495 nextrecord = so->so_rcv.sb_mb;
4496 m = NULL;
4497 flags |= MSG_TRUNC;
4498 }
4499 }
4500
4501 if (m == NULL) {
4502 so->so_rcv.sb_mb = nextrecord;
4503 /*
4504 * First part is an inline SB_EMPTY_FIXUP(). Second
4505 * part makes sure sb_lastrecord is up-to-date if
4506 * there is still data in the socket buffer.
4507 */
4508 if (so->so_rcv.sb_mb == NULL) {
4509 so->so_rcv.sb_mbtail = NULL;
4510 so->so_rcv.sb_lastrecord = NULL;
4511 } else if (nextrecord->m_nextpkt == NULL) {
4512 so->so_rcv.sb_lastrecord = nextrecord;
4513 }
4514 SB_MB_CHECK(&so->so_rcv);
4515 }
4516 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4517 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4518
4519 /*
4520 * We can continue to the next packet as long as:
4521 * - We haven't exhausted the uio array
4522 * - There was no error
4523 * - A packet was not truncated
4524 * - We can still receive more data
4525 */
4526 if (npkts < uiocnt && error == 0 &&
4527 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4528 (so->so_state & SS_CANTRCVMORE) == 0) {
4529 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4530 sblocked = 0;
4531
4532 goto next;
4533 }
4534 if (flagsp != NULL) {
4535 *flagsp |= flags;
4536 }
4537
4538 release:
4539 /*
4540 * pru_rcvd may cause more data to be received if the socket lock
4541 * is dropped, so we set MSG_HAVEMORE now based on what we know.
4542 * That way the caller won't be surprised if it receives less data
4543 * than requested.
4544 */
4545 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4546 flags |= MSG_HAVEMORE;
4547 }
4548
4549 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4550 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4551 }
4552
4553 if (sblocked) {
4554 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4555 } else {
4556 socket_unlock(so, 1);
4557 }
4558
4559 if (delayed_copy_len) {
4560 error = sodelayed_copy_list(so, msgarray, uiocnt,
4561 &free_list, &delayed_copy_len);
4562 }
4563 out:
4564 /*
4565 * Amortize the cost of freeing the mbufs
4566 */
4567 if (free_list != NULL) {
4568 m_freem_list(free_list);
4569 }
4570 if (free_others != NULL) {
4571 m_freem_list(free_others);
4572 }
4573
4574 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4575 0, 0, 0, 0);
4576 return error;
4577 }
4578
4579 static int
4580 so_statistics_event_to_nstat_event(int64_t *input_options,
4581 uint64_t *nstat_event)
4582 {
4583 int error = 0;
4584 switch (*input_options) {
4585 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4586 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4587 break;
4588 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4589 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4590 break;
4591 #if (DEBUG || DEVELOPMENT)
4592 case SO_STATISTICS_EVENT_RESERVED_1:
4593 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4594 break;
4595 case SO_STATISTICS_EVENT_RESERVED_2:
4596 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4597 break;
4598 #endif /* (DEBUG || DEVELOPMENT) */
4599 default:
4600 error = EINVAL;
4601 break;
4602 }
4603 return error;
4604 }
4605
4606 /*
4607 * Returns: 0 Success
4608 * EINVAL
4609 * ENOTCONN
4610 * <pru_shutdown>:EINVAL
4611 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4612 * <pru_shutdown>:ENOBUFS[TCP]
4613 * <pru_shutdown>:EMSGSIZE[TCP]
4614 * <pru_shutdown>:EHOSTUNREACH[TCP]
4615 * <pru_shutdown>:ENETUNREACH[TCP]
4616 * <pru_shutdown>:ENETDOWN[TCP]
4617 * <pru_shutdown>:ENOMEM[TCP]
4618 * <pru_shutdown>:EACCES[TCP]
4619 * <pru_shutdown>:EMSGSIZE[TCP]
4620 * <pru_shutdown>:ENOBUFS[TCP]
4621 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4622 * <pru_shutdown>:??? [other protocol families]
4623 */
4624 int
4625 soshutdown(struct socket *so, int how)
4626 {
4627 int error;
4628
4629 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4630
4631 switch (how) {
4632 case SHUT_RD:
4633 case SHUT_WR:
4634 case SHUT_RDWR:
4635 socket_lock(so, 1);
4636 if ((so->so_state &
4637 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4638 error = ENOTCONN;
4639 } else {
4640 error = soshutdownlock(so, how);
4641 }
4642 socket_unlock(so, 1);
4643 break;
4644 default:
4645 error = EINVAL;
4646 break;
4647 }
4648
4649 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4650
4651 return error;
4652 }
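
/*
 * Illustrative sketch, for reference only (userspace, not compiled here):
 * the `how` values validated above correspond to shutdown(2).  Any other
 * value yields EINVAL, and shutting down a socket that is not connected,
 * connecting, or disconnecting yields ENOTCONN, as implemented in
 * soshutdown() above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	// Stop sending but keep reading until the peer closes:
 *	if (shutdown(fd, SHUT_WR) == -1)
 *		perror("shutdown");	// e.g. ENOTCONN or EINVAL
 */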
4653
4654 int
4655 soshutdownlock_final(struct socket *so, int how)
4656 {
4657 struct protosw *pr = so->so_proto;
4658 int error = 0;
4659
4660 sflt_notify(so, sock_evt_shutdown, &how);
4661
4662 if (how != SHUT_WR) {
4663 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4664 /* read already shut down */
4665 error = ENOTCONN;
4666 goto done;
4667 }
4668 sorflush(so);
4669 postevent(so, 0, EV_RCLOSED);
4670 }
4671 if (how != SHUT_RD) {
4672 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4673 /* write already shut down */
4674 error = ENOTCONN;
4675 goto done;
4676 }
4677 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4678 postevent(so, 0, EV_WCLOSED);
4679 }
4680 done:
4681 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4682 return error;
4683 }
4684
4685 int
4686 soshutdownlock(struct socket *so, int how)
4687 {
4688 int error = 0;
4689
4690 #if CONTENT_FILTER
4691 /*
4692 * A content filter may delay the actual shutdown until it
4693 * has processed the pending data
4694 */
4695 if (so->so_flags & SOF_CONTENT_FILTER) {
4696 error = cfil_sock_shutdown(so, &how);
4697 if (error == EJUSTRETURN) {
4698 error = 0;
4699 goto done;
4700 } else if (error != 0) {
4701 goto done;
4702 }
4703 }
4704 #endif /* CONTENT_FILTER */
4705
4706 error = soshutdownlock_final(so, how);
4707
4708 done:
4709 return error;
4710 }
4711
4712 void
4713 sowflush(struct socket *so)
4714 {
4715 struct sockbuf *sb = &so->so_snd;
4716
4717 /*
4718 * Obtain lock on the socket buffer (SB_LOCK). This is required
4719 * to prevent the socket buffer from being unexpectedly altered
4720 * while it is used by another thread in socket send/receive.
4721 *
4722 * sblock() must not fail here, hence the assertion.
4723 */
4724 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4725 VERIFY(sb->sb_flags & SB_LOCK);
4726
4727 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4728 sb->sb_flags |= SB_DROP;
4729 sb->sb_upcall = NULL;
4730 sb->sb_upcallarg = NULL;
4731
4732 sbunlock(sb, TRUE); /* keep socket locked */
4733
4734 selthreadclear(&sb->sb_sel);
4735 sbrelease(sb);
4736 }
4737
4738 void
4739 sorflush(struct socket *so)
4740 {
4741 struct sockbuf *sb = &so->so_rcv;
4742 struct protosw *pr = so->so_proto;
4743 struct sockbuf asb;
4744 #ifdef notyet
4745 lck_mtx_t *mutex_held;
4746 /*
4747 * XXX: This code is currently commented out, because we may get here
4748 * as part of sofreelastref(), and at that time, pr_getlock() may no
4749 * longer be able to return us the lock; this will be fixed in future.
4750 */
4751 if (so->so_proto->pr_getlock != NULL) {
4752 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4753 } else {
4754 mutex_held = so->so_proto->pr_domain->dom_mtx;
4755 }
4756
4757 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4758 #endif /* notyet */
4759
4760 sflt_notify(so, sock_evt_flush_read, NULL);
4761
4762 socantrcvmore(so);
4763
4764 /*
4765 * Obtain lock on the socket buffer (SB_LOCK). This is required
4766 * to prevent the socket buffer from being unexpectedly altered
4767 * while it is used by another thread in socket send/receive.
4768 *
4769 * sblock() must not fail here, hence the assertion.
4770 */
4771 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4772 VERIFY(sb->sb_flags & SB_LOCK);
4773
4774 /*
4775 * Copy only the relevant fields from "sb" to "asb" which we
4776 * need for sbrelease() to function. In particular, skip
4777 * sb_sel as it contains the wait queue linkage, which would
4778 * wreak havoc if we were to issue selthreadclear() on "asb".
4779 * Make sure to not carry over SB_LOCK in "asb", as we need
4780 * to acquire it later as part of sbrelease().
4781 */
4782 bzero(&asb, sizeof(asb));
4783 asb.sb_cc = sb->sb_cc;
4784 asb.sb_hiwat = sb->sb_hiwat;
4785 asb.sb_mbcnt = sb->sb_mbcnt;
4786 asb.sb_mbmax = sb->sb_mbmax;
4787 asb.sb_ctl = sb->sb_ctl;
4788 asb.sb_lowat = sb->sb_lowat;
4789 asb.sb_mb = sb->sb_mb;
4790 asb.sb_mbtail = sb->sb_mbtail;
4791 asb.sb_lastrecord = sb->sb_lastrecord;
4792 asb.sb_so = sb->sb_so;
4793 asb.sb_flags = sb->sb_flags;
4794 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4795 asb.sb_flags |= SB_DROP;
4796
4797 /*
4798 * Ideally we'd bzero() these and preserve the ones we need;
4799 * but to do that we'd need to shuffle things around in the
4800 * sockbuf, and we can't do it now because there are KEXTS
4801 * that are directly referring to the socket structure.
4802 *
4803 * Setting SB_DROP acts as a barrier to prevent further appends.
4804 * Clearing SB_SEL is done for selthreadclear() below.
4805 */
4806 sb->sb_cc = 0;
4807 sb->sb_hiwat = 0;
4808 sb->sb_mbcnt = 0;
4809 sb->sb_mbmax = 0;
4810 sb->sb_ctl = 0;
4811 sb->sb_lowat = 0;
4812 sb->sb_mb = NULL;
4813 sb->sb_mbtail = NULL;
4814 sb->sb_lastrecord = NULL;
4815 sb->sb_timeo.tv_sec = 0;
4816 sb->sb_timeo.tv_usec = 0;
4817 sb->sb_upcall = NULL;
4818 sb->sb_upcallarg = NULL;
4819 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4820 sb->sb_flags |= SB_DROP;
4821
4822 sbunlock(sb, TRUE); /* keep socket locked */
4823
4824 /*
4825 * Note that selthreadclear() is called on the original "sb" and
4826 * not the local "asb" because of the way wait queue linkage is
4827 * implemented. Given that selwakeup() may be triggered, SB_SEL
4828 * should no longer be set (cleared above.)
4829 */
4830 selthreadclear(&sb->sb_sel);
4831
4832 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4833 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4834 }
4835
4836 sbrelease(&asb);
4837 }
4838
4839 /*
4840 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4841 * an additional variant to handle the case where the option value needs
4842 * to be some kind of integer, but not a specific size.
4843 * In addition to their use here, these functions are also called by the
4844 * protocol-level pr_ctloutput() routines.
4845 *
4846 * Returns: 0 Success
4847 * EINVAL
4848 * copyin:EFAULT
4849 */
4850 int
4851 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4852 {
4853 size_t valsize;
4854
4855 /*
4856 * If the user gives us more than we wanted, we ignore it,
4857 * but if we don't get the minimum length the caller
4858 * wants, we return EINVAL. On success, sopt->sopt_valsize
4859 * is set to however much we actually retrieved.
4860 */
4861 if ((valsize = sopt->sopt_valsize) < minlen) {
4862 return EINVAL;
4863 }
4864 if (valsize > len) {
4865 sopt->sopt_valsize = valsize = len;
4866 }
4867
4868 if (sopt->sopt_p != kernproc) {
4869 return copyin(sopt->sopt_val, buf, valsize);
4870 }
4871
4872 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4873 return 0;
4874 }
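
/*
 * Illustrative sketch, for reference only (hypothetical, not compiled):
 * how a protocol-level pr_ctloutput() SET handler typically consumes an
 * integer option with sooptcopyin(), per the comment above.  EXAMPLE_OPT
 * and pcb->example_flag are placeholders, not real kernel symbols.
 *
 *	case EXAMPLE_OPT: {
 *		int optval;
 *
 *		error = sooptcopyin(sopt, &optval, sizeof(optval),
 *		    sizeof(optval));
 *		if (error != 0)
 *			break;
 *		pcb->example_flag = (optval != 0);
 *		break;
 *	}
 */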
4875
4876 /*
4877 * sooptcopyin_timeval
4878 * Copy in a timeval value into tv_p, taking into account whether
4879 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4880 * code here so that we can verify the 64-bit tv_sec value before we lose
4881 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4882 */
4883 static int
4884 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4885 {
4886 int error;
4887
4888 if (proc_is64bit(sopt->sopt_p)) {
4889 struct user64_timeval tv64;
4890
4891 if (sopt->sopt_valsize < sizeof(tv64)) {
4892 return EINVAL;
4893 }
4894
4895 sopt->sopt_valsize = sizeof(tv64);
4896 if (sopt->sopt_p != kernproc) {
4897 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4898 if (error != 0) {
4899 return error;
4900 }
4901 } else {
4902 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4903 sizeof(tv64));
4904 }
4905 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4906 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4907 return EDOM;
4908 }
4909
4910 tv_p->tv_sec = tv64.tv_sec;
4911 tv_p->tv_usec = tv64.tv_usec;
4912 } else {
4913 struct user32_timeval tv32;
4914
4915 if (sopt->sopt_valsize < sizeof(tv32)) {
4916 return EINVAL;
4917 }
4918
4919 sopt->sopt_valsize = sizeof(tv32);
4920 if (sopt->sopt_p != kernproc) {
4921 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4922 if (error != 0) {
4923 return error;
4924 }
4925 } else {
4926 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4927 sizeof(tv32));
4928 }
4929 #ifndef __LP64__
4930 /*
4931 * K64todo "comparison is always false due to
4932 * limited range of data type"
4933 */
4934 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4935 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4936 return EDOM;
4937 }
4938 #endif
4939 tv_p->tv_sec = tv32.tv_sec;
4940 tv_p->tv_usec = tv32.tv_usec;
4941 }
4942 return 0;
4943 }
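
/*
 * Illustrative sketch, for reference only (userspace): the timeval
 * consumed above arrives via an ordinary setsockopt(2) call; a negative
 * tv_sec or a tv_usec outside [0, 1000000) is rejected with EDOM by the
 * checks above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
 *		perror("setsockopt SO_RCVTIMEO");
 */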
4944
4945 int
4946 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4947 boolean_t ignore_delegate)
4948 {
4949 kauth_cred_t cred = NULL;
4950 proc_t ep = PROC_NULL;
4951 uid_t uid;
4952 int error = 0;
4953
4954 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4955 ep = proc_find(so->e_pid);
4956 if (ep) {
4957 cred = kauth_cred_proc_ref(ep);
4958 }
4959 }
4960
4961 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4962
4963 /* uid is 0 for root */
4964 if (uid != 0 || !allow_root) {
4965 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4966 }
4967 if (cred) {
4968 kauth_cred_unref(&cred);
4969 }
4970 if (ep != PROC_NULL) {
4971 proc_rele(ep);
4972 }
4973
4974 return error;
4975 }
4976
4977 /*
4978 * Returns: 0 Success
4979 * EINVAL
4980 * ENOPROTOOPT
4981 * ENOBUFS
4982 * EDOM
4983 * sooptcopyin:EINVAL
4984 * sooptcopyin:EFAULT
4985 * sooptcopyin_timeval:EINVAL
4986 * sooptcopyin_timeval:EFAULT
4987 * sooptcopyin_timeval:EDOM
4988 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4989 * <pr_ctloutput>:???
4990 * sflt_attach_private:??? [whatever a filter author chooses]
4991 * <sf_setoption>:??? [whatever a filter author chooses]
4992 *
4993 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
4994 * all <sf_setoption> returns depend on what the filter author
4995 * causes their filter to return.
4996 */
4997 int
4998 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4999 {
5000 int error, optval;
5001 int64_t long_optval;
5002 struct linger l;
5003 struct timeval tv;
5004 #if CONFIG_MACF_SOCKET
5005 struct mac extmac;
5006 #endif /* MAC_SOCKET */
5007
5008 if (sopt->sopt_dir != SOPT_SET) {
5009 sopt->sopt_dir = SOPT_SET;
5010 }
5011
5012 if (dolock) {
5013 socket_lock(so, 1);
5014 }
5015
5016 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5017 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5018 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5019 /* the socket has been shutdown, no more sockopt's */
5020 error = EINVAL;
5021 goto out;
5022 }
5023
5024 error = sflt_setsockopt(so, sopt);
5025 if (error != 0) {
5026 if (error == EJUSTRETURN) {
5027 error = 0;
5028 }
5029 goto out;
5030 }
5031
5032 if (sopt->sopt_level != SOL_SOCKET) {
5033 if (so->so_proto != NULL &&
5034 so->so_proto->pr_ctloutput != NULL) {
5035 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5036 goto out;
5037 }
5038 error = ENOPROTOOPT;
5039 } else {
5040 /*
5041 * Allow socket-level (SOL_SOCKET) options to be filtered by
5042 * the protocol layer, if needed. A zero value returned from
5043 * the handler means use default socket-level processing as
5044 * done by the rest of this routine. Otherwise, any other
5045 * return value indicates that the option is unsupported.
5046 */
5047 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5048 pru_socheckopt(so, sopt)) != 0) {
5049 goto out;
5050 }
5051
5052 error = 0;
5053 switch (sopt->sopt_name) {
5054 case SO_LINGER:
5055 case SO_LINGER_SEC:
5056 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5057 if (error != 0) {
5058 goto out;
5059 }
5060
5061 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5062 l.l_linger : l.l_linger * hz;
5063 if (l.l_onoff != 0) {
5064 so->so_options |= SO_LINGER;
5065 } else {
5066 so->so_options &= ~SO_LINGER;
5067 }
5068 break;
5069
5070 case SO_DEBUG:
5071 case SO_KEEPALIVE:
5072 case SO_DONTROUTE:
5073 case SO_USELOOPBACK:
5074 case SO_BROADCAST:
5075 case SO_REUSEADDR:
5076 case SO_REUSEPORT:
5077 case SO_OOBINLINE:
5078 case SO_TIMESTAMP:
5079 case SO_TIMESTAMP_MONOTONIC:
5080 case SO_TIMESTAMP_CONTINUOUS:
5081 case SO_DONTTRUNC:
5082 case SO_WANTMORE:
5083 case SO_WANTOOBFLAG:
5084 case SO_NOWAKEFROMSLEEP:
5085 case SO_NOAPNFALLBK:
5086 error = sooptcopyin(sopt, &optval, sizeof(optval),
5087 sizeof(optval));
5088 if (error != 0) {
5089 goto out;
5090 }
5091 if (optval) {
5092 so->so_options |= sopt->sopt_name;
5093 } else {
5094 so->so_options &= ~sopt->sopt_name;
5095 }
5096 break;
5097
5098 case SO_SNDBUF:
5099 case SO_RCVBUF:
5100 case SO_SNDLOWAT:
5101 case SO_RCVLOWAT:
5102 error = sooptcopyin(sopt, &optval, sizeof(optval),
5103 sizeof(optval));
5104 if (error != 0) {
5105 goto out;
5106 }
5107
5108 /*
5109 * Values < 1 make no sense for any of these
5110 * options, so disallow them.
5111 */
5112 if (optval < 1) {
5113 error = EINVAL;
5114 goto out;
5115 }
5116
5117 switch (sopt->sopt_name) {
5118 case SO_SNDBUF:
5119 case SO_RCVBUF: {
5120 struct sockbuf *sb =
5121 (sopt->sopt_name == SO_SNDBUF) ?
5122 &so->so_snd : &so->so_rcv;
5123 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5124 error = ENOBUFS;
5125 goto out;
5126 }
5127 sb->sb_flags |= SB_USRSIZE;
5128 sb->sb_flags &= ~SB_AUTOSIZE;
5129 sb->sb_idealsize = (u_int32_t)optval;
5130 break;
5131 }
5132 /*
5133 * Make sure the low-water is never greater than
5134 * the high-water.
5135 */
5136 case SO_SNDLOWAT: {
5137 int space = sbspace(&so->so_snd);
5138 u_int32_t hiwat = so->so_snd.sb_hiwat;
5139
5140 if (so->so_snd.sb_flags & SB_UNIX) {
5141 struct unpcb *unp =
5142 (struct unpcb *)(so->so_pcb);
5143 if (unp != NULL &&
5144 unp->unp_conn != NULL) {
5145 hiwat += unp->unp_conn->unp_cc;
5146 }
5147 }
5148
5149 so->so_snd.sb_lowat =
5150 (optval > hiwat) ?
5151 hiwat : optval;
5152
5153 if (space >= so->so_snd.sb_lowat) {
5154 sowwakeup(so);
5155 }
5156 break;
5157 }
5158 case SO_RCVLOWAT: {
5159 int64_t data_len;
5160 so->so_rcv.sb_lowat =
5161 (optval > so->so_rcv.sb_hiwat) ?
5162 so->so_rcv.sb_hiwat : optval;
5163 data_len = so->so_rcv.sb_cc
5164 - so->so_rcv.sb_ctl;
5165 if (data_len >= so->so_rcv.sb_lowat) {
5166 sorwakeup(so);
5167 }
5168 break;
5169 }
5170 }
5171 break;
5172
5173 case SO_SNDTIMEO:
5174 case SO_RCVTIMEO:
5175 error = sooptcopyin_timeval(sopt, &tv);
5176 if (error != 0) {
5177 goto out;
5178 }
5179
5180 switch (sopt->sopt_name) {
5181 case SO_SNDTIMEO:
5182 so->so_snd.sb_timeo = tv;
5183 break;
5184 case SO_RCVTIMEO:
5185 so->so_rcv.sb_timeo = tv;
5186 break;
5187 }
5188 break;
5189
5190 case SO_NKE: {
5191 struct so_nke nke;
5192
5193 error = sooptcopyin(sopt, &nke, sizeof(nke),
5194 sizeof(nke));
5195 if (error != 0) {
5196 goto out;
5197 }
5198
5199 error = sflt_attach_internal(so, nke.nke_handle);
5200 break;
5201 }
5202
5203 case SO_NOSIGPIPE:
5204 error = sooptcopyin(sopt, &optval, sizeof(optval),
5205 sizeof(optval));
5206 if (error != 0) {
5207 goto out;
5208 }
5209 if (optval != 0) {
5210 so->so_flags |= SOF_NOSIGPIPE;
5211 } else {
5212 so->so_flags &= ~SOF_NOSIGPIPE;
5213 }
5214 break;
5215
5216 case SO_NOADDRERR:
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval != 0) {
5223 so->so_flags |= SOF_NOADDRAVAIL;
5224 } else {
5225 so->so_flags &= ~SOF_NOADDRAVAIL;
5226 }
5227 break;
5228
5229 case SO_REUSESHAREUID:
5230 error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 sizeof(optval));
5232 if (error != 0) {
5233 goto out;
5234 }
5235 if (optval != 0) {
5236 so->so_flags |= SOF_REUSESHAREUID;
5237 } else {
5238 so->so_flags &= ~SOF_REUSESHAREUID;
5239 }
5240 break;
5241
5242 case SO_NOTIFYCONFLICT:
5243 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5244 error = EPERM;
5245 goto out;
5246 }
5247 error = sooptcopyin(sopt, &optval, sizeof(optval),
5248 sizeof(optval));
5249 if (error != 0) {
5250 goto out;
5251 }
5252 if (optval != 0) {
5253 so->so_flags |= SOF_NOTIFYCONFLICT;
5254 } else {
5255 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5256 }
5257 break;
5258
5259 case SO_RESTRICTIONS:
5260 error = sooptcopyin(sopt, &optval, sizeof(optval),
5261 sizeof(optval));
5262 if (error != 0) {
5263 goto out;
5264 }
5265
5266 error = so_set_restrictions(so, optval);
5267 break;
5268
5269 case SO_AWDL_UNRESTRICTED:
5270 if (SOCK_DOM(so) != PF_INET &&
5271 SOCK_DOM(so) != PF_INET6) {
5272 error = EOPNOTSUPP;
5273 goto out;
5274 }
5275 error = sooptcopyin(sopt, &optval, sizeof(optval),
5276 sizeof(optval));
5277 if (error != 0) {
5278 goto out;
5279 }
5280 if (optval != 0) {
5281 error = soopt_cred_check(so,
5282 PRIV_NET_RESTRICTED_AWDL, false, false);
5283 if (error == 0) {
5284 inp_set_awdl_unrestricted(
5285 sotoinpcb(so));
5286 }
5287 } else {
5288 inp_clear_awdl_unrestricted(sotoinpcb(so));
5289 }
5290 break;
5291 case SO_INTCOPROC_ALLOW:
5292 if (SOCK_DOM(so) != PF_INET6) {
5293 error = EOPNOTSUPP;
5294 goto out;
5295 }
5296 error = sooptcopyin(sopt, &optval, sizeof(optval),
5297 sizeof(optval));
5298 if (error != 0) {
5299 goto out;
5300 }
5301 if (optval != 0 &&
5302 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5303 error = soopt_cred_check(so,
5304 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5305 if (error == 0) {
5306 inp_set_intcoproc_allowed(
5307 sotoinpcb(so));
5308 }
5309 } else if (optval == 0) {
5310 inp_clear_intcoproc_allowed(sotoinpcb(so));
5311 }
5312 break;
5313
5314 case SO_LABEL:
5315 #if CONFIG_MACF_SOCKET
5316 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5317 sizeof(extmac))) != 0) {
5318 goto out;
5319 }
5320
5321 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5322 so, &extmac);
5323 #else
5324 error = EOPNOTSUPP;
5325 #endif /* MAC_SOCKET */
5326 break;
5327
5328 case SO_UPCALLCLOSEWAIT:
5329 error = sooptcopyin(sopt, &optval, sizeof(optval),
5330 sizeof(optval));
5331 if (error != 0) {
5332 goto out;
5333 }
5334 if (optval != 0) {
5335 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5336 } else {
5337 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5338 }
5339 break;
5340
5341 case SO_RANDOMPORT:
5342 error = sooptcopyin(sopt, &optval, sizeof(optval),
5343 sizeof(optval));
5344 if (error != 0) {
5345 goto out;
5346 }
5347 if (optval != 0) {
5348 so->so_flags |= SOF_BINDRANDOMPORT;
5349 } else {
5350 so->so_flags &= ~SOF_BINDRANDOMPORT;
5351 }
5352 break;
5353
5354 case SO_NP_EXTENSIONS: {
5355 struct so_np_extensions sonpx;
5356
5357 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5358 sizeof(sonpx));
5359 if (error != 0) {
5360 goto out;
5361 }
5362 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5363 error = EINVAL;
5364 goto out;
5365 }
5366 /*
5367 * Only one bit defined for now
5368 */
5369 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5370 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5371 so->so_flags |= SOF_NPX_SETOPTSHUT;
5372 } else {
5373 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5374 }
5375 }
5376 break;
5377 }
5378
5379 case SO_TRAFFIC_CLASS: {
5380 error = sooptcopyin(sopt, &optval, sizeof(optval),
5381 sizeof(optval));
5382 if (error != 0) {
5383 goto out;
5384 }
5385 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5386 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5387 error = so_set_net_service_type(so, netsvc);
5388 goto out;
5389 }
5390 error = so_set_traffic_class(so, optval);
5391 if (error != 0) {
5392 goto out;
5393 }
5394 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5395 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5396 break;
5397 }
5398
5399 case SO_RECV_TRAFFIC_CLASS: {
5400 error = sooptcopyin(sopt, &optval, sizeof(optval),
5401 sizeof(optval));
5402 if (error != 0) {
5403 goto out;
5404 }
5405 if (optval == 0) {
5406 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5407 } else {
5408 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5409 }
5410 break;
5411 }
5412
5413 #if (DEVELOPMENT || DEBUG)
5414 case SO_TRAFFIC_CLASS_DBG: {
5415 struct so_tcdbg so_tcdbg;
5416
5417 error = sooptcopyin(sopt, &so_tcdbg,
5418 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5419 if (error != 0) {
5420 goto out;
5421 }
5422 error = so_set_tcdbg(so, &so_tcdbg);
5423 if (error != 0) {
5424 goto out;
5425 }
5426 break;
5427 }
5428 #endif /* (DEVELOPMENT || DEBUG) */
5429
5430 case SO_PRIVILEGED_TRAFFIC_CLASS:
5431 error = priv_check_cred(kauth_cred_get(),
5432 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5433 if (error != 0) {
5434 goto out;
5435 }
5436 error = sooptcopyin(sopt, &optval, sizeof(optval),
5437 sizeof(optval));
5438 if (error != 0) {
5439 goto out;
5440 }
5441 if (optval == 0) {
5442 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5443 } else {
5444 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5445 }
5446 break;
5447
5448 #if (DEVELOPMENT || DEBUG)
5449 case SO_DEFUNCTIT:
5450 error = sosetdefunct(current_proc(), so, 0, FALSE);
5451 if (error == 0) {
5452 error = sodefunct(current_proc(), so, 0);
5453 }
5454
5455 break;
5456 #endif /* (DEVELOPMENT || DEBUG) */
5457
5458 case SO_DEFUNCTOK:
5459 error = sooptcopyin(sopt, &optval, sizeof(optval),
5460 sizeof(optval));
5461 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5462 if (error == 0) {
5463 error = EBADF;
5464 }
5465 goto out;
5466 }
5467 /*
5468 * Any process can set SO_DEFUNCTOK (clear
5469 * SOF_NODEFUNCT), but only root can clear
5470 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5471 */
5472 if (optval == 0 &&
5473 kauth_cred_issuser(kauth_cred_get()) == 0) {
5474 error = EPERM;
5475 goto out;
5476 }
5477 if (optval) {
5478 so->so_flags &= ~SOF_NODEFUNCT;
5479 } else {
5480 so->so_flags |= SOF_NODEFUNCT;
5481 }
5482
5483 if (SOCK_DOM(so) == PF_INET ||
5484 SOCK_DOM(so) == PF_INET6) {
5485 char s[MAX_IPv6_STR_LEN];
5486 char d[MAX_IPv6_STR_LEN];
5487 struct inpcb *inp = sotoinpcb(so);
5488
5489 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5490 "[%s %s:%d -> %s:%d] is now marked "
5491 "as %seligible for "
5492 "defunct\n", __func__, proc_selfpid(),
5493 proc_best_name(current_proc()),
5494 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5495 (SOCK_TYPE(so) == SOCK_STREAM) ?
5496 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5497 ((SOCK_DOM(so) == PF_INET) ?
5498 (void *)&inp->inp_laddr.s_addr :
5499 (void *)&inp->in6p_laddr), s, sizeof(s)),
5500 ntohs(inp->in6p_lport),
5501 inet_ntop(SOCK_DOM(so),
5502 (SOCK_DOM(so) == PF_INET) ?
5503 (void *)&inp->inp_faddr.s_addr :
5504 (void *)&inp->in6p_faddr, d, sizeof(d)),
5505 ntohs(inp->in6p_fport),
5506 (so->so_flags & SOF_NODEFUNCT) ?
5507 "not " : "");
5508 } else {
5509 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5510 "is now marked as %seligible for "
5511 "defunct\n",
5512 __func__, proc_selfpid(),
5513 proc_best_name(current_proc()),
5514 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5515 SOCK_DOM(so), SOCK_TYPE(so),
5516 (so->so_flags & SOF_NODEFUNCT) ?
5517 "not " : "");
5518 }
5519 break;
5520
5521 case SO_ISDEFUNCT:
5522 /* This option is not settable */
5523 error = EINVAL;
5524 break;
5525
5526 case SO_OPPORTUNISTIC:
5527 error = sooptcopyin(sopt, &optval, sizeof(optval),
5528 sizeof(optval));
5529 if (error == 0) {
5530 error = so_set_opportunistic(so, optval);
5531 }
5532 break;
5533
5534 case SO_FLUSH:
5535 /* This option is handled by lower layer(s) */
5536 error = 0;
5537 break;
5538
5539 case SO_RECV_ANYIF:
5540 error = sooptcopyin(sopt, &optval, sizeof(optval),
5541 sizeof(optval));
5542 if (error == 0) {
5543 error = so_set_recv_anyif(so, optval);
5544 }
5545 break;
5546
5547 case SO_TRAFFIC_MGT_BACKGROUND: {
5548 /* This option is handled by lower layer(s) */
5549 error = 0;
5550 break;
5551 }
5552
5553 #if FLOW_DIVERT
5554 case SO_FLOW_DIVERT_TOKEN:
5555 error = flow_divert_token_set(so, sopt);
5556 break;
5557 #endif /* FLOW_DIVERT */
5558
5559
5560 case SO_DELEGATED:
5561 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5562 sizeof(optval))) != 0) {
5563 break;
5564 }
5565
5566 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5567 break;
5568
5569 case SO_DELEGATED_UUID: {
5570 uuid_t euuid;
5571
5572 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5573 sizeof(euuid))) != 0) {
5574 break;
5575 }
5576
5577 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5578 break;
5579 }
5580
5581 #if NECP
5582 case SO_NECP_ATTRIBUTES:
5583 error = necp_set_socket_attributes(so, sopt);
5584 break;
5585
5586 case SO_NECP_CLIENTUUID: {
5587 if (SOCK_DOM(so) == PF_MULTIPATH) {
5588 /* Handled by MPTCP itself */
5589 break;
5590 }
5591
5592 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5593 error = EINVAL;
5594 goto out;
5595 }
5596
5597 struct inpcb *inp = sotoinpcb(so);
5598 if (!uuid_is_null(inp->necp_client_uuid)) {
5599 // Clear out the old client UUID if present
5600 necp_inpcb_remove_cb(inp);
5601 }
5602
5603 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5604 sizeof(uuid_t), sizeof(uuid_t));
5605 if (error != 0) {
5606 goto out;
5607 }
5608
5609 if (uuid_is_null(inp->necp_client_uuid)) {
5610 error = EINVAL;
5611 goto out;
5612 }
5613
5614 pid_t current_pid = proc_pid(current_proc());
5615 error = necp_client_register_socket_flow(current_pid,
5616 inp->necp_client_uuid, inp);
5617 if (error != 0) {
5618 uuid_clear(inp->necp_client_uuid);
5619 goto out;
5620 }
5621
5622 if (inp->inp_lport != 0) {
5623 // There is a bound local port, so this is not
5624 // a fresh socket. Assign to the client.
5625 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5626 }
5627
5628 break;
5629 }
5630 case SO_NECP_LISTENUUID: {
5631 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5632 error = EINVAL;
5633 goto out;
5634 }
5635
5636 struct inpcb *inp = sotoinpcb(so);
5637 if (!uuid_is_null(inp->necp_client_uuid)) {
5638 error = EINVAL;
5639 goto out;
5640 }
5641
5642 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5643 sizeof(uuid_t), sizeof(uuid_t));
5644 if (error != 0) {
5645 goto out;
5646 }
5647
5648 if (uuid_is_null(inp->necp_client_uuid)) {
5649 error = EINVAL;
5650 goto out;
5651 }
5652
5653 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5654 inp->necp_client_uuid, inp);
5655 if (error != 0) {
5656 uuid_clear(inp->necp_client_uuid);
5657 goto out;
5658 }
5659
5660 // Mark that the port registration is held by NECP
5661 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5662
5663 break;
5664 }
5665 #endif /* NECP */
5666
5667 case SO_EXTENDED_BK_IDLE:
5668 error = sooptcopyin(sopt, &optval, sizeof(optval),
5669 sizeof(optval));
5670 if (error == 0) {
5671 error = so_set_extended_bk_idle(so, optval);
5672 }
5673 break;
5674
5675 case SO_MARK_CELLFALLBACK:
5676 error = sooptcopyin(sopt, &optval, sizeof(optval),
5677 sizeof(optval));
5678 if (error != 0) {
5679 goto out;
5680 }
5681 if (optval < 0) {
5682 error = EINVAL;
5683 goto out;
5684 }
5685 if (optval == 0) {
5686 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5687 } else {
5688 so->so_flags1 |= SOF1_CELLFALLBACK;
5689 }
5690 break;
5691
5692 case SO_STATISTICS_EVENT:
5693 error = sooptcopyin(sopt, &long_optval,
5694 sizeof(long_optval), sizeof(long_optval));
5695 if (error != 0) {
5696 goto out;
5697 }
5698 u_int64_t nstat_event = 0;
5699 error = so_statistics_event_to_nstat_event(
5700 &long_optval, &nstat_event);
5701 if (error != 0) {
5702 goto out;
5703 }
5704 nstat_pcb_event(sotoinpcb(so), nstat_event);
5705 break;
5706
5707 case SO_NET_SERVICE_TYPE: {
5708 error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 sizeof(optval));
5710 if (error != 0) {
5711 goto out;
5712 }
5713 error = so_set_net_service_type(so, optval);
5714 break;
5715 }
5716
5717 case SO_QOSMARKING_POLICY_OVERRIDE:
5718 error = priv_check_cred(kauth_cred_get(),
5719 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5720 if (error != 0) {
5721 goto out;
5722 }
5723 error = sooptcopyin(sopt, &optval, sizeof(optval),
5724 sizeof(optval));
5725 if (error != 0) {
5726 goto out;
5727 }
5728 if (optval == 0) {
5729 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5730 } else {
5731 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5732 }
5733 break;
5734
5735 case SO_MPKL_SEND_INFO: {
5736 struct so_mpkl_send_info so_mpkl_send_info;
5737
5738 error = sooptcopyin(sopt, &so_mpkl_send_info,
5739 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5740 if (error != 0) {
5741 goto out;
5742 }
5743 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5744 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5745
5746 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5747 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5748 } else {
5749 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5750 }
5751 break;
5752 }
5753 default:
5754 error = ENOPROTOOPT;
5755 break;
5756 }
5757 if (error == 0 && so->so_proto != NULL &&
5758 so->so_proto->pr_ctloutput != NULL) {
5759 (void) so->so_proto->pr_ctloutput(so, sopt);
5760 }
5761 }
5762 out:
5763 if (dolock) {
5764 socket_unlock(so, 1);
5765 }
5766 return error;
5767 }
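
/*
 * Illustrative sketch, for reference only (userspace): the SOL_SOCKET
 * switch above is driven by ordinary setsockopt(2) calls.  For example,
 * SO_NOSIGPIPE takes an int and sets or clears SOF_NOSIGPIPE as handled
 * above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	int one = 1;
 *	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == -1)
 *		perror("setsockopt SO_NOSIGPIPE");
 */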
5768
5769 /* Helper routines for getsockopt */
5770 int
5771 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5772 {
5773 int error;
5774 size_t valsize;
5775
5776 error = 0;
5777
5778 /*
5779 * Documented get behavior is that we always return a value,
5780 * possibly truncated to fit in the user's buffer.
5781 * Traditional behavior is that we always tell the user
5782 * precisely how much we copied, rather than something useful
5783 * like the total amount we had available for her.
5784 * Note that this interface is not idempotent; the entire answer must
5785 * be generated ahead of time.
5786 */
5787 valsize = min(len, sopt->sopt_valsize);
5788 sopt->sopt_valsize = valsize;
5789 if (sopt->sopt_val != USER_ADDR_NULL) {
5790 if (sopt->sopt_p != kernproc) {
5791 error = copyout(buf, sopt->sopt_val, valsize);
5792 } else {
5793 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5794 }
5795 }
5796 return error;
5797 }
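
/*
 * Illustrative sketch, for reference only (hypothetical, not compiled):
 * the GET-side mirror of the sooptcopyin() sketch earlier; a protocol
 * pr_ctloutput() handler returns an integer option with sooptcopyout().
 * EXAMPLE_OPT and pcb->example_flag are placeholders, not real kernel
 * symbols.
 *
 *	case EXAMPLE_OPT: {
 *		int optval = pcb->example_flag ? 1 : 0;
 *
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));
 *		break;
 *	}
 */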
5798
5799 static int
5800 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5801 {
5802 int error;
5803 size_t len;
5804 struct user64_timeval tv64 = {};
5805 struct user32_timeval tv32 = {};
5806 const void * val;
5807 size_t valsize;
5808
5809 error = 0;
5810 if (proc_is64bit(sopt->sopt_p)) {
5811 len = sizeof(tv64);
5812 tv64.tv_sec = tv_p->tv_sec;
5813 tv64.tv_usec = tv_p->tv_usec;
5814 val = &tv64;
5815 } else {
5816 len = sizeof(tv32);
5817 tv32.tv_sec = tv_p->tv_sec;
5818 tv32.tv_usec = tv_p->tv_usec;
5819 val = &tv32;
5820 }
5821 valsize = min(len, sopt->sopt_valsize);
5822 sopt->sopt_valsize = valsize;
5823 if (sopt->sopt_val != USER_ADDR_NULL) {
5824 if (sopt->sopt_p != kernproc) {
5825 error = copyout(val, sopt->sopt_val, valsize);
5826 } else {
5827 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5828 }
5829 }
5830 return error;
5831 }
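
/*
 * Illustrative sketch, for reference only (userspace): the counterpart
 * of the SO_RCVTIMEO set example earlier.  The routine above sizes the
 * copied-out value to the caller's ABI (user32_timeval vs
 * user64_timeval), so userspace simply reads back a struct timeval.
 * The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == -1)
 *		perror("getsockopt SO_RCVTIMEO");
 */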
5832
5833 /*
5834 * Return: 0 Success
5835 * ENOPROTOOPT
5836 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5837 * <pr_ctloutput>:???
5838 * <sf_getoption>:???
5839 */
5840 int
5841 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5842 {
5843 int error, optval;
5844 struct linger l;
5845 struct timeval tv;
5846 #if CONFIG_MACF_SOCKET
5847 struct mac extmac;
5848 #endif /* MAC_SOCKET */
5849
5850 if (sopt->sopt_dir != SOPT_GET) {
5851 sopt->sopt_dir = SOPT_GET;
5852 }
5853
5854 if (dolock) {
5855 socket_lock(so, 1);
5856 }
5857
5858 error = sflt_getsockopt(so, sopt);
5859 if (error != 0) {
5860 if (error == EJUSTRETURN) {
5861 error = 0;
5862 }
5863 goto out;
5864 }
5865
5866 if (sopt->sopt_level != SOL_SOCKET) {
5867 if (so->so_proto != NULL &&
5868 so->so_proto->pr_ctloutput != NULL) {
5869 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5870 goto out;
5871 }
5872 error = ENOPROTOOPT;
5873 } else {
5874 /*
5875 * Allow socket-level (SOL_SOCKET) options to be filtered by
5876 * the protocol layer, if needed. A zero value returned from
5877 * the handler means use default socket-level processing as
5878 * done by the rest of this routine. Otherwise, any other
5879 * return value indicates that the option is unsupported.
5880 */
5881 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5882 pru_socheckopt(so, sopt)) != 0) {
5883 goto out;
5884 }
5885
5886 error = 0;
5887 switch (sopt->sopt_name) {
5888 case SO_LINGER:
5889 case SO_LINGER_SEC:
5890 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5891 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5892 so->so_linger : so->so_linger / hz;
5893 error = sooptcopyout(sopt, &l, sizeof(l));
5894 break;
5895
5896 case SO_USELOOPBACK:
5897 case SO_DONTROUTE:
5898 case SO_DEBUG:
5899 case SO_KEEPALIVE:
5900 case SO_REUSEADDR:
5901 case SO_REUSEPORT:
5902 case SO_BROADCAST:
5903 case SO_OOBINLINE:
5904 case SO_TIMESTAMP:
5905 case SO_TIMESTAMP_MONOTONIC:
5906 case SO_TIMESTAMP_CONTINUOUS:
5907 case SO_DONTTRUNC:
5908 case SO_WANTMORE:
5909 case SO_WANTOOBFLAG:
5910 case SO_NOWAKEFROMSLEEP:
5911 case SO_NOAPNFALLBK:
5912 optval = so->so_options & sopt->sopt_name;
5913 integer:
5914 error = sooptcopyout(sopt, &optval, sizeof(optval));
5915 break;
5916
5917 case SO_TYPE:
5918 optval = so->so_type;
5919 goto integer;
5920
5921 case SO_NREAD:
5922 if (so->so_proto->pr_flags & PR_ATOMIC) {
5923 int pkt_total;
5924 struct mbuf *m1;
5925
5926 pkt_total = 0;
5927 m1 = so->so_rcv.sb_mb;
5928 while (m1 != NULL) {
5929 if (m1->m_type == MT_DATA ||
5930 m1->m_type == MT_HEADER ||
5931 m1->m_type == MT_OOBDATA) {
5932 pkt_total += m1->m_len;
5933 }
5934 m1 = m1->m_next;
5935 }
5936 optval = pkt_total;
5937 } else {
5938 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5939 }
5940 goto integer;
5941
5942 case SO_NUMRCVPKT:
5943 if (so->so_proto->pr_flags & PR_ATOMIC) {
5944 int cnt = 0;
5945 struct mbuf *m1;
5946
5947 m1 = so->so_rcv.sb_mb;
5948 while (m1 != NULL) {
5949 cnt += 1;
5950 m1 = m1->m_nextpkt;
5951 }
5952 optval = cnt;
5953 goto integer;
5954 } else {
5955 error = ENOPROTOOPT;
5956 break;
5957 }
5958
5959 case SO_NWRITE:
5960 optval = so->so_snd.sb_cc;
5961 goto integer;
5962
5963 case SO_ERROR:
5964 optval = so->so_error;
5965 so->so_error = 0;
5966 goto integer;
5967
5968 case SO_SNDBUF: {
5969 u_int32_t hiwat = so->so_snd.sb_hiwat;
5970
5971 if (so->so_snd.sb_flags & SB_UNIX) {
5972 struct unpcb *unp =
5973 (struct unpcb *)(so->so_pcb);
5974 if (unp != NULL && unp->unp_conn != NULL) {
5975 hiwat += unp->unp_conn->unp_cc;
5976 }
5977 }
5978
5979 optval = hiwat;
5980 goto integer;
5981 }
5982 case SO_RCVBUF:
5983 optval = so->so_rcv.sb_hiwat;
5984 goto integer;
5985
5986 case SO_SNDLOWAT:
5987 optval = so->so_snd.sb_lowat;
5988 goto integer;
5989
5990 case SO_RCVLOWAT:
5991 optval = so->so_rcv.sb_lowat;
5992 goto integer;
5993
5994 case SO_SNDTIMEO:
5995 case SO_RCVTIMEO:
5996 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5997 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5998
5999 error = sooptcopyout_timeval(sopt, &tv);
6000 break;
6001
6002 case SO_NOSIGPIPE:
6003 optval = (so->so_flags & SOF_NOSIGPIPE);
6004 goto integer;
6005
6006 case SO_NOADDRERR:
6007 optval = (so->so_flags & SOF_NOADDRAVAIL);
6008 goto integer;
6009
6010 case SO_REUSESHAREUID:
6011 optval = (so->so_flags & SOF_REUSESHAREUID);
6012 goto integer;
6013
6014
6015 case SO_NOTIFYCONFLICT:
6016 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6017 goto integer;
6018
6019 case SO_RESTRICTIONS:
6020 optval = so_get_restrictions(so);
6021 goto integer;
6022
6023 case SO_AWDL_UNRESTRICTED:
6024 if (SOCK_DOM(so) == PF_INET ||
6025 SOCK_DOM(so) == PF_INET6) {
6026 optval = inp_get_awdl_unrestricted(
6027 sotoinpcb(so));
6028 goto integer;
6029 } else {
6030 error = EOPNOTSUPP;
6031 }
6032 break;
6033
6034 case SO_INTCOPROC_ALLOW:
6035 if (SOCK_DOM(so) == PF_INET6) {
6036 optval = inp_get_intcoproc_allowed(
6037 sotoinpcb(so));
6038 goto integer;
6039 } else {
6040 error = EOPNOTSUPP;
6041 }
6042 break;
6043
6044 case SO_LABEL:
6045 #if CONFIG_MACF_SOCKET
6046 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6047 sizeof(extmac))) != 0 ||
6048 (error = mac_socket_label_get(proc_ucred(
6049 sopt->sopt_p), so, &extmac)) != 0) {
6050 break;
6051 }
6052
6053 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6054 #else
6055 error = EOPNOTSUPP;
6056 #endif /* MAC_SOCKET */
6057 break;
6058
6059 case SO_PEERLABEL:
6060 #if CONFIG_MACF_SOCKET
6061 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6062 sizeof(extmac))) != 0 ||
6063 (error = mac_socketpeer_label_get(proc_ucred(
6064 sopt->sopt_p), so, &extmac)) != 0) {
6065 break;
6066 }
6067
6068 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6069 #else
6070 error = EOPNOTSUPP;
6071 #endif /* MAC_SOCKET */
6072 break;
6073
6074 #ifdef __APPLE_API_PRIVATE
6075 case SO_UPCALLCLOSEWAIT:
6076 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6077 goto integer;
6078 #endif
6079 case SO_RANDOMPORT:
6080 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6081 goto integer;
6082
6083 case SO_NP_EXTENSIONS: {
6084 struct so_np_extensions sonpx = {};
6085
6086 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6087 SONPX_SETOPTSHUT : 0;
6088 sonpx.npx_mask = SONPX_MASK_VALID;
6089
6090 error = sooptcopyout(sopt, &sonpx,
6091 sizeof(struct so_np_extensions));
6092 break;
6093 }
6094
6095 case SO_TRAFFIC_CLASS:
6096 optval = so->so_traffic_class;
6097 goto integer;
6098
6099 case SO_RECV_TRAFFIC_CLASS:
6100 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6101 goto integer;
6102
6103 case SO_TRAFFIC_CLASS_STATS:
6104 error = sooptcopyout(sopt, &so->so_tc_stats,
6105 sizeof(so->so_tc_stats));
6106 break;
6107
6108 #if (DEVELOPMENT || DEBUG)
6109 case SO_TRAFFIC_CLASS_DBG:
6110 error = sogetopt_tcdbg(so, sopt);
6111 break;
6112 #endif /* (DEVELOPMENT || DEBUG) */
6113
6114 case SO_PRIVILEGED_TRAFFIC_CLASS:
6115 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6116 goto integer;
6117
6118 case SO_DEFUNCTOK:
6119 optval = !(so->so_flags & SOF_NODEFUNCT);
6120 goto integer;
6121
6122 case SO_ISDEFUNCT:
6123 optval = (so->so_flags & SOF_DEFUNCT);
6124 goto integer;
6125
6126 case SO_OPPORTUNISTIC:
6127 optval = so_get_opportunistic(so);
6128 goto integer;
6129
6130 case SO_FLUSH:
6131 /* This option is not gettable */
6132 error = EINVAL;
6133 break;
6134
6135 case SO_RECV_ANYIF:
6136 optval = so_get_recv_anyif(so);
6137 goto integer;
6138
6139 case SO_TRAFFIC_MGT_BACKGROUND:
6140 /* This option is handled by lower layer(s) */
6141 if (so->so_proto != NULL &&
6142 so->so_proto->pr_ctloutput != NULL) {
6143 (void) so->so_proto->pr_ctloutput(so, sopt);
6144 }
6145 break;
6146
6147 #if FLOW_DIVERT
6148 case SO_FLOW_DIVERT_TOKEN:
6149 error = flow_divert_token_get(so, sopt);
6150 break;
6151 #endif /* FLOW_DIVERT */
6152
6153 #if NECP
6154 case SO_NECP_ATTRIBUTES:
6155 error = necp_get_socket_attributes(so, sopt);
6156 break;
6157
6158 case SO_NECP_CLIENTUUID: {
6159 uuid_t *ncu;
6160
6161 if (SOCK_DOM(so) == PF_MULTIPATH) {
6162 ncu = &mpsotomppcb(so)->necp_client_uuid;
6163 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6164 ncu = &sotoinpcb(so)->necp_client_uuid;
6165 } else {
6166 error = EINVAL;
6167 goto out;
6168 }
6169
6170 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6171 break;
6172 }
6173
6174 case SO_NECP_LISTENUUID: {
6175 uuid_t *nlu;
6176
6177 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6178 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6179 nlu = &sotoinpcb(so)->necp_client_uuid;
6180 } else {
6181 error = ENOENT;
6182 goto out;
6183 }
6184 } else {
6185 error = EINVAL;
6186 goto out;
6187 }
6188
6189 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6190 break;
6191 }
6192 #endif /* NECP */
6193
6194 #if CONTENT_FILTER
6195 case SO_CFIL_SOCK_ID: {
6196 cfil_sock_id_t sock_id;
6197
6198 sock_id = cfil_sock_id_from_socket(so);
6199
6200 error = sooptcopyout(sopt, &sock_id,
6201 sizeof(cfil_sock_id_t));
6202 break;
6203 }
6204 #endif /* CONTENT_FILTER */
6205
6206 case SO_EXTENDED_BK_IDLE:
6207 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6208 goto integer;
6209 case SO_MARK_CELLFALLBACK:
6210 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6211 ? 1 : 0;
6212 goto integer;
6213 case SO_NET_SERVICE_TYPE: {
6214 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6215 optval = so->so_netsvctype;
6216 } else {
6217 optval = NET_SERVICE_TYPE_BE;
6218 }
6219 goto integer;
6220 }
6221 case SO_NETSVC_MARKING_LEVEL:
6222 optval = so_get_netsvc_marking_level(so);
6223 goto integer;
6224
6225 case SO_MPKL_SEND_INFO: {
6226 struct so_mpkl_send_info so_mpkl_send_info;
6227
6228 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6229 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6230 error = sooptcopyout(sopt, &so_mpkl_send_info,
6231 sizeof(struct so_mpkl_send_info));
6232 break;
6233 }
6234 default:
6235 error = ENOPROTOOPT;
6236 break;
6237 }
6238 }
6239 out:
6240 if (dolock) {
6241 socket_unlock(so, 1);
6242 }
6243 return error;
6244 }
6245
6246 /*
6247 * The size limit on our soopt_getm() differs from that on FreeBSD.
6248 * We limit the size of options to MCLBYTES. This will have to change
6249 * if we need to define options that need more space than MCLBYTES.
6250 */
6251 int
6252 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6253 {
6254 struct mbuf *m, *m_prev;
6255 int sopt_size = sopt->sopt_valsize;
6256 int how;
6257
6258 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6259 return EMSGSIZE;
6260 }
6261
6262 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6263 MGET(m, how, MT_DATA);
6264 if (m == NULL) {
6265 return ENOBUFS;
6266 }
6267 if (sopt_size > MLEN) {
6268 MCLGET(m, how);
6269 if ((m->m_flags & M_EXT) == 0) {
6270 m_free(m);
6271 return ENOBUFS;
6272 }
6273 m->m_len = min(MCLBYTES, sopt_size);
6274 } else {
6275 m->m_len = min(MLEN, sopt_size);
6276 }
6277 sopt_size -= m->m_len;
6278 *mp = m;
6279 m_prev = m;
6280
6281 while (sopt_size > 0) {
6282 MGET(m, how, MT_DATA);
6283 if (m == NULL) {
6284 m_freem(*mp);
6285 return ENOBUFS;
6286 }
6287 if (sopt_size > MLEN) {
6288 MCLGET(m, how);
6289 if ((m->m_flags & M_EXT) == 0) {
6290 m_freem(*mp);
6291 m_freem(m);
6292 return ENOBUFS;
6293 }
6294 m->m_len = min(MCLBYTES, sopt_size);
6295 } else {
6296 m->m_len = min(MLEN, sopt_size);
6297 }
6298 sopt_size -= m->m_len;
6299 m_prev->m_next = m;
6300 m_prev = m;
6301 }
6302 return 0;
6303 }
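/*
 * Usage sketch (illustrative only; the real call sites live in the protocol
 * option code, not in this file): a ctloutput handler that wants the option
 * value as an mbuf chain would typically pair soopt_getm() above with
 * soopt_mcopyin() below, roughly:
 *
 *	struct mbuf *m = NULL;
 *	int error = soopt_getm(sopt, &m);	// chain sized to sopt_valsize
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// copy the option bytes in
 *	// soopt_mcopyin() frees the chain itself if the copyin fails
 */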
6304
6305 /* copyin sopt data into mbuf chain */
6306 int
6307 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6308 {
6309 struct mbuf *m0 = m;
6310
6311 if (sopt->sopt_val == USER_ADDR_NULL) {
6312 return 0;
6313 }
6314 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6315 if (sopt->sopt_p != kernproc) {
6316 int error;
6317
6318 error = copyin(sopt->sopt_val, mtod(m, char *),
6319 m->m_len);
6320 if (error != 0) {
6321 m_freem(m0);
6322 return error;
6323 }
6324 } else {
6325 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6326 mtod(m, char *), m->m_len);
6327 }
6328 sopt->sopt_valsize -= m->m_len;
6329 sopt->sopt_val += m->m_len;
6330 m = m->m_next;
6331 }
6332 /* the chain should have been allocated with enough space at ip6_sooptmcopyin() */
6333 if (m != NULL) {
6334 panic("soopt_mcopyin");
6335 /* NOTREACHED */
6336 }
6337 return 0;
6338 }
6339
6340 /* copyout mbuf chain data into soopt */
6341 int
6342 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6343 {
6344 struct mbuf *m0 = m;
6345 size_t valsize = 0;
6346
6347 if (sopt->sopt_val == USER_ADDR_NULL) {
6348 return 0;
6349 }
6350 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6351 if (sopt->sopt_p != kernproc) {
6352 int error;
6353
6354 error = copyout(mtod(m, char *), sopt->sopt_val,
6355 m->m_len);
6356 if (error != 0) {
6357 m_freem(m0);
6358 return error;
6359 }
6360 } else {
6361 bcopy(mtod(m, char *),
6362 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6363 }
6364 sopt->sopt_valsize -= m->m_len;
6365 sopt->sopt_val += m->m_len;
6366 valsize += m->m_len;
6367 m = m->m_next;
6368 }
6369 if (m != NULL) {
6370 /* a sufficiently large sockopt buffer should have been supplied from user-land */
6371 m_freem(m0);
6372 return EINVAL;
6373 }
6374 sopt->sopt_valsize = valsize;
6375 return 0;
6376 }
6377
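/*
 * Out-of-band notification: signal SIGURG to the owning process or process
 * group recorded in so_pgid, wake any select()/poll() waiters on the receive
 * buffer, and post NOTE_OOB to attached knotes.
 */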
6378 void
6379 sohasoutofband(struct socket *so)
6380 {
6381 if (so->so_pgid < 0) {
6382 gsignal(-so->so_pgid, SIGURG);
6383 } else if (so->so_pgid > 0) {
6384 proc_signal(so->so_pgid, SIGURG);
6385 }
6386 selwakeup(&so->so_rcv.sb_sel);
6387 if (so->so_rcv.sb_flags & SB_KNOTE) {
6388 KNOTE(&so->so_rcv.sb_sel.si_note,
6389 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6390 }
6391 }
6392
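/*
 * Poll support: compute the ready events for this socket under the socket
 * lock.  When none of the requested events are ready, record the thread in
 * the corresponding selinfo (setting SB_SEL before selrecord(), the reverse
 * of the BSD ordering) so it is woken when the socket state changes.
 */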
6393 int
6394 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6395 {
6396 #pragma unused(cred)
6397 struct proc *p = current_proc();
6398 int revents = 0;
6399
6400 socket_lock(so, 1);
6401 so_update_last_owner_locked(so, PROC_NULL);
6402 so_update_policy(so);
6403
6404 if (events & (POLLIN | POLLRDNORM)) {
6405 if (soreadable(so)) {
6406 revents |= events & (POLLIN | POLLRDNORM);
6407 }
6408 }
6409
6410 if (events & (POLLOUT | POLLWRNORM)) {
6411 if (sowriteable(so)) {
6412 revents |= events & (POLLOUT | POLLWRNORM);
6413 }
6414 }
6415
6416 if (events & (POLLPRI | POLLRDBAND)) {
6417 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6418 revents |= events & (POLLPRI | POLLRDBAND);
6419 }
6420 }
6421
6422 if (revents == 0) {
6423 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6424 /*
6425 * Darwin sets the flag first,
6426 * BSD calls selrecord first
6427 */
6428 so->so_rcv.sb_flags |= SB_SEL;
6429 selrecord(p, &so->so_rcv.sb_sel, wql);
6430 }
6431
6432 if (events & (POLLOUT | POLLWRNORM)) {
6433 /*
6434 * Darwin sets the flag first,
6435 * BSD calls selrecord first
6436 */
6437 so->so_snd.sb_flags |= SB_SEL;
6438 selrecord(p, &so->so_snd.sb_sel, wql);
6439 }
6440 }
6441
6442 socket_unlock(so, 1);
6443 return revents;
6444 }
6445
6446 int
6447 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6448 {
6449 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6450 int result;
6451
6452 socket_lock(so, 1);
6453 so_update_last_owner_locked(so, PROC_NULL);
6454 so_update_policy(so);
6455
6456 #if CONFIG_MACF_SOCKET
6457 proc_t p = knote_get_kq(kn)->kq_p;
6458 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6459 socket_unlock(so, 1);
6460 knote_set_error(kn, EPERM);
6461 return 0;
6462 }
6463 #endif /* CONFIG_MACF_SOCKET */
6464
6465 switch (kn->kn_filter) {
6466 case EVFILT_READ:
6467 kn->kn_filtid = EVFILTID_SOREAD;
6468 break;
6469 case EVFILT_WRITE:
6470 kn->kn_filtid = EVFILTID_SOWRITE;
6471 break;
6472 case EVFILT_SOCK:
6473 kn->kn_filtid = EVFILTID_SCK;
6474 break;
6475 case EVFILT_EXCEPT:
6476 kn->kn_filtid = EVFILTID_SOEXCEPT;
6477 break;
6478 default:
6479 socket_unlock(so, 1);
6480 knote_set_error(kn, EINVAL);
6481 return 0;
6482 }
6483
6484 /*
6485 * call the appropriate sub-filter attach
6486 * with the socket still locked
6487 */
6488 result = knote_fops(kn)->f_attach(kn, kev);
6489
6490 socket_unlock(so, 1);
6491
6492 return result;
6493 }
6494
6495 static int
6496 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6497 {
6498 int retval = 0;
6499 int64_t data = 0;
6500
6501 if (so->so_options & SO_ACCEPTCONN) {
6502 /*
6503 * Radar 6615193: handle the listen case dynamically
6504 * for the kqueue read filter. This allows calling listen()
6505 * after registering the kqueue EVFILT_READ filter.
6506 */
6507
6508 retval = !TAILQ_EMPTY(&so->so_comp);
6509 data = so->so_qlen;
6510 goto out;
6511 }
6512
6513 /* socket isn't a listener */
6514 /*
6515 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6516 * the bytes of protocol data. We therefore exclude any
6517 * control bytes.
6518 */
6519 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6520
6521 if (kn->kn_sfflags & NOTE_OOB) {
6522 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6523 kn->kn_fflags |= NOTE_OOB;
6524 data -= so->so_oobmark;
6525 retval = 1;
6526 goto out;
6527 }
6528 }
6529
6530 if ((so->so_state & SS_CANTRCVMORE)
6531 #if CONTENT_FILTER
6532 && cfil_sock_data_pending(&so->so_rcv) == 0
6533 #endif /* CONTENT_FILTER */
6534 ) {
6535 kn->kn_flags |= EV_EOF;
6536 kn->kn_fflags = so->so_error;
6537 retval = 1;
6538 goto out;
6539 }
6540
6541 if (so->so_error) { /* temporary udp error */
6542 retval = 1;
6543 goto out;
6544 }
6545
6546 int64_t lowwat = so->so_rcv.sb_lowat;
6547 /*
6548 * Ensure that when NOTE_LOWAT is used, the derived
6549 * low water mark is bounded by the socket's receive buffer's
6550 * high and low water mark values.
6551 */
6552 if (kn->kn_sfflags & NOTE_LOWAT) {
6553 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6554 lowwat = so->so_rcv.sb_hiwat;
6555 } else if (kn->kn_sdata > lowwat) {
6556 lowwat = kn->kn_sdata;
6557 }
6558 }
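/*
 * For example, with sb_hiwat = 8192 a NOTE_LOWAT request of 16384 is
 * clamped down to 8192, while a request of 128 that exceeds the current
 * sb_lowat raises lowwat to 128.
 */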
6559
6560 /*
6561 * While the `data` field is the amount of data to read,
6562 * 0-sized packets need to wake up the kqueue, see 58140856,
6563 * so we need to take control bytes into account too.
6564 */
6565 retval = (so->so_rcv.sb_cc >= lowwat);
6566
6567 out:
6568 if (retval && kev) {
6569 knote_fill_kevent(kn, kev, data);
6570 }
6571 return retval;
6572 }
6573
6574 static int
6575 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6576 {
6577 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6578
6579 /* socket locked */
6580
6581 /*
6582 * If the caller explicitly asked for OOB results (e.g. poll())
6583 * from EVFILT_READ, then save that off in the hookid field
6584 * and reserve the kn_flags EV_OOBAND bit for output only.
6585 */
6586 if (kn->kn_filter == EVFILT_READ &&
6587 kn->kn_flags & EV_OOBAND) {
6588 kn->kn_flags &= ~EV_OOBAND;
6589 kn->kn_hook32 = EV_OOBAND;
6590 } else {
6591 kn->kn_hook32 = 0;
6592 }
6593 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6594 so->so_rcv.sb_flags |= SB_KNOTE;
6595 }
6596
6597 /* indicate if event is already fired */
6598 return filt_soread_common(kn, NULL, so);
6599 }
6600
6601 static void
6602 filt_sordetach(struct knote *kn)
6603 {
6604 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6605
6606 socket_lock(so, 1);
6607 if (so->so_rcv.sb_flags & SB_KNOTE) {
6608 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6609 so->so_rcv.sb_flags &= ~SB_KNOTE;
6610 }
6611 }
6612 socket_unlock(so, 1);
6613 }
6614
6615 /*ARGSUSED*/
6616 static int
6617 filt_soread(struct knote *kn, long hint)
6618 {
6619 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6620 int retval;
6621
6622 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6623 socket_lock(so, 1);
6624 }
6625
6626 retval = filt_soread_common(kn, NULL, so);
6627
6628 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6629 socket_unlock(so, 1);
6630 }
6631
6632 return retval;
6633 }
6634
6635 static int
6636 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6637 {
6638 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6639 int retval;
6640
6641 socket_lock(so, 1);
6642
6643 /* save off the new input fflags and data */
6644 kn->kn_sfflags = kev->fflags;
6645 kn->kn_sdata = kev->data;
6646
6647 /* determine if changes result in fired events */
6648 retval = filt_soread_common(kn, NULL, so);
6649
6650 socket_unlock(so, 1);
6651
6652 return retval;
6653 }
6654
6655 static int
6656 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6657 {
6658 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6659 int retval;
6660
6661 socket_lock(so, 1);
6662 retval = filt_soread_common(kn, kev, so);
6663 socket_unlock(so, 1);
6664
6665 return retval;
6666 }
6667
6668 int
6669 so_wait_for_if_feedback(struct socket *so)
6670 {
6671 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6672 (so->so_state & SS_ISCONNECTED)) {
6673 struct inpcb *inp = sotoinpcb(so);
6674 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6675 return 1;
6676 }
6677 }
6678 return 0;
6679 }
6680
6681 static int
6682 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6683 {
6684 int ret = 0;
6685 int64_t data = sbspace(&so->so_snd);
6686
6687 if (so->so_state & SS_CANTSENDMORE) {
6688 kn->kn_flags |= EV_EOF;
6689 kn->kn_fflags = so->so_error;
6690 ret = 1;
6691 goto out;
6692 }
6693
6694 if (so->so_error) { /* temporary udp error */
6695 ret = 1;
6696 goto out;
6697 }
6698
6699 if (!socanwrite(so)) {
6700 ret = 0;
6701 goto out;
6702 }
6703
6704 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6705 ret = 1;
6706 goto out;
6707 }
6708
6709 int64_t lowwat = so->so_snd.sb_lowat;
6710
6711 if (kn->kn_sfflags & NOTE_LOWAT) {
6712 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6713 lowwat = so->so_snd.sb_hiwat;
6714 } else if (kn->kn_sdata > lowwat) {
6715 lowwat = kn->kn_sdata;
6716 }
6717 }
6718
6719 if (data >= lowwat) {
6720 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6721 #if (DEBUG || DEVELOPMENT)
6722 && so_notsent_lowat_check == 1
6723 #endif /* DEBUG || DEVELOPMENT */
6724 ) {
6725 if ((SOCK_DOM(so) == PF_INET ||
6726 SOCK_DOM(so) == PF_INET6) &&
6727 so->so_type == SOCK_STREAM) {
6728 ret = tcp_notsent_lowat_check(so);
6729 }
6730 #if MPTCP
6731 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6732 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6733 ret = mptcp_notsent_lowat_check(so);
6734 }
6735 #endif
6736 else {
6737 ret = 1;
6738 goto out;
6739 }
6740 } else {
6741 ret = 1;
6742 }
6743 }
6744 if (so_wait_for_if_feedback(so)) {
6745 ret = 0;
6746 }
6747
6748 out:
6749 if (ret && kev) {
6750 knote_fill_kevent(kn, kev, data);
6751 }
6752 return ret;
6753 }
6754
6755 static int
6756 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6757 {
6758 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6759
6760 /* socket locked */
6761 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6762 so->so_snd.sb_flags |= SB_KNOTE;
6763 }
6764
6765 /* determine if it's already fired */
6766 return filt_sowrite_common(kn, NULL, so);
6767 }
6768
6769 static void
6770 filt_sowdetach(struct knote *kn)
6771 {
6772 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6773 socket_lock(so, 1);
6774
6775 if (so->so_snd.sb_flags & SB_KNOTE) {
6776 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6777 so->so_snd.sb_flags &= ~SB_KNOTE;
6778 }
6779 }
6780 socket_unlock(so, 1);
6781 }
6782
6783 /*ARGSUSED*/
6784 static int
6785 filt_sowrite(struct knote *kn, long hint)
6786 {
6787 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6788 int ret;
6789
6790 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6791 socket_lock(so, 1);
6792 }
6793
6794 ret = filt_sowrite_common(kn, NULL, so);
6795
6796 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6797 socket_unlock(so, 1);
6798 }
6799
6800 return ret;
6801 }
6802
6803 static int
6804 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6805 {
6806 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6807 int ret;
6808
6809 socket_lock(so, 1);
6810
6811 /* save off the new input fflags and data */
6812 kn->kn_sfflags = kev->fflags;
6813 kn->kn_sdata = kev->data;
6814
6815 /* determine if these changes result in a triggered event */
6816 ret = filt_sowrite_common(kn, NULL, so);
6817
6818 socket_unlock(so, 1);
6819
6820 return ret;
6821 }
6822
6823 static int
6824 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6825 {
6826 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6827 int ret;
6828
6829 socket_lock(so, 1);
6830 ret = filt_sowrite_common(kn, kev, so);
6831 socket_unlock(so, 1);
6832
6833 return ret;
6834 }
6835
6836 static int
6837 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6838 struct socket *so, long ev_hint)
6839 {
6840 int ret = 0;
6841 int64_t data = 0;
6842 uint32_t level_trigger = 0;
6843
6844 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6845 kn->kn_fflags |= NOTE_CONNRESET;
6846 }
6847 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6848 kn->kn_fflags |= NOTE_TIMEOUT;
6849 }
6850 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6851 kn->kn_fflags |= NOTE_NOSRCADDR;
6852 }
6853 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6854 kn->kn_fflags |= NOTE_IFDENIED;
6855 }
6856 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6857 kn->kn_fflags |= NOTE_KEEPALIVE;
6858 }
6859 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6860 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6861 }
6862 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6863 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6864 }
6865 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6866 (so->so_state & SS_ISCONNECTED)) {
6867 kn->kn_fflags |= NOTE_CONNECTED;
6868 level_trigger |= NOTE_CONNECTED;
6869 }
6870 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6871 (so->so_state & SS_ISDISCONNECTED)) {
6872 kn->kn_fflags |= NOTE_DISCONNECTED;
6873 level_trigger |= NOTE_DISCONNECTED;
6874 }
6875 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6876 if (so->so_proto != NULL &&
6877 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6878 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6879 }
6880 }
6881
6882 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6883 tcp_notify_ack_active(so)) {
6884 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6885 }
6886
6887 if ((so->so_state & SS_CANTRCVMORE)
6888 #if CONTENT_FILTER
6889 && cfil_sock_data_pending(&so->so_rcv) == 0
6890 #endif /* CONTENT_FILTER */
6891 ) {
6892 kn->kn_fflags |= NOTE_READCLOSED;
6893 level_trigger |= NOTE_READCLOSED;
6894 }
6895
6896 if (so->so_state & SS_CANTSENDMORE) {
6897 kn->kn_fflags |= NOTE_WRITECLOSED;
6898 level_trigger |= NOTE_WRITECLOSED;
6899 }
6900
6901 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6902 (so->so_flags & SOF_SUSPENDED)) {
6903 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6904
6905 /* If resume event was delivered before, reset it */
6906 kn->kn_hook32 &= ~NOTE_RESUME;
6907
6908 kn->kn_fflags |= NOTE_SUSPEND;
6909 level_trigger |= NOTE_SUSPEND;
6910 }
6911
6912 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6913 (so->so_flags & SOF_SUSPENDED) == 0) {
6914 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6915
6916 /* If suspend event was delivered before, reset it */
6917 kn->kn_hook32 &= ~NOTE_SUSPEND;
6918
6919 kn->kn_fflags |= NOTE_RESUME;
6920 level_trigger |= NOTE_RESUME;
6921 }
6922
6923 if (so->so_error != 0) {
6924 ret = 1;
6925 data = so->so_error;
6926 kn->kn_flags |= EV_EOF;
6927 } else {
6928 u_int32_t data32;
6929 get_sockev_state(so, &data32);
6930 data = data32;
6931 }
6932
6933 /* Reset any events that are not requested on this knote */
6934 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6935 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6936
6937 /* Find the level-triggered events that are already delivered */
6938 level_trigger &= kn->kn_hook32;
6939 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6940
6941 /* Do not deliver level-triggered events more than once */
6942 if ((kn->kn_fflags & ~level_trigger) != 0) {
6943 ret = 1;
6944 }
6945
6946 if (ret && kev) {
6947 /*
6948 * Store the state of the events being delivered. This
6949 * state can be used to deliver level-triggered events
6950 * at least once and still avoid waking up the application
6951 * multiple times as long as the event is active.
6952 */
6953 if (kn->kn_fflags != 0) {
6954 kn->kn_hook32 |= (kn->kn_fflags &
6955 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6956 }
6957
6958 /*
6959 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6960 * only one of them and remember which one was
6961 * delivered last
6962 */
6963 if (kn->kn_fflags & NOTE_SUSPEND) {
6964 kn->kn_hook32 &= ~NOTE_RESUME;
6965 }
6966 if (kn->kn_fflags & NOTE_RESUME) {
6967 kn->kn_hook32 &= ~NOTE_SUSPEND;
6968 }
6969
6970 knote_fill_kevent(kn, kev, data);
6971 }
6972 return ret;
6973 }
6974
6975 static int
6976 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6977 {
6978 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6979
6980 /* socket locked */
6981 kn->kn_hook32 = 0;
6982 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6983 so->so_flags |= SOF_KNOTE;
6984 }
6985
6986 /* determine if event already fired */
6987 return filt_sockev_common(kn, NULL, so, 0);
6988 }
6989
6990 static void
6991 filt_sockdetach(struct knote *kn)
6992 {
6993 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6994 socket_lock(so, 1);
6995
6996 if ((so->so_flags & SOF_KNOTE) != 0) {
6997 if (KNOTE_DETACH(&so->so_klist, kn)) {
6998 so->so_flags &= ~SOF_KNOTE;
6999 }
7000 }
7001 socket_unlock(so, 1);
7002 }
7003
7004 static int
7005 filt_sockev(struct knote *kn, long hint)
7006 {
7007 int ret = 0, locked = 0;
7008 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7009 long ev_hint = (hint & SO_FILT_HINT_EV);
7010
7011 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7012 socket_lock(so, 1);
7013 locked = 1;
7014 }
7015
7016 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7017
7018 if (locked) {
7019 socket_unlock(so, 1);
7020 }
7021
7022 return ret;
7023 }
7024
7025
7026
7027 /*
7028 * filt_socktouch - update event state
7029 */
7030 static int
7031 filt_socktouch(
7032 struct knote *kn,
7033 struct kevent_qos_s *kev)
7034 {
7035 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7036 uint32_t changed_flags;
7037 int ret;
7038
7039 socket_lock(so, 1);
7040
7041 /* save off the [result] data and fflags */
7042 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7043
7044 /* save off the new input fflags and data */
7045 kn->kn_sfflags = kev->fflags;
7046 kn->kn_sdata = kev->data;
7047
7048 /* restrict the current results to the (smaller?) set of new interest */
7049 /*
7050 * For compatibility with previous implementations, we leave kn_fflags
7051 * as they were before.
7052 */
7053 //kn->kn_fflags &= kev->fflags;
7054
7055 /*
7056 * Since we keep track of events that are already
7057 * delivered, if any of those events are no longer
7058 * requested, the state related to them can be reset
7059 */
7060 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7061
7062 /* determine if we have events to deliver */
7063 ret = filt_sockev_common(kn, NULL, so, 0);
7064
7065 socket_unlock(so, 1);
7066
7067 return ret;
7068 }
7069
7070 /*
7071 * filt_sockprocess - query event fired state and return data
7072 */
7073 static int
7074 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7075 {
7076 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7077 int ret = 0;
7078
7079 socket_lock(so, 1);
7080
7081 ret = filt_sockev_common(kn, kev, so, 0);
7082
7083 socket_unlock(so, 1);
7084
7085 return ret;
7086 }
7087
7088 void
7089 get_sockev_state(struct socket *so, u_int32_t *statep)
7090 {
7091 u_int32_t state = *(statep);
7092
7093 /*
7094 * If the state variable was already set by a previous event,
7095 * leave it unchanged.
7096 */
7097 if (state != 0) {
7098 return;
7099 }
7100
7101 if (so->so_state & SS_ISCONNECTED) {
7102 state |= SOCKEV_CONNECTED;
7103 } else {
7104 state &= ~(SOCKEV_CONNECTED);
7105 }
7106 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7107 *(statep) = state;
7108 }
7109
7110 #define SO_LOCK_HISTORY_STR_LEN \
7111 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
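/*
 * The length above budgets 2 * SO_LCKDBG_MAX pointers printed as "%p:%p ":
 * each pointer takes at most 2 ("0x") + 2 * sizeof (void *) hex digits plus
 * one separator character (':' or ' '), and one extra byte is reserved for
 * the terminating NUL.
 */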
7112
7113 __private_extern__ const char *
7114 solockhistory_nr(struct socket *so)
7115 {
7116 size_t n = 0;
7117 int i;
7118 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7119
7120 bzero(lock_history_str, sizeof(lock_history_str));
7121 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7122 n += scnprintf(lock_history_str + n,
7123 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7124 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7125 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7126 }
7127 return lock_history_str;
7128 }
7129
7130 lck_mtx_t *
7131 socket_getlock(struct socket *so, int flags)
7132 {
7133 if (so->so_proto->pr_getlock != NULL) {
7134 return (*so->so_proto->pr_getlock)(so, flags);
7135 } else {
7136 return so->so_proto->pr_domain->dom_mtx;
7137 }
7138 }
7139
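/*
 * Socket locking helpers: socket_lock() and socket_unlock() use the
 * protocol-provided pr_lock/pr_unlock when present, otherwise the domain
 * mutex.  In the domain-mutex path a non-zero refcount argument also
 * adjusts so_usecount, and the caller's return address is recorded in the
 * lock_lr/unlock_lr rings consumed by solockhistory_nr().  Typical pattern:
 *
 *	socket_lock(so, 1);	// lock and take a use-count reference
 *	...			// operate on the socket
 *	socket_unlock(so, 1);	// drop the reference and unlock
 */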
7140 void
7141 socket_lock(struct socket *so, int refcount)
7142 {
7143 void *lr_saved;
7144
7145 lr_saved = __builtin_return_address(0);
7146
7147 if (so->so_proto->pr_lock) {
7148 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7149 } else {
7150 #ifdef MORE_LOCKING_DEBUG
7151 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7152 LCK_MTX_ASSERT_NOTOWNED);
7153 #endif
7154 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7155 if (refcount) {
7156 so->so_usecount++;
7157 }
7158 so->lock_lr[so->next_lock_lr] = lr_saved;
7159 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7160 }
7161 }
7162
7163 void
7164 socket_lock_assert_owned(struct socket *so)
7165 {
7166 lck_mtx_t *mutex_held;
7167
7168 if (so->so_proto->pr_getlock != NULL) {
7169 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7170 } else {
7171 mutex_held = so->so_proto->pr_domain->dom_mtx;
7172 }
7173
7174 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7175 }
7176
7177 int
7178 socket_try_lock(struct socket *so)
7179 {
7180 lck_mtx_t *mtx;
7181
7182 if (so->so_proto->pr_getlock != NULL) {
7183 mtx = (*so->so_proto->pr_getlock)(so, 0);
7184 } else {
7185 mtx = so->so_proto->pr_domain->dom_mtx;
7186 }
7187
7188 return lck_mtx_try_lock(mtx);
7189 }
7190
7191 void
7192 socket_unlock(struct socket *so, int refcount)
7193 {
7194 void *lr_saved;
7195 lck_mtx_t *mutex_held;
7196
7197 lr_saved = __builtin_return_address(0);
7198
7199 if (so == NULL || so->so_proto == NULL) {
7200 panic("%s: null so_proto so=%p\n", __func__, so);
7201 /* NOTREACHED */
7202 }
7203
7204 if (so->so_proto->pr_unlock) {
7205 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7206 } else {
7207 mutex_held = so->so_proto->pr_domain->dom_mtx;
7208 #ifdef MORE_LOCKING_DEBUG
7209 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7210 #endif
7211 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7212 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7213
7214 if (refcount) {
7215 if (so->so_usecount <= 0) {
7216 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7217 "lrh=%s", __func__, so->so_usecount, so,
7218 SOCK_DOM(so), so->so_type,
7219 SOCK_PROTO(so), solockhistory_nr(so));
7220 /* NOTREACHED */
7221 }
7222
7223 so->so_usecount--;
7224 if (so->so_usecount == 0) {
7225 sofreelastref(so, 1);
7226 }
7227 }
7228 lck_mtx_unlock(mutex_held);
7229 }
7230 }
7231
7232 /* Called with socket locked, will unlock socket */
7233 void
7234 sofree(struct socket *so)
7235 {
7236 lck_mtx_t *mutex_held;
7237
7238 if (so->so_proto->pr_getlock != NULL) {
7239 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7240 } else {
7241 mutex_held = so->so_proto->pr_domain->dom_mtx;
7242 }
7243 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7244
7245 sofreelastref(so, 0);
7246 }
7247
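/*
 * soreference() takes a use-count reference without leaving the socket
 * locked; sodereference() drops one reference and, via the unlock path
 * above, frees the socket when the last reference goes away.
 */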
7248 void
7249 soreference(struct socket *so)
7250 {
7251 socket_lock(so, 1); /* locks & takes one reference on socket */
7252 socket_unlock(so, 0); /* unlock only */
7253 }
7254
7255 void
7256 sodereference(struct socket *so)
7257 {
7258 socket_lock(so, 0);
7259 socket_unlock(so, 1);
7260 }
7261
7262 /*
7263 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7264 * possibility of using jumbo clusters. The caller must hold
7265 * the socket lock.
7266 */
7267 void
7268 somultipages(struct socket *so, boolean_t set)
7269 {
7270 if (set) {
7271 so->so_flags |= SOF_MULTIPAGES;
7272 } else {
7273 so->so_flags &= ~SOF_MULTIPAGES;
7274 }
7275 }
7276
7277 void
7278 soif2kcl(struct socket *so, boolean_t set)
7279 {
7280 if (set) {
7281 so->so_flags1 |= SOF1_IF_2KCL;
7282 } else {
7283 so->so_flags1 &= ~SOF1_IF_2KCL;
7284 }
7285 }
7286
7287 int
7288 so_isdstlocal(struct socket *so)
7289 {
7290 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7291
7292 if (SOCK_DOM(so) == PF_INET) {
7293 return inaddr_local(inp->inp_faddr);
7294 } else if (SOCK_DOM(so) == PF_INET6) {
7295 return in6addr_local(&inp->in6p_faddr);
7296 }
7297
7298 return 0;
7299 }
7300
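/*
 * Defuncting is a two-step process: sosetdefunct() checks eligibility, marks
 * the socket SOF_DEFUNCT and sets SB_DROP on both socket buffers so no
 * further data is appended; sodefunct() then shuts down both directions,
 * disconnects, and releases whatever data is still queued in the buffers.
 */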
7301 int
7302 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7303 {
7304 struct sockbuf *rcv, *snd;
7305 int err = 0, defunct;
7306
7307 rcv = &so->so_rcv;
7308 snd = &so->so_snd;
7309
7310 defunct = (so->so_flags & SOF_DEFUNCT);
7311 if (defunct) {
7312 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7313 panic("%s: SB_DROP not set", __func__);
7314 /* NOTREACHED */
7315 }
7316 goto done;
7317 }
7318
7319 if (so->so_flags & SOF_NODEFUNCT) {
7320 if (noforce) {
7321 err = EOPNOTSUPP;
7322 if (p != PROC_NULL) {
7323 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7324 "name %s level %d) so 0x%llx [%d,%d] "
7325 "is not eligible for defunct "
7326 "(%d)\n", __func__, proc_selfpid(),
7327 proc_best_name(current_proc()), proc_pid(p),
7328 proc_best_name(p), level,
7329 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7330 SOCK_DOM(so), SOCK_TYPE(so), err);
7331 }
7332 return err;
7333 }
7334 so->so_flags &= ~SOF_NODEFUNCT;
7335 if (p != PROC_NULL) {
7336 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7337 "name %s level %d) so 0x%llx [%d,%d] "
7338 "defunct by force "
7339 "(%d)\n", __func__, proc_selfpid(),
7340 proc_best_name(current_proc()), proc_pid(p),
7341 proc_best_name(p), level,
7342 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7343 SOCK_DOM(so), SOCK_TYPE(so), err);
7344 }
7345 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7346 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7347 struct ifnet *ifp = inp->inp_last_outifp;
7348
7349 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7350 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7351 } else if (so->so_flags & SOF_DELEGATED) {
7352 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7353 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7354 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7355 } else if (noforce && p != PROC_NULL) {
7356 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7357
7358 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7359 so->so_extended_bk_start = net_uptime();
7360 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7361
7362 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7363
7364 err = EOPNOTSUPP;
7365 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7366 "name %s level %d) so 0x%llx [%d,%d] "
7367 "extend bk idle "
7368 "(%d)\n", __func__, proc_selfpid(),
7369 proc_best_name(current_proc()), proc_pid(p),
7370 proc_best_name(p), level,
7371 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7372 SOCK_DOM(so), SOCK_TYPE(so), err);
7373 return err;
7374 } else {
7375 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7376 }
7377 }
7378
7379 so->so_flags |= SOF_DEFUNCT;
7380
7381 /* Prevent further data from being appended to the socket buffers */
7382 snd->sb_flags |= SB_DROP;
7383 rcv->sb_flags |= SB_DROP;
7384
7385 /* Flush any existing data in the socket buffers */
7386 if (rcv->sb_cc != 0) {
7387 rcv->sb_flags &= ~SB_SEL;
7388 selthreadclear(&rcv->sb_sel);
7389 sbrelease(rcv);
7390 }
7391 if (snd->sb_cc != 0) {
7392 snd->sb_flags &= ~SB_SEL;
7393 selthreadclear(&snd->sb_sel);
7394 sbrelease(snd);
7395 }
7396
7397 done:
7398 if (p != PROC_NULL) {
7399 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7400 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7401 proc_selfpid(), proc_best_name(current_proc()),
7402 proc_pid(p), proc_best_name(p), level,
7403 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7404 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7405 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7406 " extbkidle" : "");
7407 }
7408 return err;
7409 }
7410
7411 int
7412 sodefunct(struct proc *p, struct socket *so, int level)
7413 {
7414 struct sockbuf *rcv, *snd;
7415
7416 if (!(so->so_flags & SOF_DEFUNCT)) {
7417 panic("%s improperly called", __func__);
7418 /* NOTREACHED */
7419 }
7420 if (so->so_state & SS_DEFUNCT) {
7421 goto done;
7422 }
7423
7424 rcv = &so->so_rcv;
7425 snd = &so->so_snd;
7426
7427 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7428 char s[MAX_IPv6_STR_LEN];
7429 char d[MAX_IPv6_STR_LEN];
7430 struct inpcb *inp = sotoinpcb(so);
7431
7432 if (p != PROC_NULL) {
7433 SODEFUNCTLOG(
7434 "%s[%d, %s]: (target pid %d name %s level %d) "
7435 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7436 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7437 " snd_fl 0x%x]\n", __func__,
7438 proc_selfpid(), proc_best_name(current_proc()),
7439 proc_pid(p), proc_best_name(p), level,
7440 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7441 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7442 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7443 (void *)&inp->inp_laddr.s_addr :
7444 (void *)&inp->in6p_laddr),
7445 s, sizeof(s)), ntohs(inp->in6p_lport),
7446 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7447 (void *)&inp->inp_faddr.s_addr :
7448 (void *)&inp->in6p_faddr,
7449 d, sizeof(d)), ntohs(inp->in6p_fport),
7450 (uint32_t)rcv->sb_sel.si_flags,
7451 (uint32_t)snd->sb_sel.si_flags,
7452 rcv->sb_flags, snd->sb_flags);
7453 }
7454 } else if (p != PROC_NULL) {
7455 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7456 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7457 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7458 proc_selfpid(), proc_best_name(current_proc()),
7459 proc_pid(p), proc_best_name(p), level,
7460 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7461 SOCK_DOM(so), SOCK_TYPE(so),
7462 (uint32_t)rcv->sb_sel.si_flags,
7463 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7464 snd->sb_flags);
7465 }
7466
7467 /*
7468 * Unwedge threads blocked on sbwait() and sb_lock().
7469 */
7470 sbwakeup(rcv);
7471 sbwakeup(snd);
7472
7473 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7474 if (rcv->sb_flags & SB_LOCK) {
7475 sbunlock(rcv, TRUE); /* keep socket locked */
7476 }
7477 if (snd->sb_flags & SB_LOCK) {
7478 sbunlock(snd, TRUE); /* keep socket locked */
7479 }
7480 /*
7481 * Flush the buffers and disconnect. We explicitly call shutdown
7482 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7483 * states are set for the socket. This would also flush out data
7484 * hanging off the receive list of this socket.
7485 */
7486 (void) soshutdownlock_final(so, SHUT_RD);
7487 (void) soshutdownlock_final(so, SHUT_WR);
7488 (void) sodisconnectlocked(so);
7489
7490 /*
7491 * Explicitly handle connectionless-protocol disconnection
7492 * and release any remaining data in the socket buffers.
7493 */
7494 if (!(so->so_state & SS_ISDISCONNECTED)) {
7495 (void) soisdisconnected(so);
7496 }
7497
7498 if (so->so_error == 0) {
7499 so->so_error = EBADF;
7500 }
7501
7502 if (rcv->sb_cc != 0) {
7503 rcv->sb_flags &= ~SB_SEL;
7504 selthreadclear(&rcv->sb_sel);
7505 sbrelease(rcv);
7506 }
7507 if (snd->sb_cc != 0) {
7508 snd->sb_flags &= ~SB_SEL;
7509 selthreadclear(&snd->sb_sel);
7510 sbrelease(snd);
7511 }
7512 so->so_state |= SS_DEFUNCT;
7513 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7514
7515 done:
7516 return 0;
7517 }
7518
7519 int
7520 soresume(struct proc *p, struct socket *so, int locked)
7521 {
7522 if (locked == 0) {
7523 socket_lock(so, 1);
7524 }
7525
7526 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7527 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7528 "[%d,%d] resumed from bk idle\n",
7529 __func__, proc_selfpid(), proc_best_name(current_proc()),
7530 proc_pid(p), proc_best_name(p),
7531 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7532 SOCK_DOM(so), SOCK_TYPE(so));
7533
7534 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7535 so->so_extended_bk_start = 0;
7536 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7537
7538 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7539 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7540 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7541 }
7542 if (locked == 0) {
7543 socket_unlock(so, 1);
7544 }
7545
7546 return 0;
7547 }
7548
7549 /*
7550 * Does not attempt to account for sockets that are delegated from
7551 * the current process
7552 */
7553 int
7554 so_set_extended_bk_idle(struct socket *so, int optval)
7555 {
7556 int error = 0;
7557
7558 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7559 SOCK_PROTO(so) != IPPROTO_TCP) {
7560 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7561 error = EOPNOTSUPP;
7562 } else if (optval == 0) {
7563 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7564
7565 soresume(current_proc(), so, 1);
7566 } else {
7567 struct proc *p = current_proc();
7568 int i;
7569 struct filedesc *fdp;
7570 int count = 0;
7571
7572 /*
7573 * Unlock the socket to avoid a lock ordering issue with
7574 * the proc fd table lock
7575 */
7576 socket_unlock(so, 0);
7577
7578 proc_fdlock(p);
7579
7580 fdp = p->p_fd;
7581 for (i = 0; i < fdp->fd_nfiles; i++) {
7582 struct fileproc *fp = fdp->fd_ofiles[i];
7583 struct socket *so2;
7584
7585 if (fp == NULL ||
7586 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7587 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7588 continue;
7589 }
7590
7591 so2 = (struct socket *)fp->f_fglob->fg_data;
7592 if (so != so2 &&
7593 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7594 count++;
7595 }
7596 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7597 break;
7598 }
7599 }
7600 proc_fdunlock(p);
7601
7602 socket_lock(so, 0);
7603
7604 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7605 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7606 error = EBUSY;
7607 } else if (so->so_flags & SOF_DELEGATED) {
7608 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7609 error = EBUSY;
7610 } else {
7611 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7612 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7613 }
7614 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7615 "%s marked for extended bk idle\n",
7616 __func__, proc_selfpid(), proc_best_name(current_proc()),
7617 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7618 SOCK_DOM(so), SOCK_TYPE(so),
7619 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7620 "is" : "not");
7621 }
7622
7623 return error;
7624 }
7625
7626 static void
7627 so_stop_extended_bk_idle(struct socket *so)
7628 {
7629 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7630 so->so_extended_bk_start = 0;
7631
7632 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7633 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7634 /*
7635 * Force defunct
7636 */
7637 sosetdefunct(current_proc(), so,
7638 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7639 if (so->so_flags & SOF_DEFUNCT) {
7640 sodefunct(current_proc(), so,
7641 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7642 }
7643 }
7644
7645 void
7646 so_drain_extended_bk_idle(struct socket *so)
7647 {
7648 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7649 /*
7650 * Only penalize sockets that have outstanding data
7651 */
7652 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7653 so_stop_extended_bk_idle(so);
7654
7655 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7656 }
7657 }
7658 }
7659
7660 /*
7661 * The return value tells whether the socket is still in extended background idle mode
7662 */
7663 int
7664 so_check_extended_bk_idle_time(struct socket *so)
7665 {
7666 int ret = 1;
7667
7668 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7669 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7670 __func__, proc_selfpid(), proc_best_name(current_proc()),
7671 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7672 SOCK_DOM(so), SOCK_TYPE(so));
7673 if (net_uptime() - so->so_extended_bk_start >
7674 soextbkidlestat.so_xbkidle_time) {
7675 so_stop_extended_bk_idle(so);
7676
7677 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7678
7679 ret = 0;
7680 } else {
7681 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7682
7683 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7684 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7685 }
7686 }
7687
7688 return ret;
7689 }
7690
7691 void
7692 resume_proc_sockets(proc_t p)
7693 {
7694 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7695 struct filedesc *fdp;
7696 int i;
7697
7698 proc_fdlock(p);
7699 fdp = p->p_fd;
7700 for (i = 0; i < fdp->fd_nfiles; i++) {
7701 struct fileproc *fp;
7702 struct socket *so;
7703
7704 fp = fdp->fd_ofiles[i];
7705 if (fp == NULL ||
7706 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7707 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7708 continue;
7709 }
7710
7711 so = (struct socket *)fp->f_fglob->fg_data;
7712 (void) soresume(p, so, 0);
7713 }
7714 proc_fdunlock(p);
7715
7716 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7717 }
7718 }
7719
7720 __private_extern__ int
7721 so_set_recv_anyif(struct socket *so, int optval)
7722 {
7723 int ret = 0;
7724
7725 #if INET6
7726 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7727 #else
7728 if (SOCK_DOM(so) == PF_INET) {
7729 #endif /* !INET6 */
7730 if (optval) {
7731 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7732 } else {
7733 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7734 }
7735 }
7736
7737
7738 return ret;
7739 }
7740
7741 __private_extern__ int
7742 so_get_recv_anyif(struct socket *so)
7743 {
7744 int ret = 0;
7745
7746 #if INET6
7747 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7748 #else
7749 if (SOCK_DOM(so) == PF_INET) {
7750 #endif /* !INET6 */
7751 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7752 }
7753
7754 return ret;
7755 }
7756
7757 int
7758 so_set_restrictions(struct socket *so, uint32_t vals)
7759 {
7760 int nocell_old, nocell_new;
7761 int noexpensive_old, noexpensive_new;
7762 int noconstrained_old, noconstrained_new;
7763
7764 /*
7765 * Deny-type restrictions are trapdoors; once set they cannot be
7766 * unset for the lifetime of the socket. This allows them to be
7767 * issued by a framework on behalf of the application without
7768 * having to worry that they can be undone.
7769 *
7770 * Note here that socket-level restrictions override any protocol-
7771 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7772 * restriction issued on the socket has a higher precedence
7773 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7774 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7775 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7776 */
7777 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7778 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7779 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7780 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7781 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7782 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7783 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7784 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7785 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7786
7787 /* we can only set, not clear restrictions */
7788 if ((nocell_new - nocell_old) == 0 &&
7789 (noexpensive_new - noexpensive_old) == 0 &&
7790 (noconstrained_new - noconstrained_old) == 0) {
7791 return 0;
7792 }
7793 #if INET6
7794 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7795 #else
7796 if (SOCK_DOM(so) == PF_INET) {
7797 #endif /* !INET6 */
7798 if (nocell_new - nocell_old != 0) {
7799 /*
7800 * if deny cellular is now set, do what's needed
7801 * for INPCB
7802 */
7803 inp_set_nocellular(sotoinpcb(so));
7804 }
7805 if (noexpensive_new - noexpensive_old != 0) {
7806 inp_set_noexpensive(sotoinpcb(so));
7807 }
7808 if (noconstrained_new - noconstrained_old != 0) {
7809 inp_set_noconstrained(sotoinpcb(so));
7810 }
7811 }
7812
7813 if (SOCK_DOM(so) == PF_MULTIPATH) {
7814 mptcp_set_restrictions(so);
7815 }
7816
7817 return 0;
7818 }
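/*
 * Sketch of how a deny restriction might be applied from user space,
 * assuming the private SO_RESTRICTIONS socket option is the path that
 * reaches so_set_restrictions() (illustrative only):
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *
 * Once set, a deny restriction cannot be cleared for the socket's lifetime.
 */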
7819
7820 uint32_t
7821 so_get_restrictions(struct socket *so)
7822 {
7823 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7824 SO_RESTRICT_DENY_OUT |
7825 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7826 }
7827
7828 int
7829 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7830 {
7831 struct proc *ep = PROC_NULL;
7832 int error = 0;
7833
7834 /* pid 0 is reserved for kernel */
7835 if (epid == 0) {
7836 error = EINVAL;
7837 goto done;
7838 }
7839
7840 /*
7841 * If this is an in-kernel socket, prevent its delegate
7842 * association from changing unless the socket option is
7843 * coming from within the kernel itself.
7844 */
7845 if (so->last_pid == 0 && p != kernproc) {
7846 error = EACCES;
7847 goto done;
7848 }
7849
7850 /*
7851 * If this is issued by a process that's recorded as the
7852 * real owner of the socket, or if the pid is the same as
7853 * the process's own pid, then proceed. Otherwise ensure
7854 * that the issuing process has the necessary privileges.
7855 */
7856 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7857 if ((error = priv_check_cred(kauth_cred_get(),
7858 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7859 error = EACCES;
7860 goto done;
7861 }
7862 }
7863
7864 /* Find the process that corresponds to the effective pid */
7865 if ((ep = proc_find(epid)) == PROC_NULL) {
7866 error = ESRCH;
7867 goto done;
7868 }
7869
7870 /*
7871 * If a process tries to delegate the socket to itself, then
7872 * there's really nothing to do; treat it as a way for the
7873 * delegate association to be cleared. Note that we check
7874 * the passed-in proc rather than calling proc_selfpid(),
7875 * as we need to check the process issuing the socket option
7876 * which could be kernproc. Given that we don't allow 0 for
7877 * effective pid, it means that a delegated in-kernel socket
7878 * stays delegated during its lifetime (which is probably OK.)
7879 */
7880 if (epid == proc_pid(p)) {
7881 so->so_flags &= ~SOF_DELEGATED;
7882 so->e_upid = 0;
7883 so->e_pid = 0;
7884 uuid_clear(so->e_uuid);
7885 } else {
7886 so->so_flags |= SOF_DELEGATED;
7887 so->e_upid = proc_uniqueid(ep);
7888 so->e_pid = proc_pid(ep);
7889 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7890
7891 #if defined(XNU_TARGET_OS_OSX)
7892 if (ep->p_responsible_pid != so->e_pid) {
7893 proc_t rp = proc_find(ep->p_responsible_pid);
7894 if (rp != PROC_NULL) {
7895 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7896 so->so_rpid = ep->p_responsible_pid;
7897 proc_rele(rp);
7898 } else {
7899 uuid_clear(so->so_ruuid);
7900 so->so_rpid = -1;
7901 }
7902 }
7903 #endif
7904 }
7905 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7906 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7907 }
7908 done:
7909 if (error == 0 && net_io_policy_log) {
7910 uuid_string_t buf;
7911
7912 uuid_unparse(so->e_uuid, buf);
7913 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7914 "euuid %s%s\n", __func__, proc_name_address(p),
7915 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7916 SOCK_DOM(so), SOCK_TYPE(so),
7917 so->e_pid, proc_name_address(ep), buf,
7918 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7919 } else if (error != 0 && net_io_policy_log) {
7920 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7921 "ERROR (%d)\n", __func__, proc_name_address(p),
7922 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7923 SOCK_DOM(so), SOCK_TYPE(so),
7924 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7925 proc_name_address(ep), error);
7926 }
7927
7928 /* Update this socket's policy upon success */
7929 if (error == 0) {
7930 so->so_policy_gencnt *= -1;
7931 so_update_policy(so);
7932 #if NECP
7933 so_update_necp_policy(so, NULL, NULL);
7934 #endif /* NECP */
7935 }
7936
7937 if (ep != PROC_NULL) {
7938 proc_rele(ep);
7939 }
7940
7941 return error;
7942 }
7943
7944 int
7945 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7946 {
7947 uuid_string_t buf;
7948 uuid_t uuid;
7949 int error = 0;
7950
7951 /* UUID must not be all-zeroes (reserved for kernel) */
7952 if (uuid_is_null(euuid)) {
7953 error = EINVAL;
7954 goto done;
7955 }
7956
7957 /*
7958 * If this is an in-kernel socket, prevent its delegate
7959 * association from changing unless the socket option is
7960 * coming from within the kernel itself.
7961 */
7962 if (so->last_pid == 0 && p != kernproc) {
7963 error = EACCES;
7964 goto done;
7965 }
7966
7967 /* Get the UUID of the issuing process */
7968 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7969
7970 /*
7971 * If this is issued by a process that's recorded as the
7972 * real owner of the socket, or if the uuid is the same as
7973 * the process's own uuid, then proceed. Otherwise ensure
7974 * that the issuing process has the necessary privileges.
7975 */
7976 if (check_cred &&
7977 (uuid_compare(euuid, so->last_uuid) != 0 ||
7978 uuid_compare(euuid, uuid) != 0)) {
7979 if ((error = priv_check_cred(kauth_cred_get(),
7980 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7981 error = EACCES;
7982 goto done;
7983 }
7984 }
7985
7986 /*
7987 * If a process tries to delegate the socket to itself, then
7988 * there's really nothing to do; treat it as a way for the
7989 * delegate association to be cleared. Note that we check
7990 * the uuid of the passed-in proc rather than that of the
7991 * current process, as we need to check the process issuing
7992 * the socket option which could be kernproc itself. Given
7993 * that we don't allow 0 for effective uuid, it means that
7994 * a delegated in-kernel socket stays delegated during its
7995 * lifetime (which is okay.)
7996 */
7997 if (uuid_compare(euuid, uuid) == 0) {
7998 so->so_flags &= ~SOF_DELEGATED;
7999 so->e_upid = 0;
8000 so->e_pid = 0;
8001 uuid_clear(so->e_uuid);
8002 } else {
8003 so->so_flags |= SOF_DELEGATED;
8004 /*
8005 * Unlike so_set_effective_pid(), we only have the UUID
8006 * here and the process ID is not known. Inherit the
8007 * real {pid,upid} of the socket.
8008 */
8009 so->e_upid = so->last_upid;
8010 so->e_pid = so->last_pid;
8011 uuid_copy(so->e_uuid, euuid);
8012 }
8013 /*
8014 * The following will clear the effective process name as it's the same
8015 * as the real process
8016 */
8017 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8018 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8019 }
8020 done:
8021 if (error == 0 && net_io_policy_log) {
8022 uuid_unparse(so->e_uuid, buf);
8023 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8024 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8025 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8026 SOCK_TYPE(so), so->e_pid, buf,
8027 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8028 } else if (error != 0 && net_io_policy_log) {
8029 uuid_unparse(euuid, buf);
8030 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8031 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8032 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8033 SOCK_TYPE(so), buf, error);
8034 }
8035
8036 /* Update this socket's policy upon success */
8037 if (error == 0) {
8038 so->so_policy_gencnt *= -1;
8039 so_update_policy(so);
8040 #if NECP
8041 so_update_necp_policy(so, NULL, NULL);
8042 #endif /* NECP */
8043 }
8044
8045 return error;
8046 }
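/*
 * Sketch of the user-space side of delegation, assuming the private
 * SO_DELEGATED (pid) and SO_DELEGATED_UUID options are what funnel into
 * so_set_effective_pid() and so_set_effective_uuid() above (illustrative
 * only; delegating on behalf of another process requires the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege):
 *
 *	pid_t epid = target_pid;
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 */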
8047
8048 void
8049 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8050 uint32_t ev_datalen)
8051 {
8052 struct kev_msg ev_msg;
8053
8054 /*
8055 * A netpolicy event always starts with a netpolicy_event_data
8056 * structure, but the caller can provide for a longer event
8057 * structure to post, depending on the event code.
8058 */
8059 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8060
8061 bzero(&ev_msg, sizeof(ev_msg));
8062 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8063 ev_msg.kev_class = KEV_NETWORK_CLASS;
8064 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8065 ev_msg.event_code = ev_code;
8066
8067 ev_msg.dv[0].data_ptr = ev_data;
8068 ev_msg.dv[0].data_length = ev_datalen;
8069
8070 kev_post_msg(&ev_msg);
8071 }
8072
8073 void
8074 socket_post_kev_msg(uint32_t ev_code,
8075 struct kev_socket_event_data *ev_data,
8076 uint32_t ev_datalen)
8077 {
8078 struct kev_msg ev_msg;
8079
8080 bzero(&ev_msg, sizeof(ev_msg));
8081 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8082 ev_msg.kev_class = KEV_NETWORK_CLASS;
8083 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8084 ev_msg.event_code = ev_code;
8085
8086 ev_msg.dv[0].data_ptr = ev_data;
8087 ev_msg.dv[0].data_length = ev_datalen;
8088
8089 kev_post_msg(&ev_msg);
8090 }
8091
8092 void
8093 socket_post_kev_msg_closed(struct socket *so)
8094 {
8095 struct kev_socket_closed ev;
8096 struct sockaddr *socksa = NULL, *peersa = NULL;
8097 int err;
8098 bzero(&ev, sizeof(ev));
8099 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8100 if (err == 0) {
8101 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8102 &peersa);
8103 if (err == 0) {
8104 memcpy(&ev.ev_data.kev_sockname, socksa,
8105 min(socksa->sa_len,
8106 sizeof(ev.ev_data.kev_sockname)));
8107 memcpy(&ev.ev_data.kev_peername, peersa,
8108 min(peersa->sa_len,
8109 sizeof(ev.ev_data.kev_peername)));
8110 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8111 &ev.ev_data, sizeof(ev));
8112 }
8113 }
8114 if (socksa != NULL) {
8115 FREE(socksa, M_SONAME);
8116 }
8117 if (peersa != NULL) {
8118 FREE(peersa, M_SONAME);
8119 }
8120 }