bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/net_api_stats.h>
 102 #include <net/ntstat.h>
 103 #include <net/content_filter.h>
 104 #include <netinet/in.h>
 105 #include <netinet/in_pcb.h>
 106 #include <netinet/in_tclass.h>
 107 #include <netinet/in_var.h>
 108 #include <netinet/tcp_var.h>
 109 #include <netinet/ip6.h>
 110 #include <netinet6/ip6_var.h>
 111 #include <netinet/flow_divert.h>
 112 #include <kern/zalloc.h>
 113 #include <kern/locks.h>
 114 #include <machine/limits.h>
 115 #include <libkern/OSAtomic.h>
 116 #include <pexpert/pexpert.h>
 117 #include <kern/assert.h>
 118 #include <kern/task.h>
 119 #include <kern/policy_internal.h>
 120
 121 #include <sys/kpi_mbuf.h>
 122 #include <sys/mcache.h>
 123 #include <sys/unpcb.h>
 124 #include <libkern/section_keywords.h>
 125
 126 #if CONFIG_MACF
 127 #include <security/mac_framework.h>
 128 #endif /* MAC */
 129
 130 #if MULTIPATH
 131 #include <netinet/mp_pcb.h>
 132 #include <netinet/mptcp_var.h>
 133 #endif /* MULTIPATH */
 134
 135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 136
 137 #if DEBUG || DEVELOPMENT
 138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 139 #else
 140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 141 #endif
 142
 143 /* TODO: this should be in a header file somewhere */
 144 extern char *proc_name_address(void *p);
 145
 146 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 147 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 148 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 149 static u_int32_t        cached_sock_count = 0;
 150 STAILQ_HEAD(, socket)   so_cache_head;
 151 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 152 static u_int32_t        so_cache_time;
 153 static int              socketinit_done;
 154 static struct zone      *so_cache_zone;
 155
 156 static lck_grp_t        *so_cache_mtx_grp;
 157 static lck_attr_t       *so_cache_mtx_attr;
 158 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 159 static lck_mtx_t        *so_cache_mtx;
 160
 161 #include <machine/limits.h>
 162
 163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
 164 static void     filt_sordetach(struct knote *kn);
 165 static int      filt_soread(struct knote *kn, long hint);
 166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
 167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
 168
 169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
 170 static void     filt_sowdetach(struct knote *kn);
 171 static int      filt_sowrite(struct knote *kn, long hint);
 172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
 173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
 174
 175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
 176 static void     filt_sockdetach(struct knote *kn);
 177 static int      filt_sockev(struct knote *kn, long hint);
 178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
 179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
 180
 181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 183
 184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
 185         .f_isfd = 1,
 186         .f_attach = filt_sorattach,
 187         .f_detach = filt_sordetach,
 188         .f_event = filt_soread,
 189         .f_touch = filt_sortouch,
 190         .f_process = filt_sorprocess,
 191 };
 192
 193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
 194         .f_isfd = 1,
 195         .f_attach = filt_sowattach,
 196         .f_detach = filt_sowdetach,
 197         .f_event = filt_sowrite,
 198         .f_touch = filt_sowtouch,
 199         .f_process = filt_sowprocess,
 200 };
 201
 202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
 203         .f_isfd = 1,
 204         .f_attach = filt_sockattach,
 205         .f_detach = filt_sockdetach,
 206         .f_event = filt_sockev,
 207         .f_touch = filt_socktouch,
 208         .f_process = filt_sockprocess,
 209 };
 210
 211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
 212         .f_isfd = 1,
 213         .f_attach = filt_sorattach,
 214         .f_detach = filt_sordetach,
 215         .f_event = filt_soread,
 216         .f_touch = filt_sortouch,
 217         .f_process = filt_sorprocess,
 218 };
 219
 220 SYSCTL_DECL(_kern_ipc);
 221
 222 #define EVEN_MORE_LOCKING_DEBUG 0
 223
 224 int socket_debug = 0;
 225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 227
 228 static unsigned long sodefunct_calls = 0;
 229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 230     &sodefunct_calls, "");
 231
 232 static int socket_zone = M_SOCKET;
 233 so_gen_t        so_gencnt;      /* generation count for sockets */
 234
 235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 237
 238 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 239 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 240 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 241 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 242 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 243 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 244 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 245 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 246 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 247
 248 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 249
 250 int somaxconn = SOMAXCONN;
 251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 252     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 253
 254 /* Should we get a maximum also ??? */
 255 static int sosendmaxchain = 65536;
 256 static int sosendminchain = 16384;
 257 static int sorecvmincopy  = 16384;
 258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 259     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 261     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 262
 263 /*
 264  * Set to enable jumbo clusters (if available) for large writes when
 265  * the socket is marked with SOF_MULTIPAGES; see below.
 266  */
 267 int sosendjcl = 1;
 268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 269     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 270
 271 /*
 272  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 273  * writes on the socket for all protocols on any network interfaces,
 274  * depending upon sosendjcl above.  Be extra careful when setting this
 275  * to 1, because sending down packets that cross physical pages down to
 276  * broken drivers (those that falsely assume that the physical pages
 277  * are contiguous) might lead to system panics or silent data corruption.
 278  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 279  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 280  * capable.  Set this to 1 only for testing/debugging purposes.
 281  */
 282 int sosendjcl_ignore_capab = 0;
 283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 284     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 285
 286 /*
 287  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 288  * writes on the socket for all protocols on any network interfaces.
 289  * Be extra careful when setting this to 1, because sending down packets with
 290  * clusters larger that 2 KB might lead to system panics or data corruption.
 291  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 292  * on the outgoing interface
 293  * Set this to 1  for testing/debugging purposes only.
 294  */
 295 int sosendbigcl_ignore_capab = 0;
 296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 297     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 298
 299 int sodefunctlog = 0;
 300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 301     &sodefunctlog, 0, "");
 302
 303 int sothrottlelog = 0;
 304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 305     &sothrottlelog, 0, "");
 306
 307 int sorestrictrecv = 1;
 308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 309     &sorestrictrecv, 0, "Enable inbound interface restrictions");
 310
 311 int sorestrictsend = 1;
 312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 313     &sorestrictsend, 0, "Enable outbound interface restrictions");
 314
 315 int soreserveheadroom = 1;
 316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 317     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 318
 319 #if (DEBUG || DEVELOPMENT)
 320 int so_notsent_lowat_check = 1;
 321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
 322     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 323 #endif /* DEBUG || DEVELOPMENT */
 324
 325 int so_accept_list_waits = 0;
 326 #if (DEBUG || DEVELOPMENT)
 327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
 328     &so_accept_list_waits, 0, "number of waits for listener incomp list");
 329 #endif /* DEBUG || DEVELOPMENT */
 330
 331 extern struct inpcbinfo tcbinfo;
 332
 333 /* TODO: these should be in header file */
 334 extern int get_inpcb_str_size(void);
 335 extern int get_tcp_str_size(void);
 336
 337 vm_size_t       so_cache_zone_element_size;
 338
 339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 340     user_ssize_t *);
 341 static void cached_sock_alloc(struct socket **, int);
 342 static void cached_sock_free(struct socket *);
 343
 344 /*
 345  * Maximum of extended background idle sockets per process
 346  * Set to zero to disable further setting of the option
 347  */
 348
 349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 350 #define SO_IDLE_BK_IDLE_TIME            600
 351 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 352
 353 struct soextbkidlestat soextbkidlestat;
 354
 355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 356     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 357     "Maximum of extended background idle sockets per process");
 358
 359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 360     &soextbkidlestat.so_xbkidle_time, 0,
 361     "Time in seconds to keep extended background idle sockets");
 362
 363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 364     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 365     "High water mark for extended background idle sockets");
 366
 367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 368     &soextbkidlestat, soextbkidlestat, "");
 369
 370 int so_set_extended_bk_idle(struct socket *, int);
 371
 372
 373 /*
 374  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 375  * setting the DSCP code on the packet based on the service class; see
 376  * <rdar://problem/11277343> for details.
 377  */
 378 __private_extern__ u_int32_t sotcdb = 0;
 379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 380     &sotcdb, 0, "");
 381
 382 void
 383 socketinit(void)
 384 {
 385         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 386         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 387
 388 #ifdef __LP64__
 389         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 393         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 394         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 395 #else
 396         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 400         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 401         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 402 #endif
 403
 404         if (socketinit_done) {
 405                 printf("socketinit: already called...\n");
 406                 return;
 407         }
 408         socketinit_done = 1;
 409
 410         PE_parse_boot_argn("socket_debug", &socket_debug,
 411             sizeof(socket_debug));
 412
 413         /*
 414          * allocate lock group attribute and group for socket cache mutex
 415          */
 416         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 417         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 418             so_cache_mtx_grp_attr);
 419
 420         /*
 421          * allocate the lock attribute for socket cache mutex
 422          */
 423         so_cache_mtx_attr = lck_attr_alloc_init();
 424
 425         /* cached sockets mutex */
 426         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 427         if (so_cache_mtx == NULL) {
 428                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 429                 /* NOTREACHED */
 430         }
 431         STAILQ_INIT(&so_cache_head);
 432
 433         so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
 434             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 435
 436         so_cache_zone = zinit(so_cache_zone_element_size,
 437             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 438         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 439         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 440
 441         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 442         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 443         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 444         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 445
 446         in_pcbinit();
 447         sflt_init();
 448         socket_tclass_init();
 449 #if MULTIPATH
 450         mp_pcbinit();
 451 #endif /* MULTIPATH */
 452 }
 453
 454 static void
 455 cached_sock_alloc(struct socket **so, int waitok)
 456 {
 457         caddr_t temp;
 458         uintptr_t offset;
 459
 460         lck_mtx_lock(so_cache_mtx);
 461
 462         if (!STAILQ_EMPTY(&so_cache_head)) {
 463                 VERIFY(cached_sock_count > 0);
 464
 465                 *so = STAILQ_FIRST(&so_cache_head);
 466                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 467                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 468
 469                 cached_sock_count--;
 470                 lck_mtx_unlock(so_cache_mtx);
 471
 472                 temp = (*so)->so_saved_pcb;
 473                 bzero((caddr_t)*so, sizeof(struct socket));
 474
 475                 (*so)->so_saved_pcb = temp;
 476         } else {
 477                 lck_mtx_unlock(so_cache_mtx);
 478
 479                 if (waitok) {
 480                         *so = (struct socket *)zalloc(so_cache_zone);
 481                 } else {
 482                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 483                 }
 484
 485                 if (*so == NULL) {
 486                         return;
 487                 }
 488
 489                 bzero((caddr_t)*so, sizeof(struct socket));
 490
 491                 /*
 492                  * Define offsets for extra structures into our
 493                  * single block of memory. Align extra structures
 494                  * on longword boundaries.
 495                  */
 496
 497                 offset = (uintptr_t)*so;
 498                 offset += sizeof(struct socket);
 499
 500                 offset = ALIGN(offset);
 501
 502                 (*so)->so_saved_pcb = (caddr_t)offset;
 503                 offset += get_inpcb_str_size();
 504
 505                 offset = ALIGN(offset);
 506
 507                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 508                     (caddr_t)offset;
 509         }
 510
 511         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 512 }
 513
 514 static void
 515 cached_sock_free(struct socket *so)
 516 {
 517         lck_mtx_lock(so_cache_mtx);
 518
 519         so_cache_time = net_uptime();
 520         if (++cached_sock_count > max_cached_sock_count) {
 521                 --cached_sock_count;
 522                 lck_mtx_unlock(so_cache_mtx);
 523                 zfree(so_cache_zone, so);
 524         } else {
 525                 if (so_cache_hw < cached_sock_count) {
 526                         so_cache_hw = cached_sock_count;
 527                 }
 528
 529                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 530
 531                 so->cache_timestamp = so_cache_time;
 532                 lck_mtx_unlock(so_cache_mtx);
 533         }
 534 }
 535
 536 void
 537 so_update_last_owner_locked(struct socket *so, proc_t self)
 538 {
 539         if (so->last_pid != 0) {
 540                 /*
 541                  * last_pid and last_upid should remain zero for sockets
 542                  * created using sock_socket. The check above achieves that
 543                  */
 544                 if (self == PROC_NULL) {
 545                         self = current_proc();
 546                 }
 547
 548                 if (so->last_upid != proc_uniqueid(self) ||
 549                     so->last_pid != proc_pid(self)) {
 550                         so->last_upid = proc_uniqueid(self);
 551                         so->last_pid = proc_pid(self);
 552                         proc_getexecutableuuid(self, so->last_uuid,
 553                             sizeof(so->last_uuid));
 554                         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
 555                                 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
 556                         }
 557                 }
 558                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 559         }
 560 }
 561
 562 void
 563 so_update_policy(struct socket *so)
 564 {
 565         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 566                 (void) inp_update_policy(sotoinpcb(so));
 567         }
 568 }
 569
 570 #if NECP
 571 static void
 572 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 573     struct sockaddr *override_remote_addr)
 574 {
 575         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 576                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 577                     override_remote_addr, 0);
 578         }
 579 }
 580 #endif /* NECP */
 581
 582 boolean_t
 583 so_cache_timer(void)
 584 {
 585         struct socket   *p;
 586         int             n_freed = 0;
 587         boolean_t rc = FALSE;
 588
 589         lck_mtx_lock(so_cache_mtx);
 590         so_cache_timeouts++;
 591         so_cache_time = net_uptime();
 592
 593         while (!STAILQ_EMPTY(&so_cache_head)) {
 594                 VERIFY(cached_sock_count > 0);
 595                 p = STAILQ_FIRST(&so_cache_head);
 596                 if ((so_cache_time - p->cache_timestamp) <
 597                     SO_CACHE_TIME_LIMIT) {
 598                         break;
 599                 }
 600
 601                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 602                 --cached_sock_count;
 603
 604                 zfree(so_cache_zone, p);
 605
 606                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 607                         so_cache_max_freed++;
 608                         break;
 609                 }
 610         }
 611
 612         /* Schedule again if there is more to cleanup */
 613         if (!STAILQ_EMPTY(&so_cache_head)) {
 614                 rc = TRUE;
 615         }
 616
 617         lck_mtx_unlock(so_cache_mtx);
 618         return rc;
 619 }
 620
 621 /*
 622  * Get a socket structure from our zone, and initialize it.
 623  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 624  * Note that it would probably be better to allocate socket
 625  * and PCB at the same time, but I'm not convinced that all
 626  * the protocols can be easily modified to do this.
 627  */
 628 struct socket *
 629 soalloc(int waitok, int dom, int type)
 630 {
 631         struct socket *so;
 632
 633         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 634                 cached_sock_alloc(&so, waitok);
 635         } else {
 636                 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
 637                     M_WAITOK);
 638                 if (so != NULL) {
 639                         bzero(so, sizeof(*so));
 640                 }
 641         }
 642         if (so != NULL) {
 643                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 644                 so->so_zone = socket_zone;
 645
 646                 /*
 647                  * Increment the socket allocation statistics
 648                  */
 649                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
 650
 651 #if CONFIG_MACF_SOCKET
 652                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 653                 if (mac_socket_label_init(so, !waitok) != 0) {
 654                         sodealloc(so);
 655                         return NULL;
 656                 }
 657 #endif /* MAC_SOCKET */
 658         }
 659
 660         return so;
 661 }
 662
 663 int
 664 socreate_internal(int dom, struct socket **aso, int type, int proto,
 665     struct proc *p, uint32_t flags, struct proc *ep)
 666 {
 667         struct protosw *prp;
 668         struct socket *so;
 669         int error = 0;
 670
 671 #if TCPDEBUG
 672         extern int tcpconsdebug;
 673 #endif
 674
 675         VERIFY(aso != NULL);
 676         *aso = NULL;
 677
 678         if (proto != 0) {
 679                 prp = pffindproto(dom, proto, type);
 680         } else {
 681                 prp = pffindtype(dom, type);
 682         }
 683
 684         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 685                 if (pffinddomain(dom) == NULL) {
 686                         return EAFNOSUPPORT;
 687                 }
 688                 if (proto != 0) {
 689                         if (pffindprotonotype(dom, proto) != NULL) {
 690                                 return EPROTOTYPE;
 691                         }
 692                 }
 693                 return EPROTONOSUPPORT;
 694         }
 695         if (prp->pr_type != type) {
 696                 return EPROTOTYPE;
 697         }
 698         so = soalloc(1, dom, type);
 699         if (so == NULL) {
 700                 return ENOBUFS;
 701         }
 702
 703         switch (dom) {
 704         case PF_LOCAL:
 705                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
 706                 break;
 707         case PF_INET:
 708                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
 709                 if (type == SOCK_STREAM) {
 710                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
 711                 } else {
 712                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
 713                 }
 714                 break;
 715         case PF_ROUTE:
 716                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
 717                 break;
 718         case PF_NDRV:
 719                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
 720                 break;
 721         case PF_KEY:
 722                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
 723                 break;
 724         case PF_INET6:
 725                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
 726                 if (type == SOCK_STREAM) {
 727                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
 728                 } else {
 729                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
 730                 }
 731                 break;
 732         case PF_SYSTEM:
 733                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
 734                 break;
 735         case PF_MULTIPATH:
 736                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
 737                 break;
 738         default:
 739                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
 740                 break;
 741         }
 742
 743         if (flags & SOCF_MPTCP) {
 744                 so->so_state |= SS_NBIO;
 745         }
 746
 747         TAILQ_INIT(&so->so_incomp);
 748         TAILQ_INIT(&so->so_comp);
 749         so->so_type = type;
 750         so->last_upid = proc_uniqueid(p);
 751         so->last_pid = proc_pid(p);
 752         proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
 753         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 754
 755         if (ep != PROC_NULL && ep != p) {
 756                 so->e_upid = proc_uniqueid(ep);
 757                 so->e_pid = proc_pid(ep);
 758                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
 759                 so->so_flags |= SOF_DELEGATED;
 760         }
 761
 762         so->so_cred = kauth_cred_proc_ref(p);
 763         if (!suser(kauth_cred_get(), NULL)) {
 764                 so->so_state |= SS_PRIV;
 765         }
 766
 767         so->so_proto = prp;
 768         so->so_rcv.sb_flags |= SB_RECV;
 769         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 770         so->next_lock_lr = 0;
 771         so->next_unlock_lr = 0;
 772
 773 #if CONFIG_MACF_SOCKET
 774         mac_socket_label_associate(kauth_cred_get(), so);
 775 #endif /* MAC_SOCKET */
 776
 777         /*
 778          * Attachment will create the per pcb lock if necessary and
 779          * increase refcount for creation, make sure it's done before
 780          * socket is inserted in lists.
 781          */
 782         so->so_usecount++;
 783
 784         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 785         if (error != 0) {
 786                 /*
 787                  * Warning:
 788                  * If so_pcb is not zero, the socket will be leaked,
 789                  * so protocol attachment handler must be coded carefuly
 790                  */
 791                 so->so_state |= SS_NOFDREF;
 792                 VERIFY(so->so_usecount > 0);
 793                 so->so_usecount--;
 794                 sofreelastref(so, 1);   /* will deallocate the socket */
 795                 return error;
 796         }
 797
 798         /*
 799          * Note: needs so_pcb to be set after pru_attach
 800          */
 801         if (prp->pr_update_last_owner != NULL) {
 802                 (*prp->pr_update_last_owner)(so, p, ep);
 803         }
 804
 805         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 806         TAILQ_INIT(&so->so_evlist);
 807
 808         /* Attach socket filters for this protocol */
 809         sflt_initsock(so);
 810 #if TCPDEBUG
 811         if (tcpconsdebug == 2) {
 812                 so->so_options |= SO_DEBUG;
 813         }
 814 #endif
 815         so_set_default_traffic_class(so);
 816
 817         /*
 818          * If this thread or task is marked to create backgrounded sockets,
 819          * mark the socket as background.
 820          */
 821         if (!(flags & SOCF_MPTCP) &&
 822             proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
 823                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 824                 so->so_background_thread = current_thread();
 825         }
 826
 827         switch (dom) {
 828         /*
 829          * Don't mark Unix domain or system
 830          * eligible for defunct by default.
 831          */
 832         case PF_LOCAL:
 833         case PF_SYSTEM:
 834                 so->so_flags |= SOF_NODEFUNCT;
 835                 break;
 836         default:
 837                 break;
 838         }
 839
 840         /*
 841          * Entitlements can't be checked at socket creation time except if the
 842          * application requested a feature guarded by a privilege (c.f., socket
 843          * delegation).
 844          * The priv(9) and the Sandboxing APIs are designed with the idea that
 845          * a privilege check should only be triggered by a userland request.
 846          * A privilege check at socket creation time is time consuming and
 847          * could trigger many authorisation error messages from the security
 848          * APIs.
 849          */
 850
 851         *aso = so;
 852
 853         return 0;
 854 }
 855
 856 /*
 857  * Returns:     0                       Success
 858  *              EAFNOSUPPORT
 859  *              EPROTOTYPE
 860  *              EPROTONOSUPPORT
 861  *              ENOBUFS
 862  *      <pru_attach>:ENOBUFS[AF_UNIX]
 863  *      <pru_attach>:ENOBUFS[TCP]
 864  *      <pru_attach>:ENOMEM[TCP]
 865  *      <pru_attach>:???                [other protocol families, IPSEC]
 866  */
 867 int
 868 socreate(int dom, struct socket **aso, int type, int proto)
 869 {
 870         return socreate_internal(dom, aso, type, proto, current_proc(), 0,
 871                    PROC_NULL);
 872 }
 873
 874 int
 875 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 876 {
 877         int error = 0;
 878         struct proc *ep = PROC_NULL;
 879
 880         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 881                 error = ESRCH;
 882                 goto done;
 883         }
 884
 885         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 886
 887         /*
 888          * It might not be wise to hold the proc reference when calling
 889          * socreate_internal since it calls soalloc with M_WAITOK
 890          */
 891 done:
 892         if (ep != PROC_NULL) {
 893                 proc_rele(ep);
 894         }
 895
 896         return error;
 897 }
 898
 899 /*
 900  * Returns:     0                       Success
 901  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 902  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 903  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 904  *      <pru_bind>:EINVAL               Invalid argument
 905  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 906  *      <pru_bind>:EACCES               Permission denied
 907  *      <pru_bind>:EADDRINUSE           Address in use
 908  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 909  *      <pru_bind>:EPERM                Operation not permitted
 910  *      <pru_bind>:???
 911  *      <sf_bind>:???
 912  *
 913  * Notes:       It's not possible to fully enumerate the return codes above,
 914  *              since socket filter authors and protocol family authors may
 915  *              not choose to limit their error returns to those listed, even
 916  *              though this may result in some software operating incorrectly.
 917  *
 918  *              The error codes which are enumerated above are those known to
 919  *              be returned by the tcp_usr_bind function supplied.
 920  */
 921 int
 922 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 923 {
 924         struct proc *p = current_proc();
 925         int error = 0;
 926
 927         if (dolock) {
 928                 socket_lock(so, 1);
 929         }
 930
 931         so_update_last_owner_locked(so, p);
 932         so_update_policy(so);
 933
 934 #if NECP
 935         so_update_necp_policy(so, nam, NULL);
 936 #endif /* NECP */
 937
 938         /*
 939          * If this is a bind request on a socket that has been marked
 940          * as inactive, reject it now before we go any further.
 941          */
 942         if (so->so_flags & SOF_DEFUNCT) {
 943                 error = EINVAL;
 944                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 945                     __func__, proc_pid(p), proc_best_name(p),
 946                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 947                     SOCK_DOM(so), SOCK_TYPE(so), error);
 948                 goto out;
 949         }
 950
 951         /* Socket filter */
 952         error = sflt_bind(so, nam);
 953
 954         if (error == 0) {
 955                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 956         }
 957 out:
 958         if (dolock) {
 959                 socket_unlock(so, 1);
 960         }
 961
 962         if (error == EJUSTRETURN) {
 963                 error = 0;
 964         }
 965
 966         return error;
 967 }
 968
 969 void
 970 sodealloc(struct socket *so)
 971 {
 972         kauth_cred_unref(&so->so_cred);
 973
 974         /* Remove any filters */
 975         sflt_termsock(so);
 976
 977 #if CONTENT_FILTER
 978         cfil_sock_detach(so);
 979 #endif /* CONTENT_FILTER */
 980
 981         /* Delete the state allocated for msg queues on a socket */
 982         if (so->so_flags & SOF_ENABLE_MSGS) {
 983                 FREE(so->so_msg_state, M_TEMP);
 984                 so->so_msg_state = NULL;
 985         }
 986         VERIFY(so->so_msg_state == NULL);
 987
 988         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 989
 990 #if CONFIG_MACF_SOCKET
 991         mac_socket_label_destroy(so);
 992 #endif /* MAC_SOCKET */
 993
 994         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 995                 cached_sock_free(so);
 996         } else {
 997                 FREE_ZONE(so, sizeof(*so), so->so_zone);
 998         }
 999 }
1000
1001 /*
1002  * Returns:     0                       Success
1003  *              EINVAL
1004  *              EOPNOTSUPP
1005  *      <pru_listen>:EINVAL[AF_UNIX]
1006  *      <pru_listen>:EINVAL[TCP]
1007  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
1008  *      <pru_listen>:EINVAL[TCP]        Invalid argument
1009  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
1010  *      <pru_listen>:EACCES[TCP]        Permission denied
1011  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
1012  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
1013  *      <pru_listen>:EPERM[TCP]         Operation not permitted
1014  *      <sf_listen>:???
1015  *
1016  * Notes:       Other <pru_listen> returns depend on the protocol family; all
1017  *              <sf_listen> returns depend on what the filter author causes
1018  *              their filter to return.
1019  */
1020 int
1021 solisten(struct socket *so, int backlog)
1022 {
1023         struct proc *p = current_proc();
1024         int error = 0;
1025
1026         socket_lock(so, 1);
1027
1028         so_update_last_owner_locked(so, p);
1029         so_update_policy(so);
1030
1031 #if NECP
1032         so_update_necp_policy(so, NULL, NULL);
1033 #endif /* NECP */
1034
1035         if (so->so_proto == NULL) {
1036                 error = EINVAL;
1037                 goto out;
1038         }
1039         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1040                 error = EOPNOTSUPP;
1041                 goto out;
1042         }
1043
1044         /*
1045          * If the listen request is made on a socket that is not fully
1046          * disconnected, or on a socket that has been marked as inactive,
1047          * reject the request now.
1048          */
1049         if ((so->so_state &
1050             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1051             (so->so_flags & SOF_DEFUNCT)) {
1052                 error = EINVAL;
1053                 if (so->so_flags & SOF_DEFUNCT) {
1054                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1055                             "(%d)\n", __func__, proc_pid(p),
1056                             proc_best_name(p),
1057                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1058                             SOCK_DOM(so), SOCK_TYPE(so), error);
1059                 }
1060                 goto out;
1061         }
1062
1063         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1064                 error = EPERM;
1065                 goto out;
1066         }
1067
1068         error = sflt_listen(so);
1069         if (error == 0) {
1070                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1071         }
1072
1073         if (error) {
1074                 if (error == EJUSTRETURN) {
1075                         error = 0;
1076                 }
1077                 goto out;
1078         }
1079
1080         if (TAILQ_EMPTY(&so->so_comp)) {
1081                 so->so_options |= SO_ACCEPTCONN;
1082         }
1083         /*
1084          * POSIX: The implementation may have an upper limit on the length of
1085          * the listen queue-either global or per accepting socket. If backlog
1086          * exceeds this limit, the length of the listen queue is set to the
1087          * limit.
1088          *
1089          * If listen() is called with a backlog argument value that is less
1090          * than 0, the function behaves as if it had been called with a backlog
1091          * argument value of 0.
1092          *
1093          * A backlog argument of 0 may allow the socket to accept connections,
1094          * in which case the length of the listen queue may be set to an
1095          * implementation-defined minimum value.
1096          */
1097         if (backlog <= 0 || backlog > somaxconn) {
1098                 backlog = somaxconn;
1099         }
1100
1101         so->so_qlimit = backlog;
1102 out:
1103         socket_unlock(so, 1);
1104         return error;
1105 }
1106
1107 /*
1108  * The "accept list lock" protects the fields related to the listener queues
1109  * because we can unlock a socket to respect the lock ordering between
1110  * the listener socket and its clients sockets. The lock ordering is first to
1111  * acquire the client socket before the listener socket.
1112  *
1113  * The accept list lock serializes access to the following fields:
1114  * - of the listener socket:
1115  *   - so_comp
1116  *   - so_incomp
1117  *   - so_qlen
1118  *   - so_inqlen
1119  * - of client sockets that are in so_comp or so_incomp:
1120  *   - so_head
1121  *   - so_list
1122  *
1123  * As one can see the accept list lock protects the consistent of the
1124  * linkage of the client sockets.
1125  *
1126  * Note that those fields may be read without holding the accept list lock
1127  * for a preflight provided the accept list lock is taken when committing
1128  * to take an action based on the result of the preflight. The preflight
1129  * saves the cost of doing the unlock/lock dance.
1130  */
1131 void
1132 so_acquire_accept_list(struct socket *head, struct socket *so)
1133 {
1134         lck_mtx_t *mutex_held;
1135
1136         if (head->so_proto->pr_getlock == NULL) {
1137                 return;
1138         }
1139         mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1140         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1141
1142         if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1143                 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1144                 return;
1145         }
1146         if (so != NULL) {
1147                 socket_unlock(so, 0);
1148         }
1149         while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1150                 so_accept_list_waits += 1;
1151                 msleep((caddr_t)&head->so_incomp, mutex_held,
1152                     PSOCK | PCATCH, __func__, NULL);
1153         }
1154         head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1155         if (so != NULL) {
1156                 socket_unlock(head, 0);
1157                 socket_lock(so, 0);
1158                 socket_lock(head, 0);
1159         }
1160 }
1161
1162 void
1163 so_release_accept_list(struct socket *head)
1164 {
1165         if (head->so_proto->pr_getlock != NULL) {
1166                 lck_mtx_t *mutex_held;
1167
1168                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1169                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1170
1171                 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1172                 wakeup((caddr_t)&head->so_incomp);
1173         }
1174 }
1175
1176 void
1177 sofreelastref(struct socket *so, int dealloc)
1178 {
1179         struct socket *head = so->so_head;
1180
1181         /* Assume socket is locked */
1182
1183         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1184                 selthreadclear(&so->so_snd.sb_sel);
1185                 selthreadclear(&so->so_rcv.sb_sel);
1186                 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1187                 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1188                 so->so_event = sonullevent;
1189                 return;
1190         }
1191         if (head != NULL) {
1192                 /*
1193                  * Need to lock the listener when the protocol has
1194                  * per socket locks
1195                  */
1196                 if (head->so_proto->pr_getlock != NULL) {
1197                         socket_lock(head, 1);
1198                         so_acquire_accept_list(head, so);
1199                 }
1200                 if (so->so_state & SS_INCOMP) {
1201                         so->so_state &= ~SS_INCOMP;
1202                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1203                         head->so_incqlen--;
1204                         head->so_qlen--;
1205                         so->so_head = NULL;
1206
1207                         if (head->so_proto->pr_getlock != NULL) {
1208                                 so_release_accept_list(head);
1209                                 socket_unlock(head, 1);
1210                         }
1211                 } else if (so->so_state & SS_COMP) {
1212                         if (head->so_proto->pr_getlock != NULL) {
1213                                 so_release_accept_list(head);
1214                                 socket_unlock(head, 1);
1215                         }
1216                         /*
1217                          * We must not decommission a socket that's
1218                          * on the accept(2) queue.  If we do, then
1219                          * accept(2) may hang after select(2) indicated
1220                          * that the listening socket was ready.
1221                          */
1222                         selthreadclear(&so->so_snd.sb_sel);
1223                         selthreadclear(&so->so_rcv.sb_sel);
1224                         so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1225                         so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1226                         so->so_event = sonullevent;
1227                         return;
1228                 } else {
1229                         if (head->so_proto->pr_getlock != NULL) {
1230                                 so_release_accept_list(head);
1231                                 socket_unlock(head, 1);
1232                         }
1233                         printf("sofree: not queued\n");
1234                 }
1235         }
1236         sowflush(so);
1237         sorflush(so);
1238
1239 #if FLOW_DIVERT
1240         if (so->so_flags & SOF_FLOW_DIVERT) {
1241                 flow_divert_detach(so);
1242         }
1243 #endif  /* FLOW_DIVERT */
1244
1245         /* 3932268: disable upcall */
1246         so->so_rcv.sb_flags &= ~SB_UPCALL;
1247         so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1248         so->so_event = sonullevent;
1249
1250         if (dealloc) {
1251                 sodealloc(so);
1252         }
1253 }
1254
1255 void
1256 soclose_wait_locked(struct socket *so)
1257 {
1258         lck_mtx_t *mutex_held;
1259
1260         if (so->so_proto->pr_getlock != NULL) {
1261                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1262         } else {
1263                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1264         }
1265         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1266
1267         /*
1268          * Double check here and return if there's no outstanding upcall;
1269          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1270          */
1271         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1272                 return;
1273         }
1274         so->so_rcv.sb_flags &= ~SB_UPCALL;
1275         so->so_snd.sb_flags &= ~SB_UPCALL;
1276         so->so_flags |= SOF_CLOSEWAIT;
1277
1278         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1279             "soclose_wait_locked", NULL);
1280         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1281         so->so_flags &= ~SOF_CLOSEWAIT;
1282 }
1283
1284 /*
1285  * Close a socket on last file table reference removal.
1286  * Initiate disconnect if connected.
1287  * Free socket when disconnect complete.
1288  */
1289 int
1290 soclose_locked(struct socket *so)
1291 {
1292         int error = 0;
1293         struct timespec ts;
1294
1295         if (so->so_usecount == 0) {
1296                 panic("soclose: so=%p refcount=0\n", so);
1297                 /* NOTREACHED */
1298         }
1299
1300         sflt_notify(so, sock_evt_closing, NULL);
1301
1302         if (so->so_upcallusecount) {
1303                 soclose_wait_locked(so);
1304         }
1305
1306 #if CONTENT_FILTER
1307         /*
1308          * We have to wait until the content filters are done
1309          */
1310         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1311                 cfil_sock_close_wait(so);
1312                 cfil_sock_is_closed(so);
1313                 cfil_sock_detach(so);
1314         }
1315 #endif /* CONTENT_FILTER */
1316
1317         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1318                 soresume(current_proc(), so, 1);
1319                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1320         }
1321
1322         if ((so->so_options & SO_ACCEPTCONN)) {
1323                 struct socket *sp, *sonext;
1324                 int persocklock = 0;
1325                 int incomp_overflow_only;
1326
1327                 /*
1328                  * We do not want new connection to be added
1329                  * to the connection queues
1330                  */
1331                 so->so_options &= ~SO_ACCEPTCONN;
1332
1333                 /*
1334                  * We can drop the lock on the listener once
1335                  * we've acquired the incoming list
1336                  */
1337                 if (so->so_proto->pr_getlock != NULL) {
1338                         persocklock = 1;
1339                         so_acquire_accept_list(so, NULL);
1340                         socket_unlock(so, 0);
1341                 }
1342 again:
1343                 incomp_overflow_only = 1;
1344
1345                 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1346                         /*
1347                          * Radar 5350314
1348                          * skip sockets thrown away by tcpdropdropblreq
1349                          * they will get cleanup by the garbage collection.
1350                          * otherwise, remove the incomp socket from the queue
1351                          * and let soabort trigger the appropriate cleanup.
1352                          */
1353                         if (sp->so_flags & SOF_OVERFLOW) {
1354                                 continue;
1355                         }
1356
1357                         if (persocklock != 0) {
1358                                 socket_lock(sp, 1);
1359                         }
1360
1361                         /*
1362                          * Radar 27945981
1363                          * The extra reference for the list insure the
1364                          * validity of the socket pointer when we perform the
1365                          * unlock of the head above
1366                          */
1367                         if (sp->so_state & SS_INCOMP) {
1368                                 sp->so_state &= ~SS_INCOMP;
1369                                 sp->so_head = NULL;
1370                                 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1371                                 so->so_incqlen--;
1372                                 so->so_qlen--;
1373
1374                                 (void) soabort(sp);
1375                         } else {
1376                                 panic("%s sp %p in so_incomp but !SS_INCOMP",
1377                                     __func__, sp);
1378                         }
1379
1380                         if (persocklock != 0) {
1381                                 socket_unlock(sp, 1);
1382                         }
1383                 }
1384
1385                 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1386                         /* Dequeue from so_comp since sofree() won't do it */
1387                         if (persocklock != 0) {
1388                                 socket_lock(sp, 1);
1389                         }
1390
1391                         if (sp->so_state & SS_COMP) {
1392                                 sp->so_state &= ~SS_COMP;
1393                                 sp->so_head = NULL;
1394                                 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1395                                 so->so_qlen--;
1396
1397                                 (void) soabort(sp);
1398                         } else {
1399                                 panic("%s sp %p in so_comp but !SS_COMP",
1400                                     __func__, sp);
1401                         }
1402
1403                         if (persocklock) {
1404                                 socket_unlock(sp, 1);
1405                         }
1406                 }
1407
1408                 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1409 #if (DEBUG | DEVELOPMENT)
1410                         panic("%s head %p so_comp not empty\n", __func__, so);
1411 #endif /* (DEVELOPMENT || DEBUG) */
1412
1413                         goto again;
1414                 }
1415
1416                 if (!TAILQ_EMPTY(&so->so_comp)) {
1417 #if (DEBUG | DEVELOPMENT)
1418                         panic("%s head %p so_comp not empty\n", __func__, so);
1419 #endif /* (DEVELOPMENT || DEBUG) */
1420
1421                         goto again;
1422                 }
1423
1424                 if (persocklock) {
1425                         socket_lock(so, 0);
1426                         so_release_accept_list(so);
1427                 }
1428         }
1429         if (so->so_pcb == NULL) {
1430                 /* 3915887: mark the socket as ready for dealloc */
1431                 so->so_flags |= SOF_PCBCLEARING;
1432                 goto discard;
1433         }
1434         if (so->so_state & SS_ISCONNECTED) {
1435                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1436                         error = sodisconnectlocked(so);
1437                         if (error) {
1438                                 goto drop;
1439                         }
1440                 }
1441                 if (so->so_options & SO_LINGER) {
1442                         lck_mtx_t *mutex_held;
1443
1444                         if ((so->so_state & SS_ISDISCONNECTING) &&
1445                             (so->so_state & SS_NBIO)) {
1446                                 goto drop;
1447                         }
1448                         if (so->so_proto->pr_getlock != NULL) {
1449                                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1450                         } else {
1451                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1452                         }
1453                         while (so->so_state & SS_ISCONNECTED) {
1454                                 ts.tv_sec = (so->so_linger / 100);
1455                                 ts.tv_nsec = (so->so_linger % 100) *
1456                                     NSEC_PER_USEC * 1000 * 10;
1457                                 error = msleep((caddr_t)&so->so_timeo,
1458                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1459                                 if (error) {
1460                                         /*
1461                                          * It's OK when the time fires,
1462                                          * don't report an error
1463                                          */
1464                                         if (error == EWOULDBLOCK) {
1465                                                 error = 0;
1466                                         }
1467                                         break;
1468                                 }
1469                         }
1470                 }
1471         }
1472 drop:
1473         if (so->so_usecount == 0) {
1474                 panic("soclose: usecount is zero so=%p\n", so);
1475                 /* NOTREACHED */
1476         }
1477         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1478                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1479                 if (error == 0) {
1480                         error = error2;
1481                 }
1482         }
1483         if (so->so_usecount <= 0) {
1484                 panic("soclose: usecount is zero so=%p\n", so);
1485                 /* NOTREACHED */
1486         }
1487 discard:
1488         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1489             (so->so_state & SS_NOFDREF)) {
1490                 panic("soclose: NOFDREF");
1491                 /* NOTREACHED */
1492         }
1493         so->so_state |= SS_NOFDREF;
1494
1495         if ((so->so_flags & SOF_KNOTE) != 0) {
1496                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1497         }
1498
1499         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1500         evsofree(so);
1501
1502         VERIFY(so->so_usecount > 0);
1503         so->so_usecount--;
1504         sofree(so);
1505         return error;
1506 }
1507
1508 int
1509 soclose(struct socket *so)
1510 {
1511         int error = 0;
1512         socket_lock(so, 1);
1513
1514         if (so->so_retaincnt == 0) {
1515                 error = soclose_locked(so);
1516         } else {
1517                 /*
1518                  * if the FD is going away, but socket is
1519                  * retained in kernel remove its reference
1520                  */
1521                 so->so_usecount--;
1522                 if (so->so_usecount < 2) {
1523                         panic("soclose: retaincnt non null and so=%p "
1524                             "usecount=%d\n", so, so->so_usecount);
1525                 }
1526         }
1527         socket_unlock(so, 1);
1528         return error;
1529 }
1530
1531 /*
1532  * Must be called at splnet...
1533  */
1534 /* Should already be locked */
1535 int
1536 soabort(struct socket *so)
1537 {
1538         int error;
1539
1540 #ifdef MORE_LOCKING_DEBUG
1541         lck_mtx_t *mutex_held;
1542
1543         if (so->so_proto->pr_getlock != NULL) {
1544                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1545         } else {
1546                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1547         }
1548         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1549 #endif
1550
1551         if ((so->so_flags & SOF_ABORTED) == 0) {
1552                 so->so_flags |= SOF_ABORTED;
1553                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1554                 if (error) {
1555                         sofree(so);
1556                         return error;
1557                 }
1558         }
1559         return 0;
1560 }
1561
1562 int
1563 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1564 {
1565         int error;
1566
1567         if (dolock) {
1568                 socket_lock(so, 1);
1569         }
1570
1571         so_update_last_owner_locked(so, PROC_NULL);
1572         so_update_policy(so);
1573 #if NECP
1574         so_update_necp_policy(so, NULL, NULL);
1575 #endif /* NECP */
1576
1577         if ((so->so_state & SS_NOFDREF) == 0) {
1578                 panic("soaccept: !NOFDREF");
1579         }
1580         so->so_state &= ~SS_NOFDREF;
1581         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1582
1583         if (dolock) {
1584                 socket_unlock(so, 1);
1585         }
1586         return error;
1587 }
1588
/*
 * soaccept
 *
 * Locking variant of soacceptlock(): same operation, with the socket lock
 * taken and released by the callee.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
        const int take_lock = 1;

        return soacceptlock(so, nam, take_lock);
}
1594
     /*
      * Run the accept filters for `so`, a socket being accepted on `head`.
      *
      * Both addresses of `so` are fetched with sogetaddr_locked() and passed
      * to sflt_accept() together with `head`; they are released with
      * sock_freeaddr() before returning.
      *
      * Returns:     0                       Success, or a filter returned
      *                                      EJUSTRETURN (sosetdefunct called)
      *              ECONNABORTED            An address could not be obtained
      *      <sflt_accept>:???               [anything a filter writer might set]
      *
      * Notes:       On every non-zero return `so` has already been passed to
      *              soclose() by this function.
      */
1595 int
1596 soacceptfilter(struct socket *so, struct socket *head)
1597 {
1598         struct sockaddr *local = NULL, *remote = NULL;
1599         int error = 0;
1600
1601         /*
1602          * Hold the lock even if this socket has not been made visible
1603          * to the filter(s).  For sockets with global locks, this protects
1604          * against the head or peer going away
1605          */
1606         socket_lock(so, 1);
1607         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1608             sogetaddr_locked(so, &local, 0) != 0) {
                     /* Clear SS_NOFDREF and drop the lock before closing. */
1609                 so->so_state &= ~SS_NOFDREF;
1610                 socket_unlock(so, 1);
1611                 soclose(so);
1612                 /* Out of resources; try it again next time */
1613                 error = ECONNABORTED;
1614                 goto done;
1615         }
1616
1617         error = sflt_accept(head, so, local, remote);
1618
1619         /*
1620          * If we get EJUSTRETURN from one of the filters, mark this socket
1621          * as inactive and return it anyway.  This newly accepted socket
1622          * will be disconnected later before we hand it off to the caller.
1623          */
1624         if (error == EJUSTRETURN) {
1625                 error = 0;
1626                 (void) sosetdefunct(current_proc(), so,
1627                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1628         }
1629
1630         if (error != 0) {
1631                 /*
1632                  * This may seem like a duplication to the above error
1633                  * handling part when we return ECONNABORTED, except
1634                  * the following is done while holding the lock since
1635                  * the socket has been exposed to the filter(s) earlier.
1636                  */
1637                 so->so_state &= ~SS_NOFDREF;
1638                 socket_unlock(so, 1);
1639                 soclose(so);
1640                 /* Propagate socket filter's error code to the caller */
1641         } else {
1642                 socket_unlock(so, 1);
1643         }
1644 done:
             /* The socket lock has been released on every path reaching here. */
1645         /* Callee checks for NULL pointer */
1646         sock_freeaddr(remote);
1647         sock_freeaddr(local);
1648         return error;
1649 }
1650
1651 /*
      * Initiate a connection on `so` to the address `nam`.  The socket lock
      * is taken and released here only when `dolock` is non-zero.
      *
      * The connect-out filters run first; the protocol's pru_connect handler
      * is called only when they return 0.  A filter result of EJUSTRETURN is
      * reported to the caller as success without calling pru_connect.
      *
1652  * Returns:     0                       Success
1653  *              EOPNOTSUPP              Operation not supported on socket
1654  *              EISCONN                 Socket is connected
      *              EPERM                   SO_RESTRICT_DENY_OUT set on socket
1655  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1656  *      <pru_connect>:EINVAL            Invalid argument
1657  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1658  *      <pru_connect>:EACCES            Permission denied
1659  *      <pru_connect>:EADDRINUSE        Address in use
1660  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1661  *      <pru_connect>:EPERM             Operation not permitted
1662  *      <sf_connect_out>:???            [anything a filter writer might set]
1663  */
1664 int
1665 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1666 {
1667         int error;
1668         struct proc *p = current_proc();
1669
1670         if (dolock) {
1671                 socket_lock(so, 1);
1672         }
1673
1674         so_update_last_owner_locked(so, p);
1675         so_update_policy(so);
1676
1677 #if NECP
1678         so_update_necp_policy(so, NULL, nam);
1679 #endif /* NECP */
1680
1681         /*
1682          * If this is a listening socket or if this is a previously-accepted
1683          * socket that has been marked as inactive, reject the connect request.
1684          */
1685         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1686                 error = EOPNOTSUPP;
1687                 if (so->so_flags & SOF_DEFUNCT) {
1688                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1689                             "(%d)\n", __func__, proc_pid(p),
1690                             proc_best_name(p),
1691                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1692                             SOCK_DOM(so), SOCK_TYPE(so), error);
1693                 }
1694                 if (dolock) {
1695                         socket_unlock(so, 1);
1696                 }
1697                 return error;
1698         }
1699
             /* Sockets restricted with SO_RESTRICT_DENY_OUT may not connect. */
1700         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1701                 if (dolock) {
1702                         socket_unlock(so, 1);
1703                 }
1704                 return EPERM;
1705         }
1706
1707         /*
1708          * If protocol is connection-based, can only connect once.
1709          * Otherwise, if connected, try to disconnect first.
1710          * This allows user to disconnect by connecting to, e.g.,
1711          * a null address.
              *
              * Note that a failure of sodisconnectlocked() is reported to
              * the caller as EISCONN, not as the disconnect error itself.
1712          */
1713         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1714             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1715             (error = sodisconnectlocked(so)))) {
1716                 error = EISCONN;
1717         } else {
1718                 /*
1719                  * Run connect filter before calling protocol:
1720                  *  - non-blocking connect returns before completion;
1721                  */
1722                 error = sflt_connectout(so, nam);
1723                 if (error != 0) {
                             /*
                              * A filter stopped the connect: EJUSTRETURN is
                              * turned into success, any other value is
                              * returned as the error.  pru_connect is skipped.
                              */
1724                         if (error == EJUSTRETURN) {
1725                                 error = 0;
1726                         }
1727                 } else {
1728                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1729                             (so, nam, p);
                             /* Do not leave SS_ISCONNECTING set after a failure. */
1730                         if (error != 0) {
1731                                 so->so_state &= ~SS_ISCONNECTING;
1732                         }
1733                 }
1734         }
1735         if (dolock) {
1736                 socket_unlock(so, 1);
1737         }
1738         return error;
1739 }
1740
     /*
      * Locking version of soconnectlock(): always passes dolock = 1 and
      * returns its result unchanged.
      */
1741 int
1742 soconnect(struct socket *so, struct sockaddr *nam)
1743 {
1744         return soconnectlock(so, nam, 1);
1745 }
1746
1747 /*
      * Connect two sockets to each other by calling so1's pru_connect2
      * handler with both sockets; its result is returned unchanged.
      *
      * so1 is always locked for the call.  so2 is locked (and later
      * unlocked) only when its protocol provides a pr_lock routine.
      *
1748  * Returns:     0                       Success
1749  *      <pru_connect2>:EINVAL[AF_UNIX]
1750  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1751  *      <pru_connect2>:???              [other protocol families]
1752  *
1753  * Notes:       <pru_connect2> is not supported by [TCP].
1754  */
1755 int
1756 soconnect2(struct socket *so1, struct socket *so2)
1757 {
1758         int error;
1759
1760         socket_lock(so1, 1);
             /*
              * NOTE(review): presumably a protocol without pr_lock shares one
              * lock between so1 and so2, so a second lock is unnecessary --
              * confirm against socket_lock().
              */
1761         if (so2->so_proto->pr_lock) {
1762                 socket_lock(so2, 1);
1763         }
1764
1765         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1766
1767         socket_unlock(so1, 1);
1768         if (so2->so_proto->pr_lock) {
1769                 socket_unlock(so2, 1);
1770         }
1771         return error;
1772 }
1773
     /*
      * Extended connect: validate the socket state, run the connect-out
      * filters on `dst`, and hand all arguments to the protocol's
      * pru_connectx handler.
      *
      * No socket lock is taken or released in this function.
      * NOTE(review): the "locked" suffix suggests the caller must already
      * hold the socket lock -- confirm against callers.
      *
      * Returns:     0                       Success
      *              EOPNOTSUPP              Listening or defunct socket
      *              EPERM                   SO_RESTRICT_DENY_OUT set on socket
      *              EISCONN                 Socket is connected
      *      <pru_connectx>:???              [whatever the protocol returns]
      *      <sf_connect_out>:???            [anything a filter writer might set]
      */
1774 int
1775 soconnectxlocked(struct socket *so, struct sockaddr *src,
1776     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1777     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1778     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1779 {
1780         int error;
1781
1782         so_update_last_owner_locked(so, p);
1783         so_update_policy(so);
1784
1785         /*
1786          * If this is a listening socket or if this is a previously-accepted
1787          * socket that has been marked as inactive, reject the connect request.
1788          */
1789         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1790                 error = EOPNOTSUPP;
1791                 if (so->so_flags & SOF_DEFUNCT) {
1792                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1793                             "(%d)\n", __func__, proc_pid(p),
1794                             proc_best_name(p),
1795                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1796                             SOCK_DOM(so), SOCK_TYPE(so), error);
1797                 }
1798                 return error;
1799         }
1800
             /* Sockets restricted with SO_RESTRICT_DENY_OUT may not connect. */
1801         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1802                 return EPERM;
1803         }
1804
1805         /*
1806          * If protocol is connection-based, can only connect once
1807          * unless PR_MULTICONN is set.  Otherwise, if connected,
1808          * try to disconnect first.  This allows user to disconnect
1809          * by connecting to, e.g., a null address.
              *
              * Note that a failure of sodisconnectlocked() is reported to
              * the caller as EISCONN, not as the disconnect error itself.
1810          */
1811         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1812             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1813             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1814             (error = sodisconnectlocked(so)) != 0)) {
1815                 error = EISCONN;
1816         } else {
1817                 /*
1818                  * Run connect filter before calling protocol:
1819                  *  - non-blocking connect returns before completion;
1820                  */
1821                 error = sflt_connectout(so, dst);
1822                 if (error != 0) {
1823                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1824                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
                             /* EJUSTRETURN from a filter is reported as success. */
1825                         if (error == EJUSTRETURN) {
1826                                 error = 0;
1827                         }
1828                 } else {
1829                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1830                             (so, src, dst, p, ifscope, aid, pcid,
1831                             flags, arg, arglen, auio, bytes_written);
                             /* Do not leave SS_ISCONNECTING set after a failure. */
1832                         if (error != 0) {
1833                                 so->so_state &= ~SS_ISCONNECTING;
1834                         }
1835                 }
1836         }
1837
1838         return error;
1839 }
1840
     /*
      * Disconnect `so` through the protocol's pru_disconnect handler and, on
      * success, notify the socket filters with sock_evt_disconnected.
      *
      * No socket lock is taken or released in this function.
      *
      * Returns:     0                       Success
      *              ENOTCONN                SS_ISCONNECTED is not set
      *              EALREADY                SS_ISDISCONNECTING is already set
      *      <pru_disconnect>:???            [whatever the protocol returns]
      */
1841 int
1842 sodisconnectlocked(struct socket *so)
1843 {
1844         int error;
1845
1846         if ((so->so_state & SS_ISCONNECTED) == 0) {
1847                 error = ENOTCONN;
1848                 goto bad;
1849         }
1850         if (so->so_state & SS_ISDISCONNECTING) {
1851                 error = EALREADY;
1852                 goto bad;
1853         }
1854
1855         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
             /* Filters are told about the disconnect only when it succeeded. */
1856         if (error == 0) {
1857                 sflt_notify(so, sock_evt_disconnected, NULL);
1858         }
1859
1860 bad:
1861         return error;
1862 }
1863
1864 /*
      * Locking version of sodisconnectlocked(): holds the socket lock
      * around the call and returns its result unchanged.
      */
1865 int
1866 sodisconnect(struct socket *so)
1867 {
1868         int error;
1869
1870         socket_lock(so, 1);
1871         error = sodisconnectlocked(so);
1872         socket_unlock(so, 1);
1873         return error;
1874 }
1875
     /*
      * Extended disconnect: pass `aid` and `cid` to the protocol's
      * pru_disconnectx handler and return its result.  On success the
      * socket filters receive sock_evt_disconnected, but only if the socket
      * is now in the SS_ISDISCONNECTING or SS_ISDISCONNECTED state.
      *
      * No socket lock is taken or released in this function.
      */
1876 int
1877 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1878 {
1879         int error;
1880
1881         /*
1882          * Call the protocol disconnectx handler; let it handle all
1883          * matters related to the connection state of this session.
1884          */
1885         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1886         if (error == 0) {
1887                 /*
1888                  * The event applies only for the session, not for
1889                  * the disconnection of individual subflows.
1890                  */
1891                 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1892                         sflt_notify(so, sock_evt_disconnected, NULL);
1893                 }
1894         }
1895         return error;
1896 }
1897
     /*
      * Locking version of sodisconnectxlocked(): holds the socket lock
      * around the call and returns its result unchanged.
      */
1898 int
1899 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1900 {
1901         int error;
1902
1903         socket_lock(so, 1);
1904         error = sodisconnectxlocked(so, aid, cid);
1905         socket_unlock(so, 1);
1906         return error;
1907 }
1908
     /*
      * Wait argument for sblock() derived from the message flags `f`:
      * 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
      */
1909 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1910
1911 /*
1912  * sosendcheck will lock the socket buffer if it isn't locked and
1913  * verify that there is space for the data being inserted.
      *
      * `*sblocked` says whether the send buffer lock is already held on
      * entry; it is set to 1 when the lock is taken here and reset to 0
      * when the lock is dropped in order to wait for space.  After a
      * successful wait every check is run again from the top.  The lock is
      * not released here on the error returns that follow a successful
      * sblock().
1914  *
1915  * Returns:     0                       Success
1916  *              EPIPE
      *              ENOTCONN
      *              EDESTADDRREQ
      *              EMSGSIZE
      *              EWOULDBLOCK
1917  *      sblock:EWOULDBLOCK
1918  *      sblock:EINTR
1919  *      sbwait:EBADF
1920  *      sbwait:EINTR
1921  *      [so_error]:???
1922  */
1923 int
1924 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1925     int32_t clen, int32_t atomic, int flags, int *sblocked,
1926     struct mbuf *control)
1927 {
1928         int     error = 0;
1929         int32_t space;
1930         int     assumelock = 0;
1931
1932 restart:
1933         if (*sblocked == 0) {
1934                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1935                     so->so_send_filt_thread != 0 &&
1936                     so->so_send_filt_thread == current_thread()) {
1937                         /*
1938                          * We're being called recursively from a filter,
1939                          * allow this to continue. Radar 4150520.
1940                          * Don't set sblocked because we don't want
1941                          * to perform an unlock later.
1942                          */
1943                         assumelock = 1;
1944                 } else {
1945                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1946                         if (error) {
                                     /* A defunct socket reports EPIPE instead. */
1947                                 if (so->so_flags & SOF_DEFUNCT) {
1948                                         goto defunct;
1949                                 }
1950                                 return error;
1951                         }
1952                         *sblocked = 1;
1953                 }
1954         }
1955
1956         /*
1957          * If a send attempt is made on a socket that has been marked
1958          * as inactive (disconnected), reject the request.
1959          */
1960         if (so->so_flags & SOF_DEFUNCT) {
1961 defunct:
1962                 error = EPIPE;
1963                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1964                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1965                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1966                     SOCK_DOM(so), SOCK_TYPE(so), error);
1967                 return error;
1968         }
1969
             /*
              * With CONTENT_FILTER compiled in, the "return EPIPE" below is the
              * else-branch of the re-injection check; without it, the return
              * is unconditional once SS_CANTSENDMORE is set.
              */
1970         if (so->so_state & SS_CANTSENDMORE) {
1971 #if CONTENT_FILTER
1972                 /*
1973                  * Can re-inject data of half closed connections
1974                  */
1975                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1976                     so->so_snd.sb_cfil_thread == current_thread() &&
1977                     cfil_sock_data_pending(&so->so_snd) != 0) {
1978                         CFIL_LOG(LOG_INFO,
1979                             "so %llx ignore SS_CANTSENDMORE",
1980                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1981                 } else
1982 #endif /* CONTENT_FILTER */
1983                 return EPIPE;
1984         }
             /* Report a pending socket error once and clear it. */
1985         if (so->so_error) {
1986                 error = so->so_error;
1987                 so->so_error = 0;
1988                 return error;
1989         }
1990
1991         if ((so->so_state & SS_ISCONNECTED) == 0) {
1992                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1993                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1994                             (resid != 0 || clen == 0) &&
1995                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1996                                 return ENOTCONN;
1997                         }
1998                 } else if (addr == 0) {
                             /*
                              * NOTE(review): PR_CONNREQUIRED is known to be
                              * clear on this branch, so this expression
                              * always yields EDESTADDRREQ.
                              */
1999                         return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2000                                ENOTCONN : EDESTADDRREQ;
2001                 }
2002         }
2003
2004         if (so->so_flags & SOF_ENABLE_MSGS) {
2005                 space = msgq_sbspace(so, control);
2006         } else {
2007                 space = sbspace(&so->so_snd);
2008         }
2009
             /* MSG_OOB sends are given 1024 bytes of extra space. */
2010         if (flags & MSG_OOB) {
2011                 space += 1024;
2012         }
             /* The request can never fit, however long we wait. */
2013         if ((atomic && resid > so->so_snd.sb_hiwat) ||
2014             clen > so->so_snd.sb_hiwat) {
2015                 return EMSGSIZE;
2016         }
2017
2018         if ((space < resid + clen &&
2019             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2020             space < clen)) ||
2021             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2022                 /*
2023                  * don't block the connectx call when there's more data
2024                  * than can be copied.
2025                  */
2026                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2027                         if (space == 0) {
2028                                 return EWOULDBLOCK;
2029                         }
2030                         if (space < (int32_t)so->so_snd.sb_lowat) {
2031                                 return 0;
2032                         }
2033                 }
                     /*
                      * Never sleep for a non-blocking request or when
                      * re-entered from a filter thread (assumelock).
                      */
2034                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2035                     assumelock) {
2036                         return EWOULDBLOCK;
2037                 }
                     /* Drop the buffer lock, wait, then redo all checks. */
2038                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2039                 *sblocked = 0;
2040                 error = sbwait(&so->so_snd);
2041                 if (error) {
2042                         if (so->so_flags & SOF_DEFUNCT) {
2043                                 goto defunct;
2044                         }
2045                         return error;
2046                 }
2047                 goto restart;
2048         }
2049         return 0;
2050 }
2051
2052 /*
2053  * Send on a socket.
2054  * If send must go all at once and message is larger than
2055  * send buffering, then hard error.
2056  * Lock against other senders.
2057  * If must go all at once and not enough room now, then
2058  * inform user that this would block and do nothing.
2059  * Otherwise, if nonblocking, send as much as possible.
2060  * The data to be sent is described by "uio" if nonzero,
2061  * otherwise by the mbuf chain "top" (which must be null
2062  * if uio is not).  Data provided in mbuf chain must be small
2063  * enough to send all at once.
2064  *
2065  * Returns nonzero on error, timeout or signal; callers
2066  * must check for short counts if EINTR/ERESTART are returned.
2067  * Data and control buffers are freed on return.
2068  *
2069  * Returns:     0                       Success
2070  *              EOPNOTSUPP
2071  *              EINVAL
2072  *              ENOBUFS
2073  *      uiomove:EFAULT
2074  *      sosendcheck:EPIPE
2075  *      sosendcheck:EWOULDBLOCK
2076  *      sosendcheck:EINTR
2077  *      sosendcheck:EBADF
2078  *      sosendcheck:EINTR
2079  *      sosendcheck:???                 [value from so_error]
2080  *      <pru_send>:ECONNRESET[TCP]
2081  *      <pru_send>:EINVAL[TCP]
2082  *      <pru_send>:ENOBUFS[TCP]
2083  *      <pru_send>:EADDRINUSE[TCP]
2084  *      <pru_send>:EADDRNOTAVAIL[TCP]
2085  *      <pru_send>:EAFNOSUPPORT[TCP]
2086  *      <pru_send>:EACCES[TCP]
2087  *      <pru_send>:EAGAIN[TCP]
2088  *      <pru_send>:EPERM[TCP]
2089  *      <pru_send>:EMSGSIZE[TCP]
2090  *      <pru_send>:EHOSTUNREACH[TCP]
2091  *      <pru_send>:ENETUNREACH[TCP]
2092  *      <pru_send>:ENETDOWN[TCP]
2093  *      <pru_send>:ENOMEM[TCP]
2094  *      <pru_send>:ENOBUFS[TCP]
2095  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
2096  *      <pru_send>:EINVAL[AF_UNIX]
2097  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
2098  *      <pru_send>:EPIPE[AF_UNIX]
2099  *      <pru_send>:ENOTCONN[AF_UNIX]
2100  *      <pru_send>:EISCONN[AF_UNIX]
2101  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
2102  *      <sf_data_out>:???               [whatever a filter author chooses]
2103  *
2104  * Notes:       Other <pru_send> returns depend on the protocol family; all
2105  *              <sf_data_out> returns depend on what the filter author causes
2106  *              their filter to return.
2107  */
2108 int
2109 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2110     struct mbuf *top, struct mbuf *control, int flags)
2111 {
2112         struct mbuf **mp;
2113         struct mbuf *m, *freelist = NULL;
2114         user_ssize_t space, len, resid, orig_resid;
2115         int clen = 0, error, dontroute, mlen, sendflags;
2116         int atomic = sosendallatonce(so) || top;
2117         int sblocked = 0;
2118         struct proc *p = current_proc();
2119         struct mbuf *control_copy = NULL;
2120         uint16_t headroom = 0;
2121         boolean_t en_tracing = FALSE;
2122
2123         if (uio != NULL) {
2124                 resid = uio_resid(uio);
2125         } else {
2126                 resid = top->m_pkthdr.len;
2127         }
2128
2129         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2130             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2131
2132         socket_lock(so, 1);
2133
2134         /*
2135          * Trace only if tracing is enabled, the socket is a network
2136          * (vs. unix) socket, and the last output interface is non-loopback
2137          */
2138         if (ENTR_SHOULDTRACE &&
2139             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2140                 struct inpcb *inp = sotoinpcb(so);
2141                 if (inp->inp_last_outifp != NULL &&
2142                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2143                         en_tracing = TRUE;
2144                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2145                             VM_KERNEL_ADDRPERM(so),
2146                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2147                             (int64_t)resid);
2148                         orig_resid = resid;
2149                 }
2150         }
2151
2152         /*
2153          * Re-injection should not affect process accounting
2154          */
2155         if ((flags & MSG_SKIPCFIL) == 0) {
2156                 so_update_last_owner_locked(so, p);
2157                 so_update_policy(so);
2158
2159 #if NECP
2160                 so_update_necp_policy(so, NULL, addr);
2161 #endif /* NECP */
2162         }
2163
2164         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2165                 error = EOPNOTSUPP;
2166                 goto out_locked;
2167         }
2168
2169         /*
2170          * In theory resid should be unsigned.
2171          * However, space must be signed, as it might be less than 0
2172          * if we over-committed, and we must use a signed comparison
2173          * of space and resid.  On the other hand, a negative resid
2174          * causes us to loop sending 0-length segments to the protocol.
2175          *
2176          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2177          * But it will be used by sockets doing message delivery.
2178          *
2179          * Note: We limit resid to be a positive int value as we use
2180          * imin() to set bytes_to_copy -- radr://14558484
2181          */
2182         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2183             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2184                 error = EINVAL;
2185                 goto out_locked;
2186         }
2187
2188         dontroute = (flags & MSG_DONTROUTE) &&
2189             (so->so_options & SO_DONTROUTE) == 0 &&
2190             (so->so_proto->pr_flags & PR_ATOMIC);
2191         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2192
2193         if (control != NULL) {
2194                 clen = control->m_len;
2195         }
2196
2197         if (soreserveheadroom != 0) {
2198                 headroom = so->so_pktheadroom;
2199         }
2200
2201         do {
2202                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2203                     &sblocked, control);
2204                 if (error) {
2205                         goto out_locked;
2206                 }
2207
2208                 mp = &top;
2209                 if (so->so_flags & SOF_ENABLE_MSGS) {
2210                         space = msgq_sbspace(so, control);
2211                 } else {
2212                         space = sbspace(&so->so_snd) - clen;
2213                 }
2214                 space += ((flags & MSG_OOB) ? 1024 : 0);
2215
2216                 do {
2217                         if (uio == NULL) {
2218                                 /*
2219                                  * Data is prepackaged in "top".
2220                                  */
2221                                 resid = 0;
2222                                 if (flags & MSG_EOR) {
2223                                         top->m_flags |= M_EOR;
2224                                 }
2225                         } else {
2226                                 int chainlength;
2227                                 int bytes_to_copy;
2228                                 boolean_t jumbocl;
2229                                 boolean_t bigcl;
2230                                 int bytes_to_alloc;
2231
2232                                 bytes_to_copy = imin(resid, space);
2233
2234                                 bytes_to_alloc = bytes_to_copy;
2235                                 if (top == NULL) {
2236                                         bytes_to_alloc += headroom;
2237                                 }
2238
2239                                 if (sosendminchain > 0) {
2240                                         chainlength = 0;
2241                                 } else {
2242                                         chainlength = sosendmaxchain;
2243                                 }
2244
2245                                 /*
2246                                  * Use big 4 KB cluster when the outgoing interface
2247                                  * does not prefer 2 KB clusters
2248                                  */
2249                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2250                                     sosendbigcl_ignore_capab;
2251
2252                                 /*
2253                                  * Attempt to use larger than system page-size
2254                                  * clusters for large writes only if there is
2255                                  * a jumbo cluster pool and if the socket is
2256                                  * marked accordingly.
2257                                  */
2258                                 jumbocl = sosendjcl && njcl > 0 &&
2259                                     ((so->so_flags & SOF_MULTIPAGES) ||
2260                                     sosendjcl_ignore_capab) &&
2261                                     bigcl;
2262
2263                                 socket_unlock(so, 0);
2264
2265                                 do {
2266                                         int num_needed;
2267                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2268
2269                                         /*
2270                                          * Try to maintain a local cache of mbuf
2271                                          * clusters needed to complete this
2272                                          * write.  The list is further limited to
2273                                          * the number that are currently needed
2274                                          * to fill the socket.  This mechanism
2275                                          * allows a large number of mbufs/
2276                                          * clusters to be grabbed under a single
2277                                          * mbuf lock.  If we can't get any
2278                                          * clusters, then fall back to trying
2279                                          * for mbufs.  If we fail early (or
2280                                          * miscalculate the number needed), make
2281                                          * sure to release any clusters we
2282                                          * haven't yet consumed.
2283                                          */
2284                                         if (freelist == NULL &&
2285                                             bytes_to_alloc > MBIGCLBYTES &&
2286                                             jumbocl) {
2287                                                 num_needed =
2288                                                     bytes_to_alloc / M16KCLBYTES;
2289
2290                                                 if ((bytes_to_alloc -
2291                                                     (num_needed * M16KCLBYTES))
2292                                                     >= MINCLSIZE) {
2293                                                         num_needed++;
2294                                                 }
2295
2296                                                 freelist =
2297                                                     m_getpackets_internal(
2298                                                         (unsigned int *)&num_needed,
2299                                                         hdrs_needed, M_WAIT, 0,
2300                                                         M16KCLBYTES);
2301                                                 /*
2302                                                  * Fall back to 4K cluster size
2303                                                  * if allocation failed
2304                                                  */
2305                                         }
2306
2307                                         if (freelist == NULL &&
2308                                             bytes_to_alloc > MCLBYTES &&
2309                                             bigcl) {
2310                                                 num_needed =
2311                                                     bytes_to_alloc / MBIGCLBYTES;
2312
2313                                                 if ((bytes_to_alloc -
2314                                                     (num_needed * MBIGCLBYTES)) >=
2315                                                     MINCLSIZE) {
2316                                                         num_needed++;
2317                                                 }
2318
2319                                                 freelist =
2320                                                     m_getpackets_internal(
2321                                                         (unsigned int *)&num_needed,
2322                                                         hdrs_needed, M_WAIT, 0,
2323                                                         MBIGCLBYTES);
2324                                                 /*
2325                                                  * Fall back to cluster size
2326                                                  * if allocation failed
2327                                                  */
2328                                         }
2329
2330                                         /*
2331                                          * Allocate a cluster, as we want to
2332                                          * avoid splitting the data into more
2333                                          * than one segment; using MINCLSIZE
2334                                          * would lead us to allocate two mbufs
2335                                          */
2336                                         if (soreserveheadroom != 0 &&
2337                                             freelist == NULL &&
2338                                             ((top == NULL &&
2339                                             bytes_to_alloc > _MHLEN) ||
2340                                             bytes_to_alloc > _MLEN)) {
2341                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2342                                                     MCLBYTES;
2343                                                 freelist =
2344                                                     m_getpackets_internal(
2345                                                         (unsigned int *)&num_needed,
2346                                                         hdrs_needed, M_WAIT, 0,
2347                                                         MCLBYTES);
2348                                                 /*
2349                                                  * Fall back to a single mbuf
2350                                                  * if allocation failed
2351                                                  */
2352                                         } else if (freelist == NULL &&
2353                                             bytes_to_alloc > MINCLSIZE) {
2354                                                 num_needed =
2355                                                     bytes_to_alloc / MCLBYTES;
2356
2357                                                 if ((bytes_to_alloc -
2358                                                     (num_needed * MCLBYTES)) >=
2359                                                     MINCLSIZE) {
2360                                                         num_needed++;
2361                                                 }
2362
2363                                                 freelist =
2364                                                     m_getpackets_internal(
2365                                                         (unsigned int *)&num_needed,
2366                                                         hdrs_needed, M_WAIT, 0,
2367                                                         MCLBYTES);
2368                                                 /*
2369                                                  * Fall back to a single mbuf
2370                                                  * if allocation failed
2371                                                  */
2372                                         }
2373                                         /*
2374                                          * For datagram protocols, leave
2375                                          * headroom for protocol headers
2376                                          * in the first cluster of the chain
2377                                          */
2378                                         if (freelist != NULL && atomic &&
2379                                             top == NULL && headroom > 0) {
2380                                                 freelist->m_data += headroom;
2381                                         }
2382
2383                                         /*
2384                                          * Fall back to regular mbufs without
2385                                          * reserving the socket headroom
2386                                          */
2387                                         if (freelist == NULL) {
2388                                                 if (top == NULL) {
2389                                                         MGETHDR(freelist,
2390                                                             M_WAIT, MT_DATA);
2391                                                 } else {
2392                                                         MGET(freelist,
2393                                                             M_WAIT, MT_DATA);
2394                                                 }
2395
2396                                                 if (freelist == NULL) {
2397                                                         error = ENOBUFS;
2398                                                         socket_lock(so, 0);
2399                                                         goto out_locked;
2400                                                 }
2401                                                 /*
2402                                                  * For datagram protocols,
2403                                                  * leave room for protocol
2404                                                  * headers in first mbuf.
2405                                                  */
2406                                                 if (atomic && top == NULL &&
2407                                                     bytes_to_copy < MHLEN) {
2408                                                         MH_ALIGN(freelist,
2409                                                             bytes_to_copy);
2410                                                 }
2411                                         }
2412                                         m = freelist;
2413                                         freelist = m->m_next;
2414                                         m->m_next = NULL;
2415
2416                                         if ((m->m_flags & M_EXT)) {
2417                                                 mlen = m->m_ext.ext_size -
2418                                                     M_LEADINGSPACE(m);
2419                                         } else if ((m->m_flags & M_PKTHDR)) {
2420                                                 mlen =
2421                                                     MHLEN - M_LEADINGSPACE(m);
2422                                         } else {
2423                                                 mlen = MLEN - M_LEADINGSPACE(m);
2424                                         }
2425                                         len = imin(mlen, bytes_to_copy);
2426
2427                                         chainlength += len;
2428
2429                                         space -= len;
2430
2431                                         error = uiomove(mtod(m, caddr_t),
2432                                             len, uio);
2433
2434                                         resid = uio_resid(uio);
2435
2436                                         m->m_len = len;
2437                                         *mp = m;
2438                                         top->m_pkthdr.len += len;
2439                                         if (error) {
2440                                                 break;
2441                                         }
2442                                         mp = &m->m_next;
2443                                         if (resid <= 0) {
2444                                                 if (flags & MSG_EOR) {
2445                                                         top->m_flags |= M_EOR;
2446                                                 }
2447                                                 break;
2448                                         }
2449                                         bytes_to_copy = min(resid, space);
2450                                 } while (space > 0 &&
2451                                     (chainlength < sosendmaxchain || atomic ||
2452                                     resid < MINCLSIZE));
2453
2454                                 socket_lock(so, 0);
2455
2456                                 if (error) {
2457                                         goto out_locked;
2458                                 }
2459                         }
2460
2461                         if (dontroute) {
2462                                 so->so_options |= SO_DONTROUTE;
2463                         }
2464
2465                         /*
2466                          * Compute flags here, for pru_send and NKEs
2467                          *
2468                          * If the user set MSG_EOF, the protocol
2469                          * understands this flag and nothing left to
2470                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2471                          */
2472                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2473                             ((flags & MSG_EOF) &&
2474                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2475                             (resid <= 0)) ? PRUS_EOF :
2476                             /* If there is more to send set PRUS_MORETOCOME */
2477                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2478
2479                         if ((flags & MSG_SKIPCFIL) == 0) {
2480                                 /*
2481                                  * Socket filter processing
2482                                  */
2483                                 error = sflt_data_out(so, addr, &top,
2484                                     &control, (sendflags & MSG_OOB) ?
2485                                     sock_data_filt_flag_oob : 0);
2486                                 if (error) {
2487                                         if (error == EJUSTRETURN) {
2488                                                 error = 0;
2489                                                 clen = 0;
2490                                                 control = NULL;
2491                                                 top = NULL;
2492                                         }
2493                                         goto out_locked;
2494                                 }
2495 #if CONTENT_FILTER
2496                                 /*
2497                                  * Content filter processing
2498                                  */
2499                                 error = cfil_sock_data_out(so, addr, top,
2500                                     control, sendflags);
2501                                 if (error) {
2502                                         if (error == EJUSTRETURN) {
2503                                                 error = 0;
2504                                                 clen = 0;
2505                                                 control = NULL;
2506                                                 top = NULL;
2507                                         }
2508                                         goto out_locked;
2509                                 }
2510 #endif /* CONTENT_FILTER */
2511                         }
2512                         if (so->so_flags & SOF_ENABLE_MSGS) {
2513                                 /*
2514                                  * Make a copy of control mbuf,
2515                                  * so that msg priority can be
2516                                  * passed to subsequent mbufs.
2517                                  */
2518                                 control_copy = m_dup(control, M_NOWAIT);
2519                         }
2520                         error = (*so->so_proto->pr_usrreqs->pru_send)
2521                             (so, sendflags, top, addr, control, p);
2522
2523                         if (dontroute) {
2524                                 so->so_options &= ~SO_DONTROUTE;
2525                         }
2526
2527                         clen = 0;
2528                         control = control_copy;
2529                         control_copy = NULL;
2530                         top = NULL;
2531                         mp = &top;
2532                         if (error) {
2533                                 goto out_locked;
2534                         }
2535                 } while (resid && space > 0);
2536         } while (resid);
2537
2538 out_locked:
2539         if (sblocked) {
2540                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2541         } else {
2542                 socket_unlock(so, 1);
2543         }
2544         if (top != NULL) {
2545                 m_freem(top);
2546         }
2547         if (control != NULL) {
2548                 m_freem(control);
2549         }
2550         if (freelist != NULL) {
2551                 m_freem_list(freelist);
2552         }
2553         if (control_copy != NULL) {
2554                 m_freem(control_copy);
2555         }
2556
2557         soclearfastopen(so);
2558
2559         if (en_tracing) {
2560                 /* resid passed here is the bytes left in uio */
2561                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2562                     VM_KERNEL_ADDRPERM(so),
2563                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2564                     (int64_t)(orig_resid - resid));
2565         }
2566         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2567             so->so_snd.sb_cc, space, error);
2568
2569         return error;
2570 }
2571
2572 int
2573 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2574 {
2575         struct mbuf *cur = NULL, *ctl_tail = NULL;
2576
2577         socket_lock_assert_owned(so);
2578
2579         /*
2580          * A chain to send is mandatory; when the caller supplies a separate
2581          * control chain, top has to begin with a packet header mbuf.
2582          */
2583         VERIFY(top != NULL &&
2584             (control == NULL || top->m_flags & M_PKTHDR));
2585
2586         /*
2587          * Without a caller-supplied control chain, and with top not starting
2588          * at a packet header, look for control mbufs at the front of top.
2589          */
2590         if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2591                 for (cur = top; cur != NULL; cur = cur->m_next) {
2592                         if (cur->m_flags & M_PKTHDR) {
2593                                 top = cur;
2594                                 break;
2595                         }
2596                         if (cur->m_type != MT_CONTROL) {
2597                                 continue;
2598                         }
2599                         /* The first MT_CONTROL mbuf heads the control chain */
2600                         if (control == NULL) {
2601                                 control = cur;
2602                         }
2603                         /* A control mbuf followed by a non-control one ends it */
2604                         if (cur->m_next != NULL && cur->m_next->m_type != MT_CONTROL) {
2605                                 ctl_tail = cur;
2606                         }
2607                 }
2608                 /* Detach the control chain from whatever follows it */
2609                 if (ctl_tail != NULL) {
2610                         ctl_tail->m_next = NULL;
2611                 }
2612         }
2613
2614         return (*so->so_proto->pr_usrreqs->pru_send)
2615                (so, sendflags, top, addr, control, current_proc());
2616 }
2617
2618 /*
2619  * Send a batch of datagrams, one packet per non-empty uio in uioarray.
      *
      * Supported only connected sockets (no address) without ancillary data
2620  * (control mbuf) for atomic protocols
      *
      * Parameters:
      *	so		socket to send on; must be SOCK_DGRAM, atomic
      *			(sosendallatonce) and its protocol must provide
      *			pru_send_list
      *	uioarray	array of uiocnt uio pointers, one message each;
      *			empty messages are skipped
      *	uiocnt		number of entries in uioarray
      *	flags		only MSG_DONTWAIT and MSG_NBIO are accepted
      *
      * Returns 0 on success, otherwise:
      *	EINVAL		wrong socket type, non-atomic protocol, unsupported
      *			flag, or aggregate length negative or above INT_MAX
      *	EPROTONOSUPPORT	protocol has no pru_send_list
      *	ENOMEM		packet allocation failed
      *	or the error reported by sosendcheck(), uiomove(), a socket or
      *	content filter, or pru_send_list.
      *
      * Locking: the socket lock is taken here, dropped while the batch is
      * sized, allocated and copied from user space, and released before
      * returning.
2621  */
2622 int
2623 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2624 {
2625         struct mbuf *m, *freelist = NULL;
             /*
              * resid starts at 0 because the early "goto out" paths reach the
              * exit trace before the aggregate length has been computed
              */
2626         user_ssize_t len, resid = 0;
2627         int error, dontroute, mlen;
2628         int atomic = sosendallatonce(so);
2629         int sblocked = 0;
2630         struct proc *p = current_proc();
             /* [uiofirst, uiolast) is the window of uioarray handled per pass */
2631         u_int uiofirst = 0;
2632         u_int uiolast = 0;
2633         struct mbuf *top = NULL;
2634         uint16_t headroom = 0;
2635         boolean_t bigcl;
2636
2637         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2638             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2639
2640         if (so->so_type != SOCK_DGRAM) {
2641                 error = EINVAL;
2642                 goto out;
2643         }
2644         if (atomic == 0) {
2645                 error = EINVAL;
2646                 goto out;
2647         }
2648         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2649                 error = EPROTONOSUPPORT;
2650                 goto out;
2651         }
2652         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2653                 error = EINVAL;
2654                 goto out;
2655         }
2656         resid = uio_array_resid(uioarray, uiocnt);
2657
2658         /*
2659          * In theory resid should be unsigned.
2660          * However, space must be signed, as it might be less than 0
2661          * if we over-committed, and we must use a signed comparison
2662          * of space and resid.  On the other hand, a negative resid
2663          * causes us to loop sending 0-length segments to the protocol.
2664          *
2665          * Note: We limit resid to be a positive int value as we use
2666          * imin() to set bytes_to_copy -- radr://14558484
2667          */
2668         if (resid < 0 || resid > INT_MAX) {
2669                 error = EINVAL;
2670                 goto out;
2671         }
2672
2673         socket_lock(so, 1);
2674         so_update_last_owner_locked(so, p);
2675         so_update_policy(so);
2676
2677 #if NECP
2678         so_update_necp_policy(so, NULL, NULL);
2679 #endif /* NECP */
2680
2681         dontroute = (flags & MSG_DONTROUTE) &&
2682             (so->so_options & SO_DONTROUTE) == 0 &&
2683             (so->so_proto->pr_flags & PR_ATOMIC);
2684         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2685
2686         error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2687             &sblocked, NULL);
2688         if (error) {
2689                 goto release;
2690         }
2691
2692         /*
2693          * Use big 4 KB clusters when the outgoing interface does not prefer
2694          * 2 KB clusters
2695          */
2696         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2697
2698         if (soreserveheadroom != 0) {
2699                 headroom = so->so_pktheadroom;
2700         }
2701
             /*
              * Each pass sizes a window of messages, allocates one packet per
              * non-empty message, copies the user data in with the socket
              * unlocked, then relocks, runs the filters and hands the packet
              * list to the protocol
              */
2702         do {
2703                 int i;
2704                 int num_needed = 0;
2705                 int chainlength;
2706                 size_t maxpktlen = 0;
2707                 int bytes_to_alloc;
2708
2709                 if (sosendminchain > 0) {
2710                         chainlength = 0;
2711                 } else {
2712                         chainlength = sosendmaxchain;
2713                 }
2714
2715                 socket_unlock(so, 0);
2716
2717                 /*
2718                  * Find a set of uio that fit in a reasonable number
2719                  * of mbuf packets
2720                  */
2721                 for (i = uiofirst; i < uiocnt; i++) {
2722                         struct uio *auio = uioarray[i];
2723
2724                         len = uio_resid(auio);
2725
2726                         /* Do nothing for empty messages */
2727                         if (len == 0) {
2728                                 continue;
2729                         }
2730
2731                         num_needed += 1;
                             /*
                              * Track the array position rather than the number
                              * of packets, so that skipped empty messages stay
                              * inside the window walked by the copy loop below
                              */
2732                         uiolast = (u_int)i + 1;
2733
2734                         if (len > maxpktlen) {
2735                                 maxpktlen = len;
2736                         }
2737
2738                         chainlength += len;
2739                         if (chainlength > sosendmaxchain) {
2740                                 break;
2741                         }
2742                 }
2743                 /*
2744                  * Nothing left to send
2745                  */
2746                 if (num_needed == 0) {
2747                         socket_lock(so, 0);
2748                         break;
2749                 }
2750                 /*
2751                  * Allocate buffer large enough to include headroom space for
2752                  * network and link header
2753                  *
2754                  */
2755                 bytes_to_alloc = maxpktlen + headroom;
2756
2757                 /*
2758                  * Allocate a single contiguous buffer of the smallest available
2759                  * size when possible
2760                  */
2761                 if (bytes_to_alloc > MCLBYTES &&
2762                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2763                         freelist = m_getpackets_internal(
2764                                 (unsigned int *)&num_needed,
2765                                 num_needed, M_WAIT, 1,
2766                                 MBIGCLBYTES);
2767                 } else if (bytes_to_alloc > _MHLEN &&
2768                     bytes_to_alloc <= MCLBYTES) {
2769                         freelist = m_getpackets_internal(
2770                                 (unsigned int *)&num_needed,
2771                                 num_needed, M_WAIT, 1,
2772                                 MCLBYTES);
2773                 } else {
2774                         freelist = m_allocpacket_internal(
2775                                 (unsigned int *)&num_needed,
2776                                 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2777                 }
2778
2779                 if (freelist == NULL) {
2780                         socket_lock(so, 0);
2781                         error = ENOMEM;
2782                         goto release;
2783                 }
2784                 /*
2785                  * Copy each uio of the set into its own mbuf packet
2786                  */
2787                 for (i = uiofirst, m = freelist;
2788                     i < uiolast && m != NULL;
2789                     i++) {
2790                         int bytes_to_copy;
2791                         struct mbuf *n;
2792                         struct uio *auio = uioarray[i];
2793
2794                         bytes_to_copy = uio_resid(auio);
2795
2796                         /* Do nothing for empty messages */
2797                         if (bytes_to_copy == 0) {
2798                                 continue;
2799                         }
2800                         /*
2801                          * Leave headroom for protocol headers
2802                          * in the first mbuf of the chain
2803                          */
2804                         m->m_data += headroom;
2805
2806                         for (n = m; n != NULL; n = n->m_next) {
                                     /*
                                      * Room available in the segment being
                                      * filled (n); only the packet head (m)
                                      * carries the headroom offset
                                      */
2807                                 if ((n->m_flags & M_EXT)) {
2808                                         mlen = n->m_ext.ext_size -
2809                                             M_LEADINGSPACE(n);
2810                                 } else if ((n->m_flags & M_PKTHDR)) {
2811                                         mlen =
2812                                             MHLEN - M_LEADINGSPACE(n);
2813                                 } else {
2814                                         mlen = MLEN - M_LEADINGSPACE(n);
2815                                 }
2816                                 len = imin(mlen, bytes_to_copy);
2817
2818                                 /*
2819                                  * Note: uiomove() decrements the iovec
2820                                  * length
2821                                  */
2822                                 error = uiomove(mtod(n, caddr_t),
2823                                     len, auio);
2824                                 if (error != 0) {
2825                                         break;
2826                                 }
2827                                 n->m_len = len;
2828                                 m->m_pkthdr.len += len;
2829
2830                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2831
2832                                 bytes_to_copy -= len;
2833                                 resid -= len;
2834                         }
2835                         if (m->m_pkthdr.len == 0) {
2836                                 printf(
2837                                         "%s:%d so %llx pkt %llx type %u len null\n",
2838                                         __func__, __LINE__,
2839                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2840                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2841                                         m->m_type);
2842                         }
2843                         if (error != 0) {
2844                                 break;
2845                         }
2846                         m = m->m_nextpkt;
2847                 }
2848
2849                 socket_lock(so, 0);
2850
2851                 if (error) {
2852                         goto release;
2853                 }
                     /* From here on the packet list is owned through top */
2854                 top = freelist;
2855                 freelist = NULL;
2856
2857                 if (dontroute) {
2858                         so->so_options |= SO_DONTROUTE;
2859                 }
2860
2861                 if ((flags & MSG_SKIPCFIL) == 0) {
                             /*
                              * m_nextpkt link of the last packet kept on the
                              * list; NULL while no packet has been kept yet,
                              * in which case top itself is the link to patch.
                              *
                              * NOTE(review): sflt_data_out() is handed &m; if
                              * a filter substitutes a different mbuf, the
                              * predecessor's link is not refreshed here --
                              * confirm whether that can happen.
                              */
2862                         struct mbuf **prevnextp = NULL;
2863
2864                         for (i = uiofirst, m = top;
2865                             i < uiolast && m != NULL;
2866                             i++) {
2867                                 struct mbuf *nextpkt = m->m_nextpkt;
2868
2869                                 /*
2870                                  * Socket filter processing
2871                                  */
2872                                 error = sflt_data_out(so, NULL, &m,
2873                                     NULL, 0);
2874                                 if (error != 0 && error != EJUSTRETURN) {
2875                                         goto release;
2876                                 }
2877
2878 #if CONTENT_FILTER
2879                                 if (error == 0) {
2880                                         /*
2881                                          * Content filter processing
2882                                          */
2883                                         error = cfil_sock_data_out(so, NULL, m,
2884                                             NULL, 0);
2885                                         if (error != 0 && error != EJUSTRETURN) {
2886                                                 goto release;
2887                                         }
2888                                 }
2889 #endif /* CONTENT_FILTER */
2890                                 /*
2891                                  * Remove packet from the list when
2892                                  * swallowed by a filter
2893                                  */
2894                                 if (error == EJUSTRETURN) {
2895                                         error = 0;
2896                                         if (prevnextp != NULL) {
2897                                                 *prevnextp = nextpkt;
2898                                         } else {
2899                                                 top = nextpkt;
2900                                         }
2901                                 } else if (m != NULL) {
                                             /*
                                              * Packet stays on the list: its
                                              * link is the one to patch if a
                                              * later packet is swallowed
                                              */
                                             prevnextp = &m->m_nextpkt;
                                     }
2902
2903                                 m = nextpkt;
2907                         }
2908                 }
2909                 if (top != NULL) {
2910                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2911                             (so, 0, top, NULL, NULL, p);
2912                 }
2913
2914                 if (dontroute) {
2915                         so->so_options &= ~SO_DONTROUTE;
2916                 }
2917
2918                 top = NULL;
2919                 uiofirst = uiolast;
2920         } while (resid > 0 && error == 0);
2921 release:
2922         if (sblocked) {
2923                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2924         } else {
2925                 socket_unlock(so, 1);
2926         }
2927 out:
             /*
              * top is only still set when a filter failed; it is then a list
              * of packets linked through m_nextpkt, so free the whole list
              */
2928         if (top != NULL) {
2929                 m_freem_list(top);
2930         }
2931         if (freelist != NULL) {
2932                 m_freem_list(freelist);
2933         }
2934
2935         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2936             so->so_snd.sb_cc, 0, error);
2937
2938         return error;
2939 }
2940
2941 /*
      * Handle the MT_SONAME (sender address) mbuf at the head of the record
      * in *mp.
      *
      * Parameters:
      *	p		process on whose behalf the receive is done; the MAC
      *			check is skipped when it is kernproc
      *	so		socket being read; so->so_rcv is updated
      *	psa		if not NULL, *psa receives a copy of the address made
      *			with dup_sockaddr()
      *	flags		MSG_PEEK leaves the address mbuf in the buffer;
      *			MSG_NEEDSA makes a failed address copy an error
      *	mp		in: the MT_SONAME mbuf; out: the mbuf to continue with
      *	nextrecordp	in/out: the record following the current one
      *	canwait		passed through to dup_sockaddr()
      *
      * Returns 0 on success, ENOTCONN if the socket was found defunct after
      * the lock was re-taken, EWOULDBLOCK if the address could not be copied
      * and MSG_NEEDSA is set.
      *
      * The socket lock is dropped and re-taken around the MAC policy check.
      *
2942  * May return ERESTART when packet is dropped by MAC policy check
2943  */
2944 static int
2945 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2946     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2947 {
2948         int error = 0;
2949         struct mbuf *m = *mp;
2950         struct mbuf *nextrecord = *nextrecordp;
2951
2952         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2953 #if CONFIG_MACF_SOCKET_SUBSET
2954         /*
2955          * Call the MAC framework for policy checking if we're in
2956          * the user process context and the socket isn't connected.
2957          */
2958         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2959                 struct mbuf *m0 = m;
2960                 /*
2961                  * Dequeue this record (temporarily) from the receive
2962                  * list since we're about to drop the socket's lock
2963                  * where a new record may arrive and be appended to
2964                  * the list.  Upon MAC policy failure, the record
2965                  * will be freed.  Otherwise, we'll add it back to
2966                  * the head of the list.  We cannot rely on SB_LOCK
2967                  * because append operation uses the socket's lock.
2968                  */
2969                 do {
2970                         m->m_nextpkt = NULL;
2971                         sbfree(&so->so_rcv, m);
2972                         m = m->m_next;
2973                 } while (m != NULL);
2974                 m = m0;
2975                 so->so_rcv.sb_mb = nextrecord;
2976                 SB_EMPTY_FIXUP(&so->so_rcv);
2977                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2978                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2979                 socket_unlock(so, 0);
2980
2981                 if (mac_socket_check_received(proc_ucred(p), so,
2982                     mtod(m, struct sockaddr *)) != 0) {
2983                         /*
2984                          * MAC policy failure; free this record and
2985                          * process the next record (or block until
2986                          * one is available).  We have adjusted sb_cc
2987                          * and sb_mbcnt above so there is no need to
2988                          * call sbfree() again.
2989                          */
2990                         m_freem(m);
2991                         /*
2992                          * Clear SB_LOCK but don't unlock the socket.
2993                          * Process the next record or wait for one.
2994                          */
2995                         socket_lock(so, 0);
2996                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2997                         error = ERESTART;
                             /*
                              * NOTE(review): m was freed above and is still
                              * stored into *mp at "done"; callers must not
                              * use *mp when an error is returned.
                              */
2998                         goto done;
2999                 }
3000                 socket_lock(so, 0);
3001                 /*
3002                  * If the socket has been defunct'd, drop it.
3003                  */
3004                 if (so->so_flags & SOF_DEFUNCT) {
3005                         m_freem(m);
3006                         error = ENOTCONN;
                             /*
                              * NOTE(review): as on the MAC failure path, *mp
                              * ends up holding the freed record's address.
                              */
3007                         goto done;
3008                 }
3009                 /*
3010                  * Re-adjust the socket receive list and re-enqueue
3011                  * the record in front of any packets which may have
3012                  * been appended while we dropped the lock.
3013                  */
3014                 for (m = m0; m->m_next != NULL; m = m->m_next) {
3015                         sballoc(&so->so_rcv, m);
3016                 }
                     /* The loop stops on the last mbuf; account for it too */
3017                 sballoc(&so->so_rcv, m);
                     /* Buffer is otherwise empty: this record is also its tail */
3018                 if (so->so_rcv.sb_mb == NULL) {
3019                         so->so_rcv.sb_lastrecord = m0;
3020                         so->so_rcv.sb_mbtail = m;
3021                 }
3022                 m = m0;
                     /* Whatever is queued now becomes the record after this one */
3023                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3024                 so->so_rcv.sb_mb = m;
3025                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3026                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3027         }
3028 #endif /* CONFIG_MACF_SOCKET_SUBSET */
             /* Give the caller its own copy of the sender address if requested */
3029         if (psa != NULL) {
3030                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3031                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3032                         error = EWOULDBLOCK;
3033                         goto done;
3034                 }
3035         }
3036         if (flags & MSG_PEEK) {
                     /* Peeking: leave the address mbuf queued and step past it */
3037                 m = m->m_next;
3038         } else {
                     /* Consuming: un-account the address mbuf and release it */
3039                 sbfree(&so->so_rcv, m);
3040                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3041                         panic("%s: about to create invalid socketbuf",
3042                             __func__);
3043                         /* NOTREACHED */
3044                 }
                     /*
                      * Presumably MFREE leaves the successor of m in sb_mb
                      * (it is read back on the next line) -- verify against
                      * the macro definition
                      */
3045                 MFREE(m, so->so_rcv.sb_mb);
3046                 m = so->so_rcv.sb_mb;
3047                 if (m != NULL) {
3048                         m->m_nextpkt = nextrecord;
3049                 } else {
3050                         so->so_rcv.sb_mb = nextrecord;
3051                         SB_EMPTY_FIXUP(&so->so_rcv);
3052                 }
3053         }
3054 done:
             /* Report the (possibly changed) position back to the caller */
3055         *mp = m;
3056         *nextrecordp = nextrecord;
3057
3058         return error;
3059 }
3060
3061 /*
3062  * Process one or more MT_CONTROL mbufs present before any data mbufs
3063  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3064  * just copy the data; if !MSG_PEEK, we call into the protocol to
3065  * perform externalization.
3066  */
3067 static int
3068 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3069     struct mbuf **mp, struct mbuf **nextrecordp)
3070 {
3071         int error = 0;
3072         struct mbuf *cm = NULL, *cmn;
3073         struct mbuf **cme = &cm;
3074         struct sockbuf *sb_rcv = &so->so_rcv;
3075         struct mbuf **msgpcm = NULL;
3076         struct mbuf *m = *mp;
3077         struct mbuf *nextrecord = *nextrecordp;
3078         struct protosw *pr = so->so_proto;
3079
3080         /*
3081          * Externalizing the control messages would require us to
3082          * drop the socket's lock below.  Once we re-acquire the
3083          * lock, the mbuf chain might change.  In order to preserve
3084          * consistency, we unlink all control messages from the
3085          * first mbuf chain in one shot and link them separately
3086          * onto a different chain.
3087          */
3088         do {
3089                 if (flags & MSG_PEEK) {
3090                         if (controlp != NULL) {
3091                                 if (*controlp == NULL) {
3092                                         msgpcm = controlp;
3093                                 }
3094                                 *controlp = m_copy(m, 0, m->m_len);
3095
3096                                 /*
3097                                  * If we failed to allocate an mbuf,
3098                                  * release any previously allocated
3099                                  * mbufs for control data. Return
3100                                  * an error. Keep the mbufs in the
3101                                  * socket as this is using
3102                                  * MSG_PEEK flag.
3103                                  */
3104                                 if (*controlp == NULL) {
3105                                         m_freem(*msgpcm);
3106                                         error = ENOBUFS;
3107                                         goto done;
3108                                 }
3109                                 controlp = &(*controlp)->m_next;
3110                         }
3111                         m = m->m_next;
3112                 } else {
3113                         m->m_nextpkt = NULL;
3114                         sbfree(sb_rcv, m);
3115                         sb_rcv->sb_mb = m->m_next;
3116                         m->m_next = NULL;
3117                         *cme = m;
3118                         cme = &(*cme)->m_next;
3119                         m = sb_rcv->sb_mb;
3120                 }
3121         } while (m != NULL && m->m_type == MT_CONTROL);
3122
3123         if (!(flags & MSG_PEEK)) {
3124                 if (sb_rcv->sb_mb != NULL) {
3125                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
3126                 } else {
3127                         sb_rcv->sb_mb = nextrecord;
3128                         SB_EMPTY_FIXUP(sb_rcv);
3129                 }
3130                 if (nextrecord == NULL) {
3131                         sb_rcv->sb_lastrecord = m;
3132                 }
3133         }
3134
3135         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3136         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3137
3138         while (cm != NULL) {
3139                 int cmsg_type;
3140
3141                 cmn = cm->m_next;
3142                 cm->m_next = NULL;
3143                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3144
3145                 /*
3146                  * Call the protocol to externalize SCM_RIGHTS message
3147                  * and return the modified message to the caller upon
3148                  * success.  Otherwise, all other control messages are
3149                  * returned unmodified to the caller.  Note that we
3150                  * only get into this loop if MSG_PEEK is not set.
3151                  */
3152                 if (pr->pr_domain->dom_externalize != NULL &&
3153                     cmsg_type == SCM_RIGHTS) {
3154                         /*
3155                          * Release socket lock: see 3903171.  This
3156                          * would also allow more records to be appended
3157                          * to the socket buffer.  We still have SB_LOCK
3158                          * set on it, so we can be sure that the head
3159                          * of the mbuf chain won't change.
3160                          */
3161                         socket_unlock(so, 0);
3162                         error = (*pr->pr_domain->dom_externalize)(cm);
3163                         socket_lock(so, 0);
3164                 } else {
3165                         error = 0;
3166                 }
3167
3168                 if (controlp != NULL && error == 0) {
3169                         *controlp = cm;
3170                         controlp = &(*controlp)->m_next;
3171                 } else {
3172                         (void) m_free(cm);
3173                 }
3174                 cm = cmn;
3175         }
3176         /*
3177          * Update the value of nextrecord in case we received new
3178          * records when the socket was unlocked above for
3179          * externalizing SCM_RIGHTS.
3180          */
3181         if (m != NULL) {
3182                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3183         } else {
3184                 nextrecord = sb_rcv->sb_mb;
3185         }
3186
3187 done:
3188         *mp = m;
3189         *nextrecordp = nextrecord;
3190
3191         return error;
3192 }
3193
3194 /*
3195  * Implement receive operations on a socket.
3196  * We depend on the way that records are added to the sockbuf
3197  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3198  * must begin with an address if the protocol so specifies,
3199  * followed by an optional mbuf or mbufs containing ancillary data,
3200  * and then zero or more mbufs of data.
3201  * In order to avoid blocking network interrupts for the entire time here,
3202  * we splx() while doing the actual copy to user space.
3203  * Although the sockbuf is locked, new data may still be appended,
3204  * and thus we must maintain consistency of the sockbuf during that time.
3205  *
3206  * The caller may receive the data as a single mbuf chain by supplying
3207  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3208  * only for the count in uio_resid.
3209  *
3210  * Returns:     0                       Success
3211  *              ENOBUFS
3212  *              ENOTCONN
3213  *              EWOULDBLOCK
3214  *      uiomove:EFAULT
3215  *      sblock:EWOULDBLOCK
3216  *      sblock:EINTR
3217  *      sbwait:EBADF
3218  *      sbwait:EINTR
3219  *      sodelayed_copy:EFAULT
3220  *      <pru_rcvoob>:EINVAL[TCP]
3221  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
3222  *      <pru_rcvoob>:???
3223  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3224  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3225  *      <pr_domain->dom_externalize>:???
3226  *
3227  * Notes:       Additional return values from calls through <pru_rcvoob> and
3228  *              <pr_domain->dom_externalize> depend on protocols other than
3229  *              TCP or AF_UNIX, which are documented above.
3230  */
3231 int
3232 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3233     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3234 {
3235         struct mbuf *m, **mp, *ml = NULL;
3236         struct mbuf *nextrecord, *free_list;
3237         int flags, error, offset;
3238         user_ssize_t len;
3239         struct protosw *pr = so->so_proto;
3240         int moff, type = 0;
3241         user_ssize_t orig_resid = uio_resid(uio);
3242         user_ssize_t delayed_copy_len;
3243         int can_delay;
3244         int need_event;
3245         struct proc *p = current_proc();
3246         boolean_t en_tracing = FALSE;
3247
3248         /*
3249          * Sanity check on the length passed by caller as we are making 'int'
3250          * comparisons
3251          */
3252         if (orig_resid < 0 || orig_resid > INT_MAX) {
3253                 return EINVAL;
3254         }
3255
3256         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3257             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3258             so->so_rcv.sb_hiwat);
3259
3260         socket_lock(so, 1);
3261         so_update_last_owner_locked(so, p);
3262         so_update_policy(so);
3263
3264 #ifdef MORE_LOCKING_DEBUG
3265         if (so->so_usecount == 1) {
3266                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3267                 /* NOTREACHED */
3268         }
3269 #endif
3270         mp = mp0;
3271         if (psa != NULL) {
3272                 *psa = NULL;
3273         }
3274         if (controlp != NULL) {
3275                 *controlp = NULL;
3276         }
3277         if (flagsp != NULL) {
3278                 flags = *flagsp & ~MSG_EOR;
3279         } else {
3280                 flags = 0;
3281         }
3282
3283         /*
3284          * If a recv attempt is made on a previously-accepted socket
3285          * that has been marked as inactive (disconnected), reject
3286          * the request.
3287          */
3288         if (so->so_flags & SOF_DEFUNCT) {
3289                 struct sockbuf *sb = &so->so_rcv;
3290
3291                 error = ENOTCONN;
3292                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3293                     __func__, proc_pid(p), proc_best_name(p),
3294                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3295                     SOCK_DOM(so), SOCK_TYPE(so), error);
3296                 /*
3297                  * This socket should have been disconnected and flushed
3298                  * prior to being returned from sodefunct(); there should
3299                  * be no data on its receive list, so panic otherwise.
3300                  */
3301                 if (so->so_state & SS_DEFUNCT) {
3302                         sb_empty_assert(sb, __func__);
3303                 }
3304                 socket_unlock(so, 1);
3305                 return error;
3306         }
3307
3308         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3309             pr->pr_usrreqs->pru_preconnect) {
3310                 /*
3311                  * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3312                  * call write() right after this. *If* the app calls a read,
3313                  * we do not want to block this read indefinitely. Thus,
3314                  * we trigger a connect so that the session gets initiated.
3315                  */
3316                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3317
3318                 if (error) {
3319                         socket_unlock(so, 1);
3320                         return error;
3321                 }
3322         }
3323
3324         if (ENTR_SHOULDTRACE &&
3325             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3326                 /*
3327                  * enable energy tracing for inet sockets that go over
3328                  * non-loopback interfaces only.
3329                  */
3330                 struct inpcb *inp = sotoinpcb(so);
3331                 if (inp->inp_last_outifp != NULL &&
3332                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3333                         en_tracing = TRUE;
3334                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3335                             VM_KERNEL_ADDRPERM(so),
3336                             ((so->so_state & SS_NBIO) ?
3337                             kEnTrFlagNonBlocking : 0),
3338                             (int64_t)orig_resid);
3339                 }
3340         }
3341
3342         /*
3343          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3344          * regardless of the flags argument. Here is the case where
3345          * out-of-band data is not inline.
3346          */
3347         if ((flags & MSG_OOB) ||
3348             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3349             (so->so_options & SO_OOBINLINE) == 0 &&
3350             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3351                 m = m_get(M_WAIT, MT_DATA);
3352                 if (m == NULL) {
3353                         socket_unlock(so, 1);
3354                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3355                             ENOBUFS, 0, 0, 0, 0);
3356                         return ENOBUFS;
3357                 }
3358                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3359                 if (error) {
3360                         goto bad;
3361                 }
3362                 socket_unlock(so, 0);
3363                 do {
3364                         error = uiomove(mtod(m, caddr_t),
3365                             imin(uio_resid(uio), m->m_len), uio);
3366                         m = m_free(m);
3367                 } while (uio_resid(uio) && error == 0 && m != NULL);
3368                 socket_lock(so, 0);
3369 bad:
3370                 if (m != NULL) {
3371                         m_freem(m);
3372                 }
3373
3374                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3375                         if (error == EWOULDBLOCK || error == EINVAL) {
3376                                 /*
3377                                  * Let's try to get normal data:
3378                                  * EWOULDBLOCK: out-of-band data not
3379                                  * received yet. EINVAL: out-of-band data
3380                                  * already read.
3381                                  */
3382                                 error = 0;
3383                                 goto nooob;
3384                         } else if (error == 0 && flagsp != NULL) {
3385                                 *flagsp |= MSG_OOB;
3386                         }
3387                 }
3388                 socket_unlock(so, 1);
3389                 if (en_tracing) {
3390                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3391                             VM_KERNEL_ADDRPERM(so), 0,
3392                             (int64_t)(orig_resid - uio_resid(uio)));
3393                 }
3394                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3395                     0, 0, 0, 0);
3396
3397                 return error;
3398         }
3399 nooob:
3400         if (mp != NULL) {
3401                 *mp = NULL;
3402         }
3403
3404         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3405                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3406         }
3407
3408         free_list = NULL;
3409         delayed_copy_len = 0;
3410 restart:
3411 #ifdef MORE_LOCKING_DEBUG
3412         if (so->so_usecount <= 1) {
3413                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3414                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3415         }
3416 #endif
3417         /*
3418          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3419          * and if so just return to the caller.  This could happen when
3420          * soreceive() is called by a socket upcall function during the
3421          * time the socket is freed.  The socket buffer would have been
3422          * locked across the upcall, therefore we cannot put this thread
3423          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3424          * we may livelock), because the lock on the socket buffer will
3425          * only be released when the upcall routine returns to its caller.
3426          * Because the socket has been officially closed, there can be
3427          * no further read on it.
3428          *
3429          * A multipath subflow socket would have its SS_NOFDREF set by
3430          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3431          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3432          */
3433         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3434             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3435                 socket_unlock(so, 1);
3436                 return 0;
3437         }
3438
3439         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3440         if (error) {
3441                 socket_unlock(so, 1);
3442                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3443                     0, 0, 0, 0);
3444                 if (en_tracing) {
3445                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3446                             VM_KERNEL_ADDRPERM(so), 0,
3447                             (int64_t)(orig_resid - uio_resid(uio)));
3448                 }
3449                 return error;
3450         }
3451
3452         m = so->so_rcv.sb_mb;
3453         /*
3454          * If we have less data than requested, block awaiting more
3455          * (subject to any timeout) if:
3456          *   1. the current count is less than the low water mark, or
3457          *   2. MSG_WAITALL is set, and it is possible to do the entire
3458          *      receive operation at once if we block (resid <= hiwat).
3459          *   3. MSG_DONTWAIT is not set
3460          * If MSG_WAITALL is set but resid is larger than the receive buffer,
3461          * we have to do the receive in sections, and thus risk returning
3462          * a short count if a timeout or signal occurs after we start.
3463          */
3464         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3465             so->so_rcv.sb_cc < uio_resid(uio)) &&
3466             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3467             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3468             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3469                 /*
3470                  * Panic if we notice inconsistencies in the socket's
3471                  * receive list; both sb_mb and sb_cc should correctly
3472                  * reflect the contents of the list, otherwise we may
3473                  * end up with false positives during select() or poll()
3474                  * which could put the application in a bad state.
3475                  */
3476                 SB_MB_CHECK(&so->so_rcv);
3477
3478                 if (so->so_error) {
3479                         if (m != NULL) {
3480                                 goto dontblock;
3481                         }
3482                         error = so->so_error;
3483                         if ((flags & MSG_PEEK) == 0) {
3484                                 so->so_error = 0;
3485                         }
3486                         goto release;
3487                 }
3488                 if (so->so_state & SS_CANTRCVMORE) {
3489 #if CONTENT_FILTER
3490                         /*
3491                          * Deal with half closed connections
3492                          */
3493                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3494                             cfil_sock_data_pending(&so->so_rcv) != 0) {
3495                                 CFIL_LOG(LOG_INFO,
3496                                     "so %llx ignore SS_CANTRCVMORE",
3497                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3498                         } else
3499 #endif /* CONTENT_FILTER */
3500                         if (m != NULL) {
3501                                 goto dontblock;
3502                         } else {
3503                                 goto release;
3504                         }
3505                 }
3506                 for (; m != NULL; m = m->m_next) {
3507                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3508                                 m = so->so_rcv.sb_mb;
3509                                 goto dontblock;
3510                         }
3511                 }
3512                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3513                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3514                         error = ENOTCONN;
3515                         goto release;
3516                 }
3517                 if (uio_resid(uio) == 0) {
3518                         goto release;
3519                 }
3520
3521                 if ((so->so_state & SS_NBIO) ||
3522                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3523                         error = EWOULDBLOCK;
3524                         goto release;
3525                 }
3526                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3527                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3528                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3529 #if EVEN_MORE_LOCKING_DEBUG
3530                 if (socket_debug) {
3531                         printf("Waiting for socket data\n");
3532                 }
3533 #endif
3534
3535                 error = sbwait(&so->so_rcv);
3536 #if EVEN_MORE_LOCKING_DEBUG
3537                 if (socket_debug) {
3538                         printf("SORECEIVE - sbwait returned %d\n", error);
3539                 }
3540 #endif
3541                 if (so->so_usecount < 1) {
3542                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3543                             __func__, so, so->so_usecount);
3544                         /* NOTREACHED */
3545                 }
3546                 if (error) {
3547                         socket_unlock(so, 1);
3548                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3549                             0, 0, 0, 0);
3550                         if (en_tracing) {
3551                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3552                                     VM_KERNEL_ADDRPERM(so), 0,
3553                                     (int64_t)(orig_resid - uio_resid(uio)));
3554                         }
3555                         return error;
3556                 }
3557                 goto restart;
3558         }
3559 dontblock:
3560         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3561         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3562         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3563         nextrecord = m->m_nextpkt;
3564
3565         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3566                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3567                     mp0 == NULL);
3568                 if (error == ERESTART) {
3569                         goto restart;
3570                 } else if (error != 0) {
3571                         goto release;
3572                 }
3573                 orig_resid = 0;
3574         }
3575
3576         /*
3577          * Process one or more MT_CONTROL mbufs present before any data mbufs
3578          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3579          * just copy the data; if !MSG_PEEK, we call into the protocol to
3580          * perform externalization.
3581          */
3582         if (m != NULL && m->m_type == MT_CONTROL) {
3583                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3584                 if (error != 0) {
3585                         goto release;
3586                 }
3587                 orig_resid = 0;
3588         }
3589
3590         /*
3591          * If the socket is a TCP socket with message delivery
3592          * enabled, then create a control msg to deliver the
3593          * relative TCP sequence number for this data. Waiting
3594          * until this point will protect against failures to
3595          * allocate an mbuf for control msgs.
3596          */
3597         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3598             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3599                 struct mbuf *seq_cm;
3600
3601                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3602                     sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3603                 if (seq_cm == NULL) {
3604                         /* unable to allocate a control mbuf */
3605                         error = ENOBUFS;
3606                         goto release;
3607                 }
3608                 *controlp = seq_cm;
3609                 controlp = &seq_cm->m_next;
3610         }
3611
3612         if (m != NULL) {
3613                 if (!(flags & MSG_PEEK)) {
3614                         /*
3615                          * We get here because m points to an mbuf following
3616                          * any MT_SONAME or MT_CONTROL mbufs which have been
3617                          * processed above.  In any case, m should be pointing
3618                          * to the head of the mbuf chain, and the nextrecord
3619                          * should be either NULL or equal to m->m_nextpkt.
3620                          * See comments above about SB_LOCK.
3621                          */
3622                         if (m != so->so_rcv.sb_mb ||
3623                             m->m_nextpkt != nextrecord) {
3624                                 panic("%s: post-control !sync so=%p m=%p "
3625                                     "nextrecord=%p\n", __func__, so, m,
3626                                     nextrecord);
3627                                 /* NOTREACHED */
3628                         }
3629                         if (nextrecord == NULL) {
3630                                 so->so_rcv.sb_lastrecord = m;
3631                         }
3632                 }
3633                 type = m->m_type;
3634                 if (type == MT_OOBDATA) {
3635                         flags |= MSG_OOB;
3636                 }
3637         } else {
3638                 if (!(flags & MSG_PEEK)) {
3639                         SB_EMPTY_FIXUP(&so->so_rcv);
3640                 }
3641         }
3642         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3643         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3644
3645         moff = 0;
3646         offset = 0;
3647
3648         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3649                 can_delay = 1;
3650         } else {
3651                 can_delay = 0;
3652         }
3653
3654         need_event = 0;
3655
3656         while (m != NULL &&
3657             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3658                 if (m->m_type == MT_OOBDATA) {
3659                         if (type != MT_OOBDATA) {
3660                                 break;
3661                         }
3662                 } else if (type == MT_OOBDATA) {
3663                         break;
3664                 }
3665                 /*
3666                  * Make sure to always set MSG_OOB event when getting
3667                  * out of band data inline.
3668                  */
3669                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3670                     (so->so_options & SO_OOBINLINE) != 0 &&
3671                     (so->so_state & SS_RCVATMARK) != 0) {
3672                         flags |= MSG_OOB;
3673                 }
3674                 so->so_state &= ~SS_RCVATMARK;
3675                 len = uio_resid(uio) - delayed_copy_len;
3676                 if (so->so_oobmark && len > so->so_oobmark - offset) {
3677                         len = so->so_oobmark - offset;
3678                 }
3679                 if (len > m->m_len - moff) {
3680                         len = m->m_len - moff;
3681                 }
3682                 /*
3683                  * If mp is set, just pass back the mbufs.
3684                  * Otherwise copy them out via the uio, then free.
3685                  * Sockbuf must be consistent here (points to current mbuf,
3686                  * it points to next record) when we drop priority;
3687                  * we must note any additions to the sockbuf when we
3688                  * block interrupts again.
3689                  */
3690                 if (mp == NULL) {
3691                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3692                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3693                         if (can_delay && len == m->m_len) {
3694                                 /*
3695                                  * only delay the copy if we're consuming the
3696                                  * mbuf and we're NOT in MSG_PEEK mode
3697                                  * and we have enough data to make it worthwhile
3698                                  * to drop and retake the lock... can_delay
3699                                  * reflects the state of the 2 latter
3700                                  * constraints moff should always be zero
3701                                  * in these cases
3702                                  */
3703                                 delayed_copy_len += len;
3704                         } else {
3705                                 if (delayed_copy_len) {
3706                                         error = sodelayed_copy(so, uio,
3707                                             &free_list, &delayed_copy_len);
3708
3709                                         if (error) {
3710                                                 goto release;
3711                                         }
3712                                         /*
3713                                          * can only get here if MSG_PEEK is not
3714                                          * set therefore, m should point at the
3715                                          * head of the rcv queue; if it doesn't,
3716                                          * it means something drastically
3717                                          * changed while we were out from behind
3718                                          * the lock in sodelayed_copy. perhaps
3719                                          * a RST on the stream. in any event,
3720                                          * the stream has been interrupted. it's
3721                                          * probably best just to return whatever
3722                                          * data we've moved and let the caller
3723                                          * sort it out...
3724                                          */
3725                                         if (m != so->so_rcv.sb_mb) {
3726                                                 break;
3727                                         }
3728                                 }
3729                                 socket_unlock(so, 0);
3730                                 error = uiomove(mtod(m, caddr_t) + moff,
3731                                     (int)len, uio);
3732                                 socket_lock(so, 0);
3733
3734                                 if (error) {
3735                                         goto release;
3736                                 }
3737                         }
3738                 } else {
3739                         uio_setresid(uio, (uio_resid(uio) - len));
3740                 }
3741                 if (len == m->m_len - moff) {
3742                         if (m->m_flags & M_EOR) {
3743                                 flags |= MSG_EOR;
3744                         }
3745                         if (flags & MSG_PEEK) {
3746                                 m = m->m_next;
3747                                 moff = 0;
3748                         } else {
3749                                 nextrecord = m->m_nextpkt;
3750                                 sbfree(&so->so_rcv, m);
3751                                 m->m_nextpkt = NULL;
3752
3753                                 /*
3754                                  * If this packet is an unordered packet
3755                                  * (indicated by M_UNORDERED_DATA flag), remove
3756                                  * the additional bytes added to the
3757                                  * receive socket buffer size.
3758                                  */
3759                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3760                                     m->m_len &&
3761                                     (m->m_flags & M_UNORDERED_DATA) &&
3762                                     sbreserve(&so->so_rcv,
3763                                     so->so_rcv.sb_hiwat - m->m_len)) {
3764                                         if (so->so_msg_state->msg_uno_bytes >
3765                                             m->m_len) {
3766                                                 so->so_msg_state->
3767                                                 msg_uno_bytes -= m->m_len;
3768                                         } else {
3769                                                 so->so_msg_state->
3770                                                 msg_uno_bytes = 0;
3771                                         }
3772                                         m->m_flags &= ~M_UNORDERED_DATA;
3773                                 }
3774
3775                                 if (mp != NULL) {
3776                                         *mp = m;
3777                                         mp = &m->m_next;
3778                                         so->so_rcv.sb_mb = m = m->m_next;
3779                                         *mp = NULL;
3780                                 } else {
3781                                         if (free_list == NULL) {
3782                                                 free_list = m;
3783                                         } else {
3784                                                 ml->m_next = m;
3785                                         }
3786                                         ml = m;
3787                                         so->so_rcv.sb_mb = m = m->m_next;
3788                                         ml->m_next = NULL;
3789                                 }
3790                                 if (m != NULL) {
3791                                         m->m_nextpkt = nextrecord;
3792                                         if (nextrecord == NULL) {
3793                                                 so->so_rcv.sb_lastrecord = m;
3794                                         }
3795                                 } else {
3796                                         so->so_rcv.sb_mb = nextrecord;
3797                                         SB_EMPTY_FIXUP(&so->so_rcv);
3798                                 }
3799                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3800                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3801                         }
3802                 } else {
3803                         if (flags & MSG_PEEK) {
3804                                 moff += len;
3805                         } else {
3806                                 if (mp != NULL) {
3807                                         int copy_flag;
3808
3809                                         if (flags & MSG_DONTWAIT) {
3810                                                 copy_flag = M_DONTWAIT;
3811                                         } else {
3812                                                 copy_flag = M_WAIT;
3813                                         }
3814                                         *mp = m_copym(m, 0, len, copy_flag);
3815                                         /*
3816                                          * Failed to allocate an mbuf?
3817                                          * Adjust uio_resid back, it was
3818                                          * adjusted down by len bytes which
3819                                          * we didn't copy over.
3820                                          */
3821                                         if (*mp == NULL) {
3822                                                 uio_setresid(uio,
3823                                                     (uio_resid(uio) + len));
3824                                                 break;
3825                                         }
3826                                 }
3827                                 m->m_data += len;
3828                                 m->m_len -= len;
3829                                 so->so_rcv.sb_cc -= len;
3830                         }
3831                 }
3832                 if (so->so_oobmark) {
3833                         if ((flags & MSG_PEEK) == 0) {
3834                                 so->so_oobmark -= len;
3835                                 if (so->so_oobmark == 0) {
3836                                         so->so_state |= SS_RCVATMARK;
3837                                         /*
3838                                          * delay posting the actual event until
3839                                          * after any delayed copy processing
3840                                          * has finished
3841                                          */
3842                                         need_event = 1;
3843                                         break;
3844                                 }
3845                         } else {
3846                                 offset += len;
3847                                 if (offset == so->so_oobmark) {
3848                                         break;
3849                                 }
3850                         }
3851                 }
3852                 if (flags & MSG_EOR) {
3853                         break;
3854                 }
3855                 /*
3856                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3857                  * (for non-atomic socket), we must not quit until
3858                  * "uio->uio_resid == 0" or an error termination.
3859                  * If a signal/timeout occurs, return with a short
3860                  * count but without error.  Keep sockbuf locked
3861                  * against other readers.
3862                  */
3863                 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3864                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3865                     !sosendallatonce(so) && !nextrecord) {
3866                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3867 #if CONTENT_FILTER
3868                             && cfil_sock_data_pending(&so->so_rcv) == 0
3869 #endif /* CONTENT_FILTER */
3870                             )) {
3871                                 goto release;
3872                         }
3873
3874                         /*
3875                          * Depending on the protocol (e.g. TCP), the following
3876                          * might cause the socket lock to be dropped and later
3877                          * be reacquired, and more data could have arrived and
3878                          * have been appended to the receive socket buffer by
3879                          * the time it returns.  Therefore, we only sleep in
3880                          * sbwait() below if and only if the socket buffer is
3881                          * empty, in order to avoid a false sleep.
3882                          */
3883                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3884                             (((struct inpcb *)so->so_pcb)->inp_state !=
3885                             INPCB_STATE_DEAD)) {
3886                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3887                         }
3888
3889                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3890                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3891
3892                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3893                                 error = 0;
3894                                 goto release;
3895                         }
3896                         /*
3897                          * have to wait until after we get back from the sbwait
3898                          * to do the copy because we will drop the lock if we
3899                          * have enough data that has been delayed... by dropping
3900                          * the lock we open up a window allowing the netisr
3901                          * thread to process the incoming packets and to change
3902                          * the state of this socket... we're issuing the sbwait
3903                          * because the socket is empty and we're expecting the
3904                          * netisr thread to wake us up when more packets arrive;
3905                          * if we allow that processing to happen and then sbwait
3906                          * we could stall forever with packets sitting in the
3907                          * socket if no further packets arrive from the remote
3908                          * side.
3909                          *
3910                          * we want to copy before we've collected all the data
3911                          * to satisfy this request to allow the copy to overlap
3912                          * the incoming packet processing on an MP system
3913                          */
3914                         if (delayed_copy_len > sorecvmincopy &&
3915                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3916                                 error = sodelayed_copy(so, uio,
3917                                     &free_list, &delayed_copy_len);
3918
3919                                 if (error) {
3920                                         goto release;
3921                                 }
3922                         }
3923                         m = so->so_rcv.sb_mb;
3924                         if (m != NULL) {
3925                                 nextrecord = m->m_nextpkt;
3926                         }
3927                         SB_MB_CHECK(&so->so_rcv);
3928                 }
3929         }
3930 #ifdef MORE_LOCKING_DEBUG
3931         if (so->so_usecount <= 1) {
3932                 panic("%s: after big while so=%p ref=%d on socket\n",
3933                     __func__, so, so->so_usecount);
3934                 /* NOTREACHED */
3935         }
3936 #endif
3937
3938         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3939                 if (so->so_options & SO_DONTTRUNC) {
3940                         flags |= MSG_RCVMORE;
3941                 } else {
3942                         flags |= MSG_TRUNC;
3943                         if ((flags & MSG_PEEK) == 0) {
3944                                 (void) sbdroprecord(&so->so_rcv);
3945                         }
3946                 }
3947         }
3948
3949         /*
3950          * pru_rcvd below (for TCP) may cause more data to be received
3951          * if the socket lock is dropped prior to sending the ACK; some
3952          * legacy OpenTransport applications don't handle this well
3953          * (if it receives less data than requested while MSG_HAVEMORE
3954          * is set), and so we set the flag now based on what we know
3955          * prior to calling pru_rcvd.
3956          */
3957         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3958                 flags |= MSG_HAVEMORE;
3959         }
3960
3961         if ((flags & MSG_PEEK) == 0) {
3962                 if (m == NULL) {
3963                         so->so_rcv.sb_mb = nextrecord;
3964                         /*
3965                          * First part is an inline SB_EMPTY_FIXUP().  Second
3966                          * part makes sure sb_lastrecord is up-to-date if
3967                          * there is still data in the socket buffer.
3968                          */
3969                         if (so->so_rcv.sb_mb == NULL) {
3970                                 so->so_rcv.sb_mbtail = NULL;
3971                                 so->so_rcv.sb_lastrecord = NULL;
3972                         } else if (nextrecord->m_nextpkt == NULL) {
3973                                 so->so_rcv.sb_lastrecord = nextrecord;
3974                         }
3975                         SB_MB_CHECK(&so->so_rcv);
3976                 }
3977                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3978                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3979                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3980                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3981                 }
3982         }
3983
3984         if (delayed_copy_len) {
3985                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3986                 if (error) {
3987                         goto release;
3988                 }
3989         }
3990         if (free_list != NULL) {
3991                 m_freem_list(free_list);
3992                 free_list = NULL;
3993         }
3994         if (need_event) {
3995                 postevent(so, 0, EV_OOB);
3996         }
3997
3998         if (orig_resid == uio_resid(uio) && orig_resid &&
3999             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4000                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4001                 goto restart;
4002         }
4003
4004         if (flagsp != NULL) {
4005                 *flagsp |= flags;
4006         }
4007 release:
4008 #ifdef MORE_LOCKING_DEBUG
4009         if (so->so_usecount <= 1) {
4010                 panic("%s: release so=%p ref=%d on socket\n", __func__,
4011                     so, so->so_usecount);
4012                 /* NOTREACHED */
4013         }
4014 #endif
4015         if (delayed_copy_len) {
4016                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4017         }
4018
4019         if (free_list != NULL) {
4020                 m_freem_list(free_list);
4021         }
4022
4023         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4024
4025         if (en_tracing) {
4026                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4027                     VM_KERNEL_ADDRPERM(so),
4028                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4029                     (int64_t)(orig_resid - uio_resid(uio)));
4030         }
4031         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4032             so->so_rcv.sb_cc, 0, error);
4033
4034         return error;
4035 }
4036
4037 /*
4038  * Returns:     0                       Success
4039  *      uiomove:EFAULT
4040  */
4041 static int
4042 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4043     user_ssize_t *resid)
4044 {
4045         int error = 0;
4046         struct mbuf *m;
4047
4048         m = *free_list;
4049
4050         socket_unlock(so, 0);
4051
4052         while (m != NULL && error == 0) {
4053                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4054                 m = m->m_next;
4055         }
4056         m_freem_list(*free_list);
4057
4058         *free_list = NULL;
4059         *resid = 0;
4060
4061         socket_lock(so, 0);
4062
4063         return error;
4064 }
4065
4066 static int
4067 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4068     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4069 {
4070 #pragma unused(so)
4071         int error = 0;
4072         struct mbuf *ml, *m;
4073         int i = 0;
4074         struct uio *auio;
4075
4076         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4077             ml = ml->m_nextpkt, i++) {
4078                 auio = msgarray[i].uio;
4079                 for (m = ml; m != NULL; m = m->m_next) {
4080                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4081                         if (error != 0) {
4082                                 goto out;
4083                         }
4084                 }
4085         }
4086 out:
4087         m_freem_list(*free_list);
4088
4089         *free_list = NULL;
4090         *resid = 0;
4091
4092         return error;
4093 }
4094
4095 int
4096 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4097     int *flagsp)
4098 {
4099         struct mbuf *m;
4100         struct mbuf *nextrecord;
4101         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4102         int error;
4103         user_ssize_t len, pktlen, delayed_copy_len = 0;
4104         struct protosw *pr = so->so_proto;
4105         user_ssize_t resid;
4106         struct proc *p = current_proc();
4107         struct uio *auio = NULL;
4108         int npkts = 0;
4109         int sblocked = 0;
4110         struct sockaddr **psa = NULL;
4111         struct mbuf **controlp = NULL;
4112         int can_delay;
4113         int flags;
4114         struct mbuf *free_others = NULL;
4115
4116         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4117             so, uiocnt,
4118             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4119
4120         /*
4121          * Sanity checks:
4122          * - Only supports don't wait flags
4123          * - Only support datagram sockets (could be extended to raw)
4124          * - Must be atomic
4125          * - Protocol must support packet chains
4126          * - The uio array is NULL (should we panic?)
4127          */
4128         if (flagsp != NULL) {
4129                 flags = *flagsp;
4130         } else {
4131                 flags = 0;
4132         }
4133         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4134             MSG_NBIO)) {
4135                 printf("%s invalid flags 0x%x\n", __func__, flags);
4136                 error = EINVAL;
4137                 goto out;
4138         }
4139         if (so->so_type != SOCK_DGRAM) {
4140                 error = EINVAL;
4141                 goto out;
4142         }
4143         if (sosendallatonce(so) == 0) {
4144                 error = EINVAL;
4145                 goto out;
4146         }
4147         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4148                 error = EPROTONOSUPPORT;
4149                 goto out;
4150         }
4151         if (msgarray == NULL) {
4152                 printf("%s uioarray is NULL\n", __func__);
4153                 error = EINVAL;
4154                 goto out;
4155         }
4156         if (uiocnt == 0) {
4157                 printf("%s uiocnt is 0\n", __func__);
4158                 error = EINVAL;
4159                 goto out;
4160         }
4161         /*
4162          * Sanity check on the length passed by caller as we are making 'int'
4163          * comparisons
4164          */
4165         resid = recv_msg_array_resid(msgarray, uiocnt);
4166         if (resid < 0 || resid > INT_MAX) {
4167                 error = EINVAL;
4168                 goto out;
4169         }
4170
4171         if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4172                 can_delay = 1;
4173         } else {
4174                 can_delay = 0;
4175         }
4176
4177         socket_lock(so, 1);
4178         so_update_last_owner_locked(so, p);
4179         so_update_policy(so);
4180
4181 #if NECP
4182         so_update_necp_policy(so, NULL, NULL);
4183 #endif /* NECP */
4184
4185         /*
4186          * If a recv attempt is made on a previously-accepted socket
4187          * that has been marked as inactive (disconnected), reject
4188          * the request.
4189          */
4190         if (so->so_flags & SOF_DEFUNCT) {
4191                 struct sockbuf *sb = &so->so_rcv;
4192
4193                 error = ENOTCONN;
4194                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4195                     __func__, proc_pid(p), proc_best_name(p),
4196                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4197                     SOCK_DOM(so), SOCK_TYPE(so), error);
4198                 /*
4199                  * This socket should have been disconnected and flushed
4200                  * prior to being returned from sodefunct(); there should
4201                  * be no data on its receive list, so panic otherwise.
4202                  */
4203                 if (so->so_state & SS_DEFUNCT) {
4204                         sb_empty_assert(sb, __func__);
4205                 }
4206                 goto release;
4207         }
4208
4209 next:
4210         /*
4211          * The uio may be empty
4212          */
4213         if (npkts >= uiocnt) {
4214                 error = 0;
4215                 goto release;
4216         }
4217 restart:
4218         /*
4219          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4220          * and if so just return to the caller.  This could happen when
4221          * soreceive() is called by a socket upcall function during the
4222          * time the socket is freed.  The socket buffer would have been
4223          * locked across the upcall, therefore we cannot put this thread
4224          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4225          * we may livelock), because the lock on the socket buffer will
4226          * only be released when the upcall routine returns to its caller.
4227          * Because the socket has been officially closed, there can be
4228          * no further read on it.
4229          */
4230         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4231             (SS_NOFDREF | SS_CANTRCVMORE)) {
4232                 error = 0;
4233                 goto release;
4234         }
4235
4236         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4237         if (error) {
4238                 goto release;
4239         }
4240         sblocked = 1;
4241
4242         m = so->so_rcv.sb_mb;
4243         /*
4244          * Block awaiting more datagram if needed
4245          */
4246         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4247             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4248             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4249                 /*
4250                  * Panic if we notice inconsistencies in the socket's
4251                  * receive list; both sb_mb and sb_cc should correctly
4252                  * reflect the contents of the list, otherwise we may
4253                  * end up with false positives during select() or poll()
4254                  * which could put the application in a bad state.
4255                  */
4256                 SB_MB_CHECK(&so->so_rcv);
4257
4258                 if (so->so_error) {
4259                         error = so->so_error;
4260                         if ((flags & MSG_PEEK) == 0) {
4261                                 so->so_error = 0;
4262                         }
4263                         goto release;
4264                 }
4265                 if (so->so_state & SS_CANTRCVMORE) {
4266                         goto release;
4267                 }
4268                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4269                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4270                         error = ENOTCONN;
4271                         goto release;
4272                 }
4273                 if ((so->so_state & SS_NBIO) ||
4274                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4275                         error = EWOULDBLOCK;
4276                         goto release;
4277                 }
4278                 /*
4279                  * Do not block if we got some data
4280                  */
4281                 if (free_list != NULL) {
4282                         error = 0;
4283                         goto release;
4284                 }
4285
4286                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4287                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4288
4289                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4290                 sblocked = 0;
4291
4292                 error = sbwait(&so->so_rcv);
4293                 if (error) {
4294                         goto release;
4295                 }
4296                 goto restart;
4297         }
4298
4299         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4300         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4301         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4302
4303         /*
4304          * Consume the current uio index as we have a datagram
4305          */
4306         auio = msgarray[npkts].uio;
4307         resid = uio_resid(auio);
4308         msgarray[npkts].which |= SOCK_MSG_DATA;
4309         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4310             &msgarray[npkts].psa : NULL;
4311         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4312             &msgarray[npkts].controlp : NULL;
4313         npkts += 1;
4314         nextrecord = m->m_nextpkt;
4315
4316         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4317                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4318                 if (error == ERESTART) {
4319                         goto restart;
4320                 } else if (error != 0) {
4321                         goto release;
4322                 }
4323         }
4324
4325         if (m != NULL && m->m_type == MT_CONTROL) {
4326                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4327                 if (error != 0) {
4328                         goto release;
4329                 }
4330         }
4331
4332         if (m->m_pkthdr.len == 0) {
4333                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4334                     __func__, __LINE__,
4335                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4336                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4337                     m->m_type);
4338         }
4339
4340         /*
4341          * Loop to copy the mbufs of the current record
4342          * Support zero length packets
4343          */
4344         ml = NULL;
4345         pktlen = 0;
4346         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4347                 if (m->m_len == 0) {
4348                         panic("%p m_len zero", m);
4349                 }
4350                 if (m->m_type == 0) {
4351                         panic("%p m_type zero", m);
4352                 }
4353                 /*
4354                  * Clip to the residual length
4355                  */
4356                 if (len > m->m_len) {
4357                         len = m->m_len;
4358                 }
4359                 pktlen += len;
4360                 /*
4361                  * Copy the mbufs via the uio or delay the copy
4362                  * Sockbuf must be consistent here (points to current mbuf,
4363                  * it points to next record) when we drop priority;
4364                  * we must note any additions to the sockbuf when we
4365                  * block interrupts again.
4366                  */
4367                 if (len > 0 && can_delay == 0) {
4368                         socket_unlock(so, 0);
4369                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4370                         socket_lock(so, 0);
4371                         if (error) {
4372                                 goto release;
4373                         }
4374                 } else {
4375                         delayed_copy_len += len;
4376                 }
4377
4378                 if (len == m->m_len) {
4379                         /*
4380                          * m was entirely copied
4381                          */
4382                         sbfree(&so->so_rcv, m);
4383                         nextrecord = m->m_nextpkt;
4384                         m->m_nextpkt = NULL;
4385
4386                         /*
4387                          * Set the first packet to the head of the free list
4388                          */
4389                         if (free_list == NULL) {
4390                                 free_list = m;
4391                         }
4392                         /*
4393                          * Link current packet to tail of free list
4394                          */
4395                         if (ml == NULL) {
4396                                 if (free_tail != NULL) {
4397                                         free_tail->m_nextpkt = m;
4398                                 }
4399                                 free_tail = m;
4400                         }
4401                         /*
4402                          * Link current mbuf to last mbuf of current packet
4403                          */
4404                         if (ml != NULL) {
4405                                 ml->m_next = m;
4406                         }
4407                         ml = m;
4408
4409                         /*
4410                          * Move next buf to head of socket buffer
4411                          */
4412                         so->so_rcv.sb_mb = m = ml->m_next;
4413                         ml->m_next = NULL;
4414
4415                         if (m != NULL) {
4416                                 m->m_nextpkt = nextrecord;
4417                                 if (nextrecord == NULL) {
4418                                         so->so_rcv.sb_lastrecord = m;
4419                                 }
4420                         } else {
4421                                 so->so_rcv.sb_mb = nextrecord;
4422                                 SB_EMPTY_FIXUP(&so->so_rcv);
4423                         }
4424                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4425                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4426                 } else {
4427                         /*
4428                          * Stop the loop on partial copy
4429                          */
4430                         break;
4431                 }
4432         }
4433 #ifdef MORE_LOCKING_DEBUG
4434         if (so->so_usecount <= 1) {
4435                 panic("%s: after big while so=%llx ref=%d on socket\n",
4436                     __func__,
4437                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4438                 /* NOTREACHED */
4439         }
4440 #endif
4441         /*
4442          * Tell the caller we made a partial copy
4443          */
4444         if (m != NULL) {
4445                 if (so->so_options & SO_DONTTRUNC) {
4446                         /*
4447                          * Copyout first the freelist then the partial mbuf
4448                          */
4449                         socket_unlock(so, 0);
4450                         if (delayed_copy_len) {
4451                                 error = sodelayed_copy_list(so, msgarray,
4452                                     uiocnt, &free_list, &delayed_copy_len);
4453                         }
4454
4455                         if (error == 0) {
4456                                 error = uiomove(mtod(m, caddr_t), (int)len,
4457                                     auio);
4458                         }
4459                         socket_lock(so, 0);
4460                         if (error) {
4461                                 goto release;
4462                         }
4463
4464                         m->m_data += len;
4465                         m->m_len -= len;
4466                         so->so_rcv.sb_cc -= len;
4467                         flags |= MSG_RCVMORE;
4468                 } else {
4469                         (void) sbdroprecord(&so->so_rcv);
4470                         nextrecord = so->so_rcv.sb_mb;
4471                         m = NULL;
4472                         flags |= MSG_TRUNC;
4473                 }
4474         }
4475
4476         if (m == NULL) {
4477                 so->so_rcv.sb_mb = nextrecord;
4478                 /*
4479                  * First part is an inline SB_EMPTY_FIXUP().  Second
4480                  * part makes sure sb_lastrecord is up-to-date if
4481                  * there is still data in the socket buffer.
4482                  */
4483                 if (so->so_rcv.sb_mb == NULL) {
4484                         so->so_rcv.sb_mbtail = NULL;
4485                         so->so_rcv.sb_lastrecord = NULL;
4486                 } else if (nextrecord->m_nextpkt == NULL) {
4487                         so->so_rcv.sb_lastrecord = nextrecord;
4488                 }
4489                 SB_MB_CHECK(&so->so_rcv);
4490         }
4491         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4492         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4493
4494         /*
4495          * We can continue to the next packet as long as:
4496          * - We haven't exhausted the uio array
4497          * - There was no error
4498          * - A packet was not truncated
4499          * - We can still receive more data
4500          */
4501         if (npkts < uiocnt && error == 0 &&
4502             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4503             (so->so_state & SS_CANTRCVMORE) == 0) {
4504                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4505                 sblocked = 0;
4506
4507                 goto next;
4508         }
4509         if (flagsp != NULL) {
4510                 *flagsp |= flags;
4511         }
4512
4513 release:
4514         /*
4515          * pru_rcvd may cause more data to be received if the socket lock
4516          * is dropped so we set MSG_HAVEMORE now based on what we know.
4517          * That way the caller won't be surprised if it receives less data
4518          * than requested.
4519          */
4520         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4521                 flags |= MSG_HAVEMORE;
4522         }
4523
4524         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4525                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4526         }
4527
4528         if (sblocked) {
4529                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4530         } else {
4531                 socket_unlock(so, 1);
4532         }
4533
4534         if (delayed_copy_len) {
4535                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4536                     &free_list, &delayed_copy_len);
4537         }
4538 out:
4539         /*
4540          * Amortize the cost of freeing the mbufs
4541          */
4542         if (free_list != NULL) {
4543                 m_freem_list(free_list);
4544         }
4545         if (free_others != NULL) {
4546                 m_freem_list(free_others);
4547         }
4548
4549         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4550             0, 0, 0, 0);
4551         return error;
4552 }
4553
4554 static int
4555 so_statistics_event_to_nstat_event(int64_t *input_options,
4556     uint64_t *nstat_event)
4557 {
4558         int error = 0;
4559         switch (*input_options) {
4560         case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4561                 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4562                 break;
4563         case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4564                 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4565                 break;
4566 #if (DEBUG || DEVELOPMENT)
4567         case SO_STATISTICS_EVENT_RESERVED_1:
4568                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4569                 break;
4570         case SO_STATISTICS_EVENT_RESERVED_2:
4571                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4572                 break;
4573 #endif /* (DEBUG || DEVELOPMENT) */
4574         default:
4575                 error = EINVAL;
4576                 break;
4577         }
4578         return error;
4579 }
4580
4581 /*
4582  * Returns:     0                       Success
4583  *              EINVAL
4584  *              ENOTCONN
4585  *      <pru_shutdown>:EINVAL
4586  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4587  *      <pru_shutdown>:ENOBUFS[TCP]
4588  *      <pru_shutdown>:EMSGSIZE[TCP]
4589  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4590  *      <pru_shutdown>:ENETUNREACH[TCP]
4591  *      <pru_shutdown>:ENETDOWN[TCP]
4592  *      <pru_shutdown>:ENOMEM[TCP]
4593  *      <pru_shutdown>:EACCES[TCP]
4594  *      <pru_shutdown>:EMSGSIZE[TCP]
4595  *      <pru_shutdown>:ENOBUFS[TCP]
4596  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4597  *      <pru_shutdown>:???              [other protocol families]
4598  */
4599 int
4600 soshutdown(struct socket *so, int how)
4601 {
4602         int error;
4603
4604         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4605
4606         switch (how) {
4607         case SHUT_RD:
4608         case SHUT_WR:
4609         case SHUT_RDWR:
4610                 socket_lock(so, 1);
4611                 if ((so->so_state &
4612                     (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4613                         error = ENOTCONN;
4614                 } else {
4615                         error = soshutdownlock(so, how);
4616                 }
4617                 socket_unlock(so, 1);
4618                 break;
4619         default:
4620                 error = EINVAL;
4621                 break;
4622         }
4623
4624         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4625
4626         return error;
4627 }
4628
4629 int
4630 soshutdownlock_final(struct socket *so, int how)
4631 {
4632         struct protosw *pr = so->so_proto;
4633         int error = 0;
4634
4635         sflt_notify(so, sock_evt_shutdown, &how);
4636
4637         if (how != SHUT_WR) {
4638                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4639                         /* read already shut down */
4640                         error = ENOTCONN;
4641                         goto done;
4642                 }
4643                 sorflush(so);
4644                 postevent(so, 0, EV_RCLOSED);
4645         }
4646         if (how != SHUT_RD) {
4647                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4648                         /* write already shut down */
4649                         error = ENOTCONN;
4650                         goto done;
4651                 }
4652                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4653                 postevent(so, 0, EV_WCLOSED);
4654         }
4655 done:
4656         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4657         return error;
4658 }
4659
/*
 * Shut down a socket, giving an attached content filter the chance to
 * intercept the request first.
 *
 * When CONTENT_FILTER is configured and the socket has
 * SOF_CONTENT_FILTER set, cfil_sock_shutdown() is consulted:
 *   - EJUSTRETURN means the filter took over the shutdown (it may delay
 *     it until pending data has been processed); 0 is returned here;
 *   - any other non-zero value is returned to the caller unchanged;
 *   - 0 falls through to soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_error = cfil_sock_shutdown(so, &how);

                if (cfil_error != 0) {
                        return (cfil_error == EJUSTRETURN) ? 0 : cfil_error;
                }
        }
#endif /* CONTENT_FILTER */

        return soshutdownlock_final(so, how);
}
4686
4687 void
4688 sowflush(struct socket *so)
4689 {
4690         struct sockbuf *sb = &so->so_snd;
4691
4692         /*
4693          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4694          * to prevent the socket buffer from being unexpectedly altered
4695          * while it is used by another thread in socket send/receive.
4696          *
4697          * sblock() must not fail here, hence the assertion.
4698          */
4699         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4700         VERIFY(sb->sb_flags & SB_LOCK);
4701
4702         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4703         sb->sb_flags            |= SB_DROP;
4704         sb->sb_upcall           = NULL;
4705         sb->sb_upcallarg        = NULL;
4706
4707         sbunlock(sb, TRUE);     /* keep socket locked */
4708
4709         selthreadclear(&sb->sb_sel);
4710         sbrelease(sb);
4711 }
4712
4713 void
4714 sorflush(struct socket *so)
4715 {
4716         struct sockbuf *sb = &so->so_rcv;
4717         struct protosw *pr = so->so_proto;
4718         struct sockbuf asb;
4719 #ifdef notyet
4720         lck_mtx_t *mutex_held;
4721         /*
4722          * XXX: This code is currently commented out, because we may get here
4723          * as part of sofreelastref(), and at that time, pr_getlock() may no
4724          * longer be able to return us the lock; this will be fixed in future.
4725          */
4726         if (so->so_proto->pr_getlock != NULL) {
4727                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4728         } else {
4729                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4730         }
4731
4732         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4733 #endif /* notyet */
4734
4735         sflt_notify(so, sock_evt_flush_read, NULL);
4736
4737         socantrcvmore(so);
4738
4739         /*
4740          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4741          * to prevent the socket buffer from being unexpectedly altered
4742          * while it is used by another thread in socket send/receive.
4743          *
4744          * sblock() must not fail here, hence the assertion.
4745          */
4746         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4747         VERIFY(sb->sb_flags & SB_LOCK);
4748
4749         /*
4750          * Copy only the relevant fields from "sb" to "asb" which we
4751          * need for sbrelease() to function.  In particular, skip
4752          * sb_sel as it contains the wait queue linkage, which would
4753          * wreak havoc if we were to issue selthreadclear() on "asb".
4754          * Make sure to not carry over SB_LOCK in "asb", as we need
4755          * to acquire it later as part of sbrelease().
4756          */
4757         bzero(&asb, sizeof(asb));
4758         asb.sb_cc               = sb->sb_cc;
4759         asb.sb_hiwat            = sb->sb_hiwat;
4760         asb.sb_mbcnt            = sb->sb_mbcnt;
4761         asb.sb_mbmax            = sb->sb_mbmax;
4762         asb.sb_ctl              = sb->sb_ctl;
4763         asb.sb_lowat            = sb->sb_lowat;
4764         asb.sb_mb               = sb->sb_mb;
4765         asb.sb_mbtail           = sb->sb_mbtail;
4766         asb.sb_lastrecord       = sb->sb_lastrecord;
4767         asb.sb_so               = sb->sb_so;
4768         asb.sb_flags            = sb->sb_flags;
4769         asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4770         asb.sb_flags            |= SB_DROP;
4771
4772         /*
4773          * Ideally we'd bzero() these and preserve the ones we need;
4774          * but to do that we'd need to shuffle things around in the
4775          * sockbuf, and we can't do it now because there are KEXTS
4776          * that are directly referring to the socket structure.
4777          *
4778          * Setting SB_DROP acts as a barrier to prevent further appends.
4779          * Clearing SB_SEL is done for selthreadclear() below.
4780          */
4781         sb->sb_cc               = 0;
4782         sb->sb_hiwat            = 0;
4783         sb->sb_mbcnt            = 0;
4784         sb->sb_mbmax            = 0;
4785         sb->sb_ctl              = 0;
4786         sb->sb_lowat            = 0;
4787         sb->sb_mb               = NULL;
4788         sb->sb_mbtail           = NULL;
4789         sb->sb_lastrecord       = NULL;
4790         sb->sb_timeo.tv_sec     = 0;
4791         sb->sb_timeo.tv_usec    = 0;
4792         sb->sb_upcall           = NULL;
4793         sb->sb_upcallarg        = NULL;
4794         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4795         sb->sb_flags            |= SB_DROP;
4796
4797         sbunlock(sb, TRUE);     /* keep socket locked */
4798
4799         /*
4800          * Note that selthreadclear() is called on the original "sb" and
4801          * not the local "asb" because of the way wait queue linkage is
4802          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4803          * should no longer be set (cleared above.)
4804          */
4805         selthreadclear(&sb->sb_sel);
4806
4807         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4808                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4809         }
4810
4811         sbrelease(&asb);
4812 }
4813
4814 /*
4815  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4816  * an additional variant to handle the case where the option value needs
4817  * to be some kind of integer, but not a specific size.
4818  * In addition to their use here, these functions are also called by the
4819  * protocol-level pr_ctloutput() routines.
4820  *
4821  * Returns:     0                       Success
4822  *              EINVAL
4823  *      copyin:EFAULT
4824  */
4825 int
4826 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4827 {
4828         size_t  valsize;
4829
4830         /*
4831          * If the user gives us more than we wanted, we ignore it,
4832          * but if we don't get the minimum length the caller
4833          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4834          * is set to however much we actually retrieved.
4835          */
4836         if ((valsize = sopt->sopt_valsize) < minlen) {
4837                 return EINVAL;
4838         }
4839         if (valsize > len) {
4840                 sopt->sopt_valsize = valsize = len;
4841         }
4842
4843         if (sopt->sopt_p != kernproc) {
4844                 return copyin(sopt->sopt_val, buf, valsize);
4845         }
4846
4847         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4848         return 0;
4849 }
4850
4851 /*
4852  * sooptcopyin_timeval
4853  *   Copy in a timeval value into tv_p, and take into account whether the
4854  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4855  *   code here so that we can verify the 64-bit tv_sec value before we lose
4856  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4857  */
4858 static int
4859 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4860 {
4861         int                     error;
4862
4863         if (proc_is64bit(sopt->sopt_p)) {
4864                 struct user64_timeval   tv64;
4865
4866                 if (sopt->sopt_valsize < sizeof(tv64)) {
4867                         return EINVAL;
4868                 }
4869
4870                 sopt->sopt_valsize = sizeof(tv64);
4871                 if (sopt->sopt_p != kernproc) {
4872                         error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4873                         if (error != 0) {
4874                                 return error;
4875                         }
4876                 } else {
4877                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4878                             sizeof(tv64));
4879                 }
4880                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4881                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4882                         return EDOM;
4883                 }
4884
4885                 tv_p->tv_sec = tv64.tv_sec;
4886                 tv_p->tv_usec = tv64.tv_usec;
4887         } else {
4888                 struct user32_timeval   tv32;
4889
4890                 if (sopt->sopt_valsize < sizeof(tv32)) {
4891                         return EINVAL;
4892                 }
4893
4894                 sopt->sopt_valsize = sizeof(tv32);
4895                 if (sopt->sopt_p != kernproc) {
4896                         error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4897                         if (error != 0) {
4898                                 return error;
4899                         }
4900                 } else {
4901                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4902                             sizeof(tv32));
4903                 }
4904 #ifndef __LP64__
4905                 /*
4906                  * K64todo "comparison is always false due to
4907                  * limited range of data type"
4908                  */
4909                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4910                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4911                         return EDOM;
4912                 }
4913 #endif
4914                 tv_p->tv_sec = tv32.tv_sec;
4915                 tv_p->tv_usec = tv32.tv_usec;
4916         }
4917         return 0;
4918 }
4919
4920 int
4921 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4922     boolean_t ignore_delegate)
4923 {
4924         kauth_cred_t cred =  NULL;
4925         proc_t ep = PROC_NULL;
4926         uid_t uid;
4927         int error = 0;
4928
4929         if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4930                 ep = proc_find(so->e_pid);
4931                 if (ep) {
4932                         cred = kauth_cred_proc_ref(ep);
4933                 }
4934         }
4935
4936         uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4937
4938         /* uid is 0 for root */
4939         if (uid != 0 || !allow_root) {
4940                 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4941         }
4942         if (cred) {
4943                 kauth_cred_unref(&cred);
4944         }
4945         if (ep != PROC_NULL) {
4946                 proc_rele(ep);
4947         }
4948
4949         return error;
4950 }
4951
4952 /*
4953  * Returns:     0                       Success
4954  *              EINVAL
4955  *              ENOPROTOOPT
4956  *              ENOBUFS
4957  *              EDOM
4958  *      sooptcopyin:EINVAL
4959  *      sooptcopyin:EFAULT
4960  *      sooptcopyin_timeval:EINVAL
4961  *      sooptcopyin_timeval:EFAULT
4962  *      sooptcopyin_timeval:EDOM
4963  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4964  *      <pr_ctloutput>:???
4965  *      sflt_attach_private:???         [whatever a filter author chooses]
4966  *      <sf_setoption>:???              [whatever a filter author chooses]
4967  *
4968  * Notes:       Other <pr_ctloutput> returns depend on the protocol family; all
4969  *              <sf_setoption> returns depend on what the filter author causes
4970  *              their filter to return.
4971  */
4972 int
4973 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4974 {
4975         int     error, optval;
4976         int64_t long_optval;
4977         struct  linger l;
4978         struct  timeval tv;
4979 #if CONFIG_MACF_SOCKET
4980         struct mac extmac;
4981 #endif /* MAC_SOCKET */
4982
4983         if (sopt->sopt_dir != SOPT_SET) {
4984                 sopt->sopt_dir = SOPT_SET;
4985         }
4986
4987         if (dolock) {
4988                 socket_lock(so, 1);
4989         }
4990
4991         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4992             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4993             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4994                 /* the socket has been shutdown, no more sockopt's */
4995                 error = EINVAL;
4996                 goto out;
4997         }
4998
4999         error = sflt_setsockopt(so, sopt);
5000         if (error != 0) {
5001                 if (error == EJUSTRETURN) {
5002                         error = 0;
5003                 }
5004                 goto out;
5005         }
5006
5007         if (sopt->sopt_level != SOL_SOCKET) {
5008                 if (so->so_proto != NULL &&
5009                     so->so_proto->pr_ctloutput != NULL) {
5010                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5011                         goto out;
5012                 }
5013                 error = ENOPROTOOPT;
5014         } else {
5015                 /*
5016                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5017                  * the protocol layer, if needed.  A zero value returned from
5018                  * the handler means use default socket-level processing as
5019                  * done by the rest of this routine.  Otherwise, any other
5020                  * return value indicates that the option is unsupported.
5021                  */
5022                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5023                     pru_socheckopt(so, sopt)) != 0) {
5024                         goto out;
5025                 }
5026
5027                 error = 0;
5028                 switch (sopt->sopt_name) {
5029                 case SO_LINGER:
5030                 case SO_LINGER_SEC:
5031                         error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5032                         if (error != 0) {
5033                                 goto out;
5034                         }
5035
5036                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5037                             l.l_linger : l.l_linger * hz;
5038                         if (l.l_onoff != 0) {
5039                                 so->so_options |= SO_LINGER;
5040                         } else {
5041                                 so->so_options &= ~SO_LINGER;
5042                         }
5043                         break;
5044
5045                 case SO_DEBUG:
5046                 case SO_KEEPALIVE:
5047                 case SO_DONTROUTE:
5048                 case SO_USELOOPBACK:
5049                 case SO_BROADCAST:
5050                 case SO_REUSEADDR:
5051                 case SO_REUSEPORT:
5052                 case SO_OOBINLINE:
5053                 case SO_TIMESTAMP:
5054                 case SO_TIMESTAMP_MONOTONIC:
5055                 case SO_TIMESTAMP_CONTINUOUS:
5056                 case SO_DONTTRUNC:
5057                 case SO_WANTMORE:
5058                 case SO_WANTOOBFLAG:
5059                 case SO_NOWAKEFROMSLEEP:
5060                 case SO_NOAPNFALLBK:
5061                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5062                             sizeof(optval));
5063                         if (error != 0) {
5064                                 goto out;
5065                         }
5066                         if (optval) {
5067                                 so->so_options |= sopt->sopt_name;
5068                         } else {
5069                                 so->so_options &= ~sopt->sopt_name;
5070                         }
5071                         break;
5072
5073                 case SO_SNDBUF:
5074                 case SO_RCVBUF:
5075                 case SO_SNDLOWAT:
5076                 case SO_RCVLOWAT:
5077                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5078                             sizeof(optval));
5079                         if (error != 0) {
5080                                 goto out;
5081                         }
5082
5083                         /*
5084                          * Values < 1 make no sense for any of these
5085                          * options, so disallow them.
5086                          */
5087                         if (optval < 1) {
5088                                 error = EINVAL;
5089                                 goto out;
5090                         }
5091
5092                         switch (sopt->sopt_name) {
5093                         case SO_SNDBUF:
5094                         case SO_RCVBUF: {
5095                                 struct sockbuf *sb =
5096                                     (sopt->sopt_name == SO_SNDBUF) ?
5097                                     &so->so_snd : &so->so_rcv;
5098                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5099                                         error = ENOBUFS;
5100                                         goto out;
5101                                 }
5102                                 sb->sb_flags |= SB_USRSIZE;
5103                                 sb->sb_flags &= ~SB_AUTOSIZE;
5104                                 sb->sb_idealsize = (u_int32_t)optval;
5105                                 break;
5106                         }
5107                         /*
5108                          * Make sure the low-water is never greater than
5109                          * the high-water.
5110                          */
5111                         case SO_SNDLOWAT: {
5112                                 int space = sbspace(&so->so_snd);
5113                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
5114
5115                                 if (so->so_snd.sb_flags & SB_UNIX) {
5116                                         struct unpcb *unp =
5117                                             (struct unpcb *)(so->so_pcb);
5118                                         if (unp != NULL &&
5119                                             unp->unp_conn != NULL) {
5120                                                 hiwat += unp->unp_conn->unp_cc;
5121                                         }
5122                                 }
5123
5124                                 so->so_snd.sb_lowat =
5125                                     (optval > hiwat) ?
5126                                     hiwat : optval;
5127
5128                                 if (space >= so->so_snd.sb_lowat) {
5129                                         sowwakeup(so);
5130                                 }
5131                                 break;
5132                         }
5133                         case SO_RCVLOWAT: {
5134                                 int64_t data_len;
5135                                 so->so_rcv.sb_lowat =
5136                                     (optval > so->so_rcv.sb_hiwat) ?
5137                                     so->so_rcv.sb_hiwat : optval;
5138                                 data_len = so->so_rcv.sb_cc
5139                                     - so->so_rcv.sb_ctl;
5140                                 if (data_len >= so->so_rcv.sb_lowat) {
5141                                         sorwakeup(so);
5142                                 }
5143                                 break;
5144                         }
5145                         }
5146                         break;
5147
5148                 case SO_SNDTIMEO:
5149                 case SO_RCVTIMEO:
5150                         error = sooptcopyin_timeval(sopt, &tv);
5151                         if (error != 0) {
5152                                 goto out;
5153                         }
5154
5155                         switch (sopt->sopt_name) {
5156                         case SO_SNDTIMEO:
5157                                 so->so_snd.sb_timeo = tv;
5158                                 break;
5159                         case SO_RCVTIMEO:
5160                                 so->so_rcv.sb_timeo = tv;
5161                                 break;
5162                         }
5163                         break;
5164
5165                 case SO_NKE: {
5166                         struct so_nke nke;
5167
5168                         error = sooptcopyin(sopt, &nke, sizeof(nke),
5169                             sizeof(nke));
5170                         if (error != 0) {
5171                                 goto out;
5172                         }
5173
5174                         error = sflt_attach_internal(so, nke.nke_handle);
5175                         break;
5176                 }
5177
5178                 case SO_NOSIGPIPE:
5179                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5180                             sizeof(optval));
5181                         if (error != 0) {
5182                                 goto out;
5183                         }
5184                         if (optval != 0) {
5185                                 so->so_flags |= SOF_NOSIGPIPE;
5186                         } else {
5187                                 so->so_flags &= ~SOF_NOSIGPIPE;
5188                         }
5189                         break;
5190
5191                 case SO_NOADDRERR:
5192                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5193                             sizeof(optval));
5194                         if (error != 0) {
5195                                 goto out;
5196                         }
5197                         if (optval != 0) {
5198                                 so->so_flags |= SOF_NOADDRAVAIL;
5199                         } else {
5200                                 so->so_flags &= ~SOF_NOADDRAVAIL;
5201                         }
5202                         break;
5203
5204                 case SO_REUSESHAREUID:
5205                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5206                             sizeof(optval));
5207                         if (error != 0) {
5208                                 goto out;
5209                         }
5210                         if (optval != 0) {
5211                                 so->so_flags |= SOF_REUSESHAREUID;
5212                         } else {
5213                                 so->so_flags &= ~SOF_REUSESHAREUID;
5214                         }
5215                         break;
5216
5217                 case SO_NOTIFYCONFLICT:
5218                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5219                                 error = EPERM;
5220                                 goto out;
5221                         }
5222                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5223                             sizeof(optval));
5224                         if (error != 0) {
5225                                 goto out;
5226                         }
5227                         if (optval != 0) {
5228                                 so->so_flags |= SOF_NOTIFYCONFLICT;
5229                         } else {
5230                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5231                         }
5232                         break;
5233
5234                 case SO_RESTRICTIONS:
5235                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5236                             sizeof(optval));
5237                         if (error != 0) {
5238                                 goto out;
5239                         }
5240
5241                         error = so_set_restrictions(so, optval);
5242                         break;
5243
5244                 case SO_AWDL_UNRESTRICTED:
5245                         if (SOCK_DOM(so) != PF_INET &&
5246                             SOCK_DOM(so) != PF_INET6) {
5247                                 error = EOPNOTSUPP;
5248                                 goto out;
5249                         }
5250                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5251                             sizeof(optval));
5252                         if (error != 0) {
5253                                 goto out;
5254                         }
5255                         if (optval != 0) {
5256                                 error = soopt_cred_check(so,
5257                                     PRIV_NET_RESTRICTED_AWDL, false, false);
5258                                 if (error == 0) {
5259                                         inp_set_awdl_unrestricted(
5260                                                 sotoinpcb(so));
5261                                 }
5262                         } else {
5263                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
5264                         }
5265                         break;
5266                 case SO_INTCOPROC_ALLOW:
5267                         if (SOCK_DOM(so) != PF_INET6) {
5268                                 error = EOPNOTSUPP;
5269                                 goto out;
5270                         }
5271                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5272                             sizeof(optval));
5273                         if (error != 0) {
5274                                 goto out;
5275                         }
5276                         if (optval != 0 &&
5277                             inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5278                                 error = soopt_cred_check(so,
5279                                     PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5280                                 if (error == 0) {
5281                                         inp_set_intcoproc_allowed(
5282                                                 sotoinpcb(so));
5283                                 }
5284                         } else if (optval == 0) {
5285                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
5286                         }
5287                         break;
5288
5289                 case SO_LABEL:
5290 #if CONFIG_MACF_SOCKET
5291                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5292                             sizeof(extmac))) != 0) {
5293                                 goto out;
5294                         }
5295
5296                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5297                             so, &extmac);
5298 #else
5299                         error = EOPNOTSUPP;
5300 #endif /* MAC_SOCKET */
5301                         break;
5302
5303                 case SO_UPCALLCLOSEWAIT:
5304                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5305                             sizeof(optval));
5306                         if (error != 0) {
5307                                 goto out;
5308                         }
5309                         if (optval != 0) {
5310                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5311                         } else {
5312                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5313                         }
5314                         break;
5315
5316                 case SO_RANDOMPORT:
5317                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5318                             sizeof(optval));
5319                         if (error != 0) {
5320                                 goto out;
5321                         }
5322                         if (optval != 0) {
5323                                 so->so_flags |= SOF_BINDRANDOMPORT;
5324                         } else {
5325                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
5326                         }
5327                         break;
5328
5329                 case SO_NP_EXTENSIONS: {
5330                         struct so_np_extensions sonpx;
5331
5332                         error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5333                             sizeof(sonpx));
5334                         if (error != 0) {
5335                                 goto out;
5336                         }
5337                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5338                                 error = EINVAL;
5339                                 goto out;
5340                         }
5341                         /*
5342                          * Only one bit defined for now
5343                          */
5344                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5345                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5346                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
5347                                 } else {
5348                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5349                                 }
5350                         }
5351                         break;
5352                 }
5353
5354                 case SO_TRAFFIC_CLASS: {
5355                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5356                             sizeof(optval));
5357                         if (error != 0) {
5358                                 goto out;
5359                         }
5360                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5361                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5362                                 error = so_set_net_service_type(so, netsvc);
5363                                 goto out;
5364                         }
5365                         error = so_set_traffic_class(so, optval);
5366                         if (error != 0) {
5367                                 goto out;
5368                         }
5369                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5370                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5371                         break;
5372                 }
5373
5374                 case SO_RECV_TRAFFIC_CLASS: {
5375                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5376                             sizeof(optval));
5377                         if (error != 0) {
5378                                 goto out;
5379                         }
5380                         if (optval == 0) {
5381                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5382                         } else {
5383                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5384                         }
5385                         break;
5386                 }
5387
5388 #if (DEVELOPMENT || DEBUG)
5389                 case SO_TRAFFIC_CLASS_DBG: {
5390                         struct so_tcdbg so_tcdbg;
5391
5392                         error = sooptcopyin(sopt, &so_tcdbg,
5393                             sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5394                         if (error != 0) {
5395                                 goto out;
5396                         }
5397                         error = so_set_tcdbg(so, &so_tcdbg);
5398                         if (error != 0) {
5399                                 goto out;
5400                         }
5401                         break;
5402                 }
5403 #endif /* (DEVELOPMENT || DEBUG) */
5404
5405                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5406                         error = priv_check_cred(kauth_cred_get(),
5407                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5408                         if (error != 0) {
5409                                 goto out;
5410                         }
5411                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5412                             sizeof(optval));
5413                         if (error != 0) {
5414                                 goto out;
5415                         }
5416                         if (optval == 0) {
5417                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5418                         } else {
5419                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5420                         }
5421                         break;
5422
5423 #if (DEVELOPMENT || DEBUG)
5424                 case SO_DEFUNCTIT:
5425                         error = sosetdefunct(current_proc(), so, 0, FALSE);
5426                         if (error == 0) {
5427                                 error = sodefunct(current_proc(), so, 0);
5428                         }
5429
5430                         break;
5431 #endif /* (DEVELOPMENT || DEBUG) */
5432
5433                 case SO_DEFUNCTOK:
5434                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5435                             sizeof(optval));
5436                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5437                                 if (error == 0) {
5438                                         error = EBADF;
5439                                 }
5440                                 goto out;
5441                         }
5442                         /*
5443                          * Any process can set SO_DEFUNCTOK (clear
5444                          * SOF_NODEFUNCT), but only root can clear
5445                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5446                          */
5447                         if (optval == 0 &&
5448                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5449                                 error = EPERM;
5450                                 goto out;
5451                         }
5452                         if (optval) {
5453                                 so->so_flags &= ~SOF_NODEFUNCT;
5454                         } else {
5455                                 so->so_flags |= SOF_NODEFUNCT;
5456                         }
5457
5458                         if (SOCK_DOM(so) == PF_INET ||
5459                             SOCK_DOM(so) == PF_INET6) {
5460                                 char s[MAX_IPv6_STR_LEN];
5461                                 char d[MAX_IPv6_STR_LEN];
5462                                 struct inpcb *inp = sotoinpcb(so);
5463
5464                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5465                                     "[%s %s:%d -> %s:%d] is now marked "
5466                                     "as %seligible for "
5467                                     "defunct\n", __func__, proc_selfpid(),
5468                                     proc_best_name(current_proc()),
5469                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5470                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5471                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5472                                     ((SOCK_DOM(so) == PF_INET) ?
5473                                     (void *)&inp->inp_laddr.s_addr :
5474                                     (void *)&inp->in6p_laddr), s, sizeof(s)),
5475                                     ntohs(inp->in6p_lport),
5476                                     inet_ntop(SOCK_DOM(so),
5477                                     (SOCK_DOM(so) == PF_INET) ?
5478                                     (void *)&inp->inp_faddr.s_addr :
5479                                     (void *)&inp->in6p_faddr, d, sizeof(d)),
5480                                     ntohs(inp->in6p_fport),
5481                                     (so->so_flags & SOF_NODEFUNCT) ?
5482                                     "not " : "");
5483                         } else {
5484                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5485                                     "is now marked as %seligible for "
5486                                     "defunct\n",
5487                                     __func__, proc_selfpid(),
5488                                     proc_best_name(current_proc()),
5489                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5490                                     SOCK_DOM(so), SOCK_TYPE(so),
5491                                     (so->so_flags & SOF_NODEFUNCT) ?
5492                                     "not " : "");
5493                         }
5494                         break;
5495
5496                 case SO_ISDEFUNCT:
5497                         /* This option is not settable */
5498                         error = EINVAL;
5499                         break;
5500
5501                 case SO_OPPORTUNISTIC:
5502                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5503                             sizeof(optval));
5504                         if (error == 0) {
5505                                 error = so_set_opportunistic(so, optval);
5506                         }
5507                         break;
5508
5509                 case SO_FLUSH:
5510                         /* This option is handled by lower layer(s) */
5511                         error = 0;
5512                         break;
5513
5514                 case SO_RECV_ANYIF:
5515                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5516                             sizeof(optval));
5517                         if (error == 0) {
5518                                 error = so_set_recv_anyif(so, optval);
5519                         }
5520                         break;
5521
5522                 case SO_TRAFFIC_MGT_BACKGROUND: {
5523                         /* This option is handled by lower layer(s) */
5524                         error = 0;
5525                         break;
5526                 }
5527
5528 #if FLOW_DIVERT
5529                 case SO_FLOW_DIVERT_TOKEN:
5530                         error = flow_divert_token_set(so, sopt);
5531                         break;
5532 #endif  /* FLOW_DIVERT */
5533
5534
5535                 case SO_DELEGATED:
5536                         if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5537                             sizeof(optval))) != 0) {
5538                                 break;
5539                         }
5540
5541                         error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5542                         break;
5543
5544                 case SO_DELEGATED_UUID: {
5545                         uuid_t euuid;
5546
5547                         if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5548                             sizeof(euuid))) != 0) {
5549                                 break;
5550                         }
5551
5552                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5553                         break;
5554                 }
5555
5556 #if NECP
5557                 case SO_NECP_ATTRIBUTES:
5558                         error = necp_set_socket_attributes(so, sopt);
5559                         break;
5560
5561                 case SO_NECP_CLIENTUUID: {
5562                         if (SOCK_DOM(so) == PF_MULTIPATH) {
5563                                 /* Handled by MPTCP itself */
5564                                 break;
5565                         }
5566
5567                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5568                                 error = EINVAL;
5569                                 goto out;
5570                         }
5571
5572                         struct inpcb *inp = sotoinpcb(so);
5573                         if (!uuid_is_null(inp->necp_client_uuid)) {
5574                                 // Clear out the old client UUID if present
5575                                 necp_inpcb_remove_cb(inp);
5576                         }
5577
5578                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5579                             sizeof(uuid_t), sizeof(uuid_t));
5580                         if (error != 0) {
5581                                 goto out;
5582                         }
5583
5584                         if (uuid_is_null(inp->necp_client_uuid)) {
5585                                 error = EINVAL;
5586                                 goto out;
5587                         }
5588
5589                         pid_t current_pid = proc_pid(current_proc());
5590                         error = necp_client_register_socket_flow(current_pid,
5591                             inp->necp_client_uuid, inp);
5592                         if (error != 0) {
5593                                 uuid_clear(inp->necp_client_uuid);
5594                                 goto out;
5595                         }
5596
5597                         if (inp->inp_lport != 0) {
5598                                 // There is a bound local port, so this is not
5599                                 // a fresh socket. Assign to the client.
5600                                 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5601                         }
5602
5603                         break;
5604                 }
5605                 case SO_NECP_LISTENUUID: {
5606                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5607                                 error = EINVAL;
5608                                 goto out;
5609                         }
5610
5611                         struct inpcb *inp = sotoinpcb(so);
5612                         if (!uuid_is_null(inp->necp_client_uuid)) {
5613                                 error = EINVAL;
5614                                 goto out;
5615                         }
5616
5617                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5618                             sizeof(uuid_t), sizeof(uuid_t));
5619                         if (error != 0) {
5620                                 goto out;
5621                         }
5622
5623                         if (uuid_is_null(inp->necp_client_uuid)) {
5624                                 error = EINVAL;
5625                                 goto out;
5626                         }
5627
5628                         error = necp_client_register_socket_listener(proc_pid(current_proc()),
5629                             inp->necp_client_uuid, inp);
5630                         if (error != 0) {
5631                                 uuid_clear(inp->necp_client_uuid);
5632                                 goto out;
5633                         }
5634
5635                         // Mark that the port registration is held by NECP
5636                         inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5637
5638                         break;
5639                 }
5640 #endif /* NECP */
5641
5642                 case SO_EXTENDED_BK_IDLE:
5643                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5644                             sizeof(optval));
5645                         if (error == 0) {
5646                                 error = so_set_extended_bk_idle(so, optval);
5647                         }
5648                         break;
5649
5650                 case SO_MARK_CELLFALLBACK:
5651                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5652                             sizeof(optval));
5653                         if (error != 0) {
5654                                 goto out;
5655                         }
5656                         if (optval < 0) {
5657                                 error = EINVAL;
5658                                 goto out;
5659                         }
5660                         if (optval == 0) {
5661                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5662                         } else {
5663                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5664                         }
5665                         break;
5666
5667                 case SO_STATISTICS_EVENT:
5668                         error = sooptcopyin(sopt, &long_optval,
5669                             sizeof(long_optval), sizeof(long_optval));
5670                         if (error != 0) {
5671                                 goto out;
5672                         }
5673                         u_int64_t nstat_event = 0;
5674                         error = so_statistics_event_to_nstat_event(
5675                                 &long_optval, &nstat_event);
5676                         if (error != 0) {
5677                                 goto out;
5678                         }
5679                         nstat_pcb_event(sotoinpcb(so), nstat_event);
5680                         break;
5681
5682                 case SO_NET_SERVICE_TYPE: {
5683                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5684                             sizeof(optval));
5685                         if (error != 0) {
5686                                 goto out;
5687                         }
5688                         error = so_set_net_service_type(so, optval);
5689                         break;
5690                 }
5691
5692                 case SO_QOSMARKING_POLICY_OVERRIDE:
5693                         error = priv_check_cred(kauth_cred_get(),
5694                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5695                         if (error != 0) {
5696                                 goto out;
5697                         }
5698                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5699                             sizeof(optval));
5700                         if (error != 0) {
5701                                 goto out;
5702                         }
5703                         if (optval == 0) {
5704                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5705                         } else {
5706                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5707                         }
5708                         break;
5709
5710                 case SO_MPKL_SEND_INFO: {
5711                         struct so_mpkl_send_info so_mpkl_send_info;
5712
5713                         error = sooptcopyin(sopt, &so_mpkl_send_info,
5714                             sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5715                         if (error != 0) {
5716                                 goto out;
5717                         }
5718                         uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5719                         so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5720
5721                         if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5722                                 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5723                         } else {
5724                                 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5725                         }
5726                         break;
5727                 }
5728                 default:
5729                         error = ENOPROTOOPT;
5730                         break;
5731                 }
5732                 if (error == 0 && so->so_proto != NULL &&
5733                     so->so_proto->pr_ctloutput != NULL) {
5734                         (void) so->so_proto->pr_ctloutput(so, sopt);
5735                 }
5736         }
5737 out:
5738         if (dolock) {
5739                 socket_unlock(so, 1);
5740         }
5741         return error;
5742 }
5743
5744 /* Helper routines for getsockopt */
5745 int
5746 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5747 {
5748         int     error;
5749         size_t  valsize;
5750
5751         error = 0;
5752
5753         /*
5754          * Documented get behavior is that we always return a value,
5755          * possibly truncated to fit in the user's buffer.
5756          * Traditional behavior is that we always tell the user
5757          * precisely how much we copied, rather than something useful
5758          * like the total amount we had available for her.
5759          * Note that this interface is not idempotent; the entire answer must
5760          * generated ahead of time.
5761          */
5762         valsize = min(len, sopt->sopt_valsize);
5763         sopt->sopt_valsize = valsize;
5764         if (sopt->sopt_val != USER_ADDR_NULL) {
5765                 if (sopt->sopt_p != kernproc) {
5766                         error = copyout(buf, sopt->sopt_val, valsize);
5767                 } else {
5768                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5769                 }
5770         }
5771         return error;
5772 }
5773
5774 static int
5775 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5776 {
5777         int                     error;
5778         size_t                  len;
5779         struct user64_timeval   tv64 = {};
5780         struct user32_timeval   tv32 = {};
5781         const void *            val;
5782         size_t                  valsize;
5783
5784         error = 0;
5785         if (proc_is64bit(sopt->sopt_p)) {
5786                 len = sizeof(tv64);
5787                 tv64.tv_sec = tv_p->tv_sec;
5788                 tv64.tv_usec = tv_p->tv_usec;
5789                 val = &tv64;
5790         } else {
5791                 len = sizeof(tv32);
5792                 tv32.tv_sec = tv_p->tv_sec;
5793                 tv32.tv_usec = tv_p->tv_usec;
5794                 val = &tv32;
5795         }
5796         valsize = min(len, sopt->sopt_valsize);
5797         sopt->sopt_valsize = valsize;
5798         if (sopt->sopt_val != USER_ADDR_NULL) {
5799                 if (sopt->sopt_p != kernproc) {
5800                         error = copyout(val, sopt->sopt_val, valsize);
5801                 } else {
5802                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5803                 }
5804         }
5805         return error;
5806 }
5807
5808 /*
5809  * Return:      0                       Success
5810  *              ENOPROTOOPT
5811  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5812  *      <pr_ctloutput>:???
5813  *      <sf_getoption>:???
5814  */
5815 int
5816 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5817 {
5818         int     error, optval;
5819         struct  linger l;
5820         struct  timeval tv;
5821 #if CONFIG_MACF_SOCKET
5822         struct mac extmac;
5823 #endif /* MAC_SOCKET */
5824
5825         if (sopt->sopt_dir != SOPT_GET) {
5826                 sopt->sopt_dir = SOPT_GET;
5827         }
5828
5829         if (dolock) {
5830                 socket_lock(so, 1);
5831         }
5832
5833         error = sflt_getsockopt(so, sopt);
5834         if (error != 0) {
5835                 if (error == EJUSTRETURN) {
5836                         error = 0;
5837                 }
5838                 goto out;
5839         }
5840
5841         if (sopt->sopt_level != SOL_SOCKET) {
5842                 if (so->so_proto != NULL &&
5843                     so->so_proto->pr_ctloutput != NULL) {
5844                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5845                         goto out;
5846                 }
5847                 error = ENOPROTOOPT;
5848         } else {
5849                 /*
5850                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5851                  * the protocol layer, if needed.  A zero value returned from
5852                  * the handler means use default socket-level processing as
5853                  * done by the rest of this routine.  Otherwise, any other
5854                  * return value indicates that the option is unsupported.
5855                  */
5856                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5857                     pru_socheckopt(so, sopt)) != 0) {
5858                         goto out;
5859                 }
5860
5861                 error = 0;
5862                 switch (sopt->sopt_name) {
5863                 case SO_LINGER:
5864                 case SO_LINGER_SEC:
5865                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5866                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5867                             so->so_linger : so->so_linger / hz;
5868                         error = sooptcopyout(sopt, &l, sizeof(l));
5869                         break;
5870
5871                 case SO_USELOOPBACK:
5872                 case SO_DONTROUTE:
5873                 case SO_DEBUG:
5874                 case SO_KEEPALIVE:
5875                 case SO_REUSEADDR:
5876                 case SO_REUSEPORT:
5877                 case SO_BROADCAST:
5878                 case SO_OOBINLINE:
5879                 case SO_TIMESTAMP:
5880                 case SO_TIMESTAMP_MONOTONIC:
5881                 case SO_TIMESTAMP_CONTINUOUS:
5882                 case SO_DONTTRUNC:
5883                 case SO_WANTMORE:
5884                 case SO_WANTOOBFLAG:
5885                 case SO_NOWAKEFROMSLEEP:
5886                 case SO_NOAPNFALLBK:
5887                         optval = so->so_options & sopt->sopt_name;
5888 integer:
5889                         error = sooptcopyout(sopt, &optval, sizeof(optval));
5890                         break;
5891
5892                 case SO_TYPE:
5893                         optval = so->so_type;
5894                         goto integer;
5895
5896                 case SO_NREAD:
5897                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5898                                 int pkt_total;
5899                                 struct mbuf *m1;
5900
5901                                 pkt_total = 0;
5902                                 m1 = so->so_rcv.sb_mb;
5903                                 while (m1 != NULL) {
5904                                         if (m1->m_type == MT_DATA ||
5905                                             m1->m_type == MT_HEADER ||
5906                                             m1->m_type == MT_OOBDATA) {
5907                                                 pkt_total += m1->m_len;
5908                                         }
5909                                         m1 = m1->m_next;
5910                                 }
5911                                 optval = pkt_total;
5912                         } else {
5913                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5914                         }
5915                         goto integer;
5916
5917                 case SO_NUMRCVPKT:
5918                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5919                                 int cnt = 0;
5920                                 struct mbuf *m1;
5921
5922                                 m1 = so->so_rcv.sb_mb;
5923                                 while (m1 != NULL) {
5924                                         cnt += 1;
5925                                         m1 = m1->m_nextpkt;
5926                                 }
5927                                 optval = cnt;
5928                                 goto integer;
5929                         } else {
5930                                 error = ENOPROTOOPT;
5931                                 break;
5932                         }
5933
5934                 case SO_NWRITE:
5935                         optval = so->so_snd.sb_cc;
5936                         goto integer;
5937
5938                 case SO_ERROR:
5939                         optval = so->so_error;
5940                         so->so_error = 0;
5941                         goto integer;
5942
5943                 case SO_SNDBUF: {
5944                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5945
5946                         if (so->so_snd.sb_flags & SB_UNIX) {
5947                                 struct unpcb *unp =
5948                                     (struct unpcb *)(so->so_pcb);
5949                                 if (unp != NULL && unp->unp_conn != NULL) {
5950                                         hiwat += unp->unp_conn->unp_cc;
5951                                 }
5952                         }
5953
5954                         optval = hiwat;
5955                         goto integer;
5956                 }
5957                 case SO_RCVBUF:
5958                         optval = so->so_rcv.sb_hiwat;
5959                         goto integer;
5960
5961                 case SO_SNDLOWAT:
5962                         optval = so->so_snd.sb_lowat;
5963                         goto integer;
5964
5965                 case SO_RCVLOWAT:
5966                         optval = so->so_rcv.sb_lowat;
5967                         goto integer;
5968
5969                 case SO_SNDTIMEO:
5970                 case SO_RCVTIMEO:
5971                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5972                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5973
5974                         error = sooptcopyout_timeval(sopt, &tv);
5975                         break;
5976
5977                 case SO_NOSIGPIPE:
5978                         optval = (so->so_flags & SOF_NOSIGPIPE);
5979                         goto integer;
5980
5981                 case SO_NOADDRERR:
5982                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5983                         goto integer;
5984
5985                 case SO_REUSESHAREUID:
5986                         optval = (so->so_flags & SOF_REUSESHAREUID);
5987                         goto integer;
5988
5989
5990                 case SO_NOTIFYCONFLICT:
5991                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5992                         goto integer;
5993
5994                 case SO_RESTRICTIONS:
5995                         optval = so_get_restrictions(so);
5996                         goto integer;
5997
5998                 case SO_AWDL_UNRESTRICTED:
5999                         if (SOCK_DOM(so) == PF_INET ||
6000                             SOCK_DOM(so) == PF_INET6) {
6001                                 optval = inp_get_awdl_unrestricted(
6002                                         sotoinpcb(so));
6003                                 goto integer;
6004                         } else {
6005                                 error = EOPNOTSUPP;
6006                         }
6007                         break;
6008
6009                 case SO_INTCOPROC_ALLOW:
6010                         if (SOCK_DOM(so) == PF_INET6) {
6011                                 optval = inp_get_intcoproc_allowed(
6012                                         sotoinpcb(so));
6013                                 goto integer;
6014                         } else {
6015                                 error = EOPNOTSUPP;
6016                         }
6017                         break;
6018
6019                 case SO_LABEL:
6020 #if CONFIG_MACF_SOCKET
6021                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6022                             sizeof(extmac))) != 0 ||
6023                             (error = mac_socket_label_get(proc_ucred(
6024                                     sopt->sopt_p), so, &extmac)) != 0) {
6025                                 break;
6026                         }
6027
6028                         error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6029 #else
6030                         error = EOPNOTSUPP;
6031 #endif /* MAC_SOCKET */
6032                         break;
6033
6034                 case SO_PEERLABEL:
6035 #if CONFIG_MACF_SOCKET
6036                         if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6037                             sizeof(extmac))) != 0 ||
6038                             (error = mac_socketpeer_label_get(proc_ucred(
6039                                     sopt->sopt_p), so, &extmac)) != 0) {
6040                                 break;
6041                         }
6042
6043                         error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6044 #else
6045                         error = EOPNOTSUPP;
6046 #endif /* MAC_SOCKET */
6047                         break;
6048
6049 #ifdef __APPLE_API_PRIVATE
6050                 case SO_UPCALLCLOSEWAIT:
6051                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6052                         goto integer;
6053 #endif
6054                 case SO_RANDOMPORT:
6055                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
6056                         goto integer;
6057
6058                 case SO_NP_EXTENSIONS: {
6059                         struct so_np_extensions sonpx = {};
6060
6061                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6062                             SONPX_SETOPTSHUT : 0;
6063                         sonpx.npx_mask = SONPX_MASK_VALID;
6064
6065                         error = sooptcopyout(sopt, &sonpx,
6066                             sizeof(struct so_np_extensions));
6067                         break;
6068                 }
6069
6070                 case SO_TRAFFIC_CLASS:
6071                         optval = so->so_traffic_class;
6072                         goto integer;
6073
6074                 case SO_RECV_TRAFFIC_CLASS:
6075                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6076                         goto integer;
6077
6078                 case SO_TRAFFIC_CLASS_STATS:
6079                         error = sooptcopyout(sopt, &so->so_tc_stats,
6080                             sizeof(so->so_tc_stats));
6081                         break;
6082
6083 #if (DEVELOPMENT || DEBUG)
6084                 case SO_TRAFFIC_CLASS_DBG:
6085                         error = sogetopt_tcdbg(so, sopt);
6086                         break;
6087 #endif /* (DEVELOPMENT || DEBUG) */
6088
6089                 case SO_PRIVILEGED_TRAFFIC_CLASS:
6090                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6091                         goto integer;
6092
6093                 case SO_DEFUNCTOK:
6094                         optval = !(so->so_flags & SOF_NODEFUNCT);
6095                         goto integer;
6096
6097                 case SO_ISDEFUNCT:
6098                         optval = (so->so_flags & SOF_DEFUNCT);
6099                         goto integer;
6100
6101                 case SO_OPPORTUNISTIC:
6102                         optval = so_get_opportunistic(so);
6103                         goto integer;
6104
6105                 case SO_FLUSH:
6106                         /* This option is not gettable */
6107                         error = EINVAL;
6108                         break;
6109
6110                 case SO_RECV_ANYIF:
6111                         optval = so_get_recv_anyif(so);
6112                         goto integer;
6113
6114                 case SO_TRAFFIC_MGT_BACKGROUND:
6115                         /* This option is handled by lower layer(s) */
6116                         if (so->so_proto != NULL &&
6117                             so->so_proto->pr_ctloutput != NULL) {
6118                                 (void) so->so_proto->pr_ctloutput(so, sopt);
6119                         }
6120                         break;
6121
6122 #if FLOW_DIVERT
6123                 case SO_FLOW_DIVERT_TOKEN:
6124                         error = flow_divert_token_get(so, sopt);
6125                         break;
6126 #endif  /* FLOW_DIVERT */
6127
6128 #if NECP
6129                 case SO_NECP_ATTRIBUTES:
6130                         error = necp_get_socket_attributes(so, sopt);
6131                         break;
6132
6133                 case SO_NECP_CLIENTUUID: {
6134                         uuid_t *ncu;
6135
6136                         if (SOCK_DOM(so) == PF_MULTIPATH) {
6137                                 ncu = &mpsotomppcb(so)->necp_client_uuid;
6138                         } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6139                                 ncu = &sotoinpcb(so)->necp_client_uuid;
6140                         } else {
6141                                 error = EINVAL;
6142                                 goto out;
6143                         }
6144
6145                         error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6146                         break;
6147                 }
6148
6149                 case SO_NECP_LISTENUUID: {
6150                         uuid_t *nlu;
6151
6152                         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6153                                 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6154                                         nlu = &sotoinpcb(so)->necp_client_uuid;
6155                                 } else {
6156                                         error = ENOENT;
6157                                         goto out;
6158                                 }
6159                         } else {
6160                                 error = EINVAL;
6161                                 goto out;
6162                         }
6163
6164                         error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6165                         break;
6166                 }
6167 #endif /* NECP */
6168
6169 #if CONTENT_FILTER
6170                 case SO_CFIL_SOCK_ID: {
6171                         cfil_sock_id_t sock_id;
6172
6173                         sock_id = cfil_sock_id_from_socket(so);
6174
6175                         error = sooptcopyout(sopt, &sock_id,
6176                             sizeof(cfil_sock_id_t));
6177                         break;
6178                 }
6179 #endif  /* CONTENT_FILTER */
6180
6181                 case SO_EXTENDED_BK_IDLE:
6182                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6183                         goto integer;
6184                 case SO_MARK_CELLFALLBACK:
6185                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6186                             ? 1 : 0;
6187                         goto integer;
6188                 case SO_NET_SERVICE_TYPE: {
6189                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6190                                 optval = so->so_netsvctype;
6191                         } else {
6192                                 optval = NET_SERVICE_TYPE_BE;
6193                         }
6194                         goto integer;
6195                 }
6196                 case SO_NETSVC_MARKING_LEVEL:
6197                         optval = so_get_netsvc_marking_level(so);
6198                         goto integer;
6199
6200                 case SO_MPKL_SEND_INFO: {
6201                         struct so_mpkl_send_info so_mpkl_send_info;
6202
6203                         uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6204                         so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6205                         error = sooptcopyout(sopt, &so_mpkl_send_info,
6206                             sizeof(struct so_mpkl_send_info));
6207                         break;
6208                 }
6209                 default:
6210                         error = ENOPROTOOPT;
6211                         break;
6212                 }
6213         }
6214 out:
6215         if (dolock) {
6216                 socket_unlock(so, 1);
6217         }
6218         return error;
6219 }
6220
6221 /*
6222  * The size limits on our soopt_getm is different from that on FreeBSD.
6223  * We limit the size of options to MCLBYTES. This will have to change
6224  * if we need to define options that need more space than MCLBYTES.
6225  */
6226 int
6227 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6228 {
6229         struct mbuf *m, *m_prev;
6230         int sopt_size = sopt->sopt_valsize;
6231         int how;
6232
6233         if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6234                 return EMSGSIZE;
6235         }
6236
6237         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6238         MGET(m, how, MT_DATA);
6239         if (m == NULL) {
6240                 return ENOBUFS;
6241         }
6242         if (sopt_size > MLEN) {
6243                 MCLGET(m, how);
6244                 if ((m->m_flags & M_EXT) == 0) {
6245                         m_free(m);
6246                         return ENOBUFS;
6247                 }
6248                 m->m_len = min(MCLBYTES, sopt_size);
6249         } else {
6250                 m->m_len = min(MLEN, sopt_size);
6251         }
6252         sopt_size -= m->m_len;
6253         *mp = m;
6254         m_prev = m;
6255
6256         while (sopt_size > 0) {
6257                 MGET(m, how, MT_DATA);
6258                 if (m == NULL) {
6259                         m_freem(*mp);
6260                         return ENOBUFS;
6261                 }
6262                 if (sopt_size > MLEN) {
6263                         MCLGET(m, how);
6264                         if ((m->m_flags & M_EXT) == 0) {
6265                                 m_freem(*mp);
6266                                 m_freem(m);
6267                                 return ENOBUFS;
6268                         }
6269                         m->m_len = min(MCLBYTES, sopt_size);
6270                 } else {
6271                         m->m_len = min(MLEN, sopt_size);
6272                 }
6273                 sopt_size -= m->m_len;
6274                 m_prev->m_next = m;
6275                 m_prev = m;
6276         }
6277         return 0;
6278 }
6279
6280 /* copyin sopt data into mbuf chain */
6281 int
6282 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6283 {
6284         struct mbuf *m0 = m;
6285
6286         if (sopt->sopt_val == USER_ADDR_NULL) {
6287                 return 0;
6288         }
6289         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6290                 if (sopt->sopt_p != kernproc) {
6291                         int error;
6292
6293                         error = copyin(sopt->sopt_val, mtod(m, char *),
6294                             m->m_len);
6295                         if (error != 0) {
6296                                 m_freem(m0);
6297                                 return error;
6298                         }
6299                 } else {
6300                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6301                             mtod(m, char *), m->m_len);
6302                 }
6303                 sopt->sopt_valsize -= m->m_len;
6304                 sopt->sopt_val += m->m_len;
6305                 m = m->m_next;
6306         }
6307         /* should be allocated enoughly at ip6_sooptmcopyin() */
6308         if (m != NULL) {
6309                 panic("soopt_mcopyin");
6310                 /* NOTREACHED */
6311         }
6312         return 0;
6313 }
6314
6315 /* copyout mbuf chain data into soopt */
6316 int
6317 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6318 {
6319         struct mbuf *m0 = m;
6320         size_t valsize = 0;
6321
6322         if (sopt->sopt_val == USER_ADDR_NULL) {
6323                 return 0;
6324         }
6325         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6326                 if (sopt->sopt_p != kernproc) {
6327                         int error;
6328
6329                         error = copyout(mtod(m, char *), sopt->sopt_val,
6330                             m->m_len);
6331                         if (error != 0) {
6332                                 m_freem(m0);
6333                                 return error;
6334                         }
6335                 } else {
6336                         bcopy(mtod(m, char *),
6337                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6338                 }
6339                 sopt->sopt_valsize -= m->m_len;
6340                 sopt->sopt_val += m->m_len;
6341                 valsize += m->m_len;
6342                 m = m->m_next;
6343         }
6344         if (m != NULL) {
6345                 /* enough soopt buffer should be given from user-land */
6346                 m_freem(m0);
6347                 return EINVAL;
6348         }
6349         sopt->sopt_valsize = valsize;
6350         return 0;
6351 }
6352
6353 void
6354 sohasoutofband(struct socket *so)
6355 {
6356         if (so->so_pgid < 0) {
6357                 gsignal(-so->so_pgid, SIGURG);
6358         } else if (so->so_pgid > 0) {
6359                 proc_signal(so->so_pgid, SIGURG);
6360         }
6361         selwakeup(&so->so_rcv.sb_sel);
6362         if (so->so_rcv.sb_flags & SB_KNOTE) {
6363                 KNOTE(&so->so_rcv.sb_sel.si_note,
6364                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
6365         }
6366 }
6367
6368 int
6369 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6370 {
6371 #pragma unused(cred)
6372         struct proc *p = current_proc();
6373         int revents = 0;
6374
6375         socket_lock(so, 1);
6376         so_update_last_owner_locked(so, PROC_NULL);
6377         so_update_policy(so);
6378
6379         if (events & (POLLIN | POLLRDNORM)) {
6380                 if (soreadable(so)) {
6381                         revents |= events & (POLLIN | POLLRDNORM);
6382                 }
6383         }
6384
6385         if (events & (POLLOUT | POLLWRNORM)) {
6386                 if (sowriteable(so)) {
6387                         revents |= events & (POLLOUT | POLLWRNORM);
6388                 }
6389         }
6390
6391         if (events & (POLLPRI | POLLRDBAND)) {
6392                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6393                         revents |= events & (POLLPRI | POLLRDBAND);
6394                 }
6395         }
6396
6397         if (revents == 0) {
6398                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6399                         /*
6400                          * Darwin sets the flag first,
6401                          * BSD calls selrecord first
6402                          */
6403                         so->so_rcv.sb_flags |= SB_SEL;
6404                         selrecord(p, &so->so_rcv.sb_sel, wql);
6405                 }
6406
6407                 if (events & (POLLOUT | POLLWRNORM)) {
6408                         /*
6409                          * Darwin sets the flag first,
6410                          * BSD calls selrecord first
6411                          */
6412                         so->so_snd.sb_flags |= SB_SEL;
6413                         selrecord(p, &so->so_snd.sb_sel, wql);
6414                 }
6415         }
6416
6417         socket_unlock(so, 1);
6418         return revents;
6419 }
6420
6421 int
6422 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6423 {
6424         struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6425         int result;
6426
6427         socket_lock(so, 1);
6428         so_update_last_owner_locked(so, PROC_NULL);
6429         so_update_policy(so);
6430
6431 #if CONFIG_MACF_SOCKET
6432         proc_t p = knote_get_kq(kn)->kq_p;
6433         if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6434                 socket_unlock(so, 1);
6435                 knote_set_error(kn, EPERM);
6436                 return 0;
6437         }
6438 #endif /* MAC_SOCKET */
6439
6440         switch (kn->kn_filter) {
6441         case EVFILT_READ:
6442                 kn->kn_filtid = EVFILTID_SOREAD;
6443                 break;
6444         case EVFILT_WRITE:
6445                 kn->kn_filtid = EVFILTID_SOWRITE;
6446                 break;
6447         case EVFILT_SOCK:
6448                 kn->kn_filtid = EVFILTID_SCK;
6449                 break;
6450         case EVFILT_EXCEPT:
6451                 kn->kn_filtid = EVFILTID_SOEXCEPT;
6452                 break;
6453         default:
6454                 socket_unlock(so, 1);
6455                 knote_set_error(kn, EINVAL);
6456                 return 0;
6457         }
6458
6459         /*
6460          * call the appropriate sub-filter attach
6461          * with the socket still locked
6462          */
6463         result = knote_fops(kn)->f_attach(kn, kev);
6464
6465         socket_unlock(so, 1);
6466
6467         return result;
6468 }
6469
6470 static int
6471 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6472 {
6473         int retval = 0;
6474         int64_t data = 0;
6475
6476         if (so->so_options & SO_ACCEPTCONN) {
6477                 /*
6478                  * Radar 6615193 handle the listen case dynamically
6479                  * for kqueue read filter. This allows to call listen()
6480                  * after registering the kqueue EVFILT_READ.
6481                  */
6482
6483                 retval = !TAILQ_EMPTY(&so->so_comp);
6484                 data = so->so_qlen;
6485                 goto out;
6486         }
6487
6488         /* socket isn't a listener */
6489         /*
6490          * NOTE_LOWAT specifies new low water mark in data, i.e.
6491          * the bytes of protocol data. We therefore exclude any
6492          * control bytes.
6493          */
6494         data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6495
6496         if (kn->kn_sfflags & NOTE_OOB) {
6497                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6498                         kn->kn_fflags |= NOTE_OOB;
6499                         data -= so->so_oobmark;
6500                         retval = 1;
6501                         goto out;
6502                 }
6503         }
6504
6505         if ((so->so_state & SS_CANTRCVMORE)
6506 #if CONTENT_FILTER
6507             && cfil_sock_data_pending(&so->so_rcv) == 0
6508 #endif /* CONTENT_FILTER */
6509             ) {
6510                 kn->kn_flags |= EV_EOF;
6511                 kn->kn_fflags = so->so_error;
6512                 retval = 1;
6513                 goto out;
6514         }
6515
6516         if (so->so_error) {     /* temporary udp error */
6517                 retval = 1;
6518                 goto out;
6519         }
6520
6521         int64_t lowwat = so->so_rcv.sb_lowat;
6522         /*
6523          * Ensure that when NOTE_LOWAT is used, the derived
6524          * low water mark is bounded by socket's rcv buf's
6525          * high and low water mark values.
6526          */
6527         if (kn->kn_sfflags & NOTE_LOWAT) {
6528                 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6529                         lowwat = so->so_rcv.sb_hiwat;
6530                 } else if (kn->kn_sdata > lowwat) {
6531                         lowwat = kn->kn_sdata;
6532                 }
6533         }
6534
6535         retval = (data >= lowwat);
6536
6537 out:
6538         if (retval && kev) {
6539                 knote_fill_kevent(kn, kev, data);
6540         }
6541         return retval;
6542 }
6543
6544 static int
6545 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6546 {
6547         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6548
6549         /* socket locked */
6550
6551         /*
6552          * If the caller explicitly asked for OOB results (e.g. poll())
6553          * from EVFILT_READ, then save that off in the hookid field
6554          * and reserve the kn_flags EV_OOBAND bit for output only.
6555          */
6556         if (kn->kn_filter == EVFILT_READ &&
6557             kn->kn_flags & EV_OOBAND) {
6558                 kn->kn_flags &= ~EV_OOBAND;
6559                 kn->kn_hook32 = EV_OOBAND;
6560         } else {
6561                 kn->kn_hook32 = 0;
6562         }
6563         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6564                 so->so_rcv.sb_flags |= SB_KNOTE;
6565         }
6566
6567         /* indicate if event is already fired */
6568         return filt_soread_common(kn, NULL, so);
6569 }
6570
6571 static void
6572 filt_sordetach(struct knote *kn)
6573 {
6574         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6575
6576         socket_lock(so, 1);
6577         if (so->so_rcv.sb_flags & SB_KNOTE) {
6578                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6579                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6580                 }
6581         }
6582         socket_unlock(so, 1);
6583 }
6584
6585 /*ARGSUSED*/
6586 static int
6587 filt_soread(struct knote *kn, long hint)
6588 {
6589         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6590         int retval;
6591
6592         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6593                 socket_lock(so, 1);
6594         }
6595
6596         retval = filt_soread_common(kn, NULL, so);
6597
6598         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6599                 socket_unlock(so, 1);
6600         }
6601
6602         return retval;
6603 }
6604
6605 static int
6606 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6607 {
6608         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6609         int retval;
6610
6611         socket_lock(so, 1);
6612
6613         /* save off the new input fflags and data */
6614         kn->kn_sfflags = kev->fflags;
6615         kn->kn_sdata = kev->data;
6616
6617         /* determine if changes result in fired events */
6618         retval = filt_soread_common(kn, NULL, so);
6619
6620         socket_unlock(so, 1);
6621
6622         return retval;
6623 }
6624
6625 static int
6626 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6627 {
6628         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6629         int retval;
6630
6631         socket_lock(so, 1);
6632         retval = filt_soread_common(kn, kev, so);
6633         socket_unlock(so, 1);
6634
6635         return retval;
6636 }
6637
6638 int
6639 so_wait_for_if_feedback(struct socket *so)
6640 {
6641         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6642             (so->so_state & SS_ISCONNECTED)) {
6643                 struct inpcb *inp = sotoinpcb(so);
6644                 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6645                         return 1;
6646                 }
6647         }
6648         return 0;
6649 }
6650
6651 static int
6652 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6653 {
6654         int ret = 0;
6655         int64_t data = sbspace(&so->so_snd);
6656
6657         if (so->so_state & SS_CANTSENDMORE) {
6658                 kn->kn_flags |= EV_EOF;
6659                 kn->kn_fflags = so->so_error;
6660                 ret = 1;
6661                 goto out;
6662         }
6663
6664         if (so->so_error) {     /* temporary udp error */
6665                 ret = 1;
6666                 goto out;
6667         }
6668
6669         if (!socanwrite(so)) {
6670                 ret = 0;
6671                 goto out;
6672         }
6673
6674         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6675                 ret = 1;
6676                 goto out;
6677         }
6678
6679         int64_t lowwat = so->so_snd.sb_lowat;
6680
6681         if (kn->kn_sfflags & NOTE_LOWAT) {
6682                 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6683                         lowwat = so->so_snd.sb_hiwat;
6684                 } else if (kn->kn_sdata > lowwat) {
6685                         lowwat = kn->kn_sdata;
6686                 }
6687         }
6688
6689         if (data >= lowwat) {
6690                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6691 #if (DEBUG || DEVELOPMENT)
6692                     && so_notsent_lowat_check == 1
6693 #endif /* DEBUG || DEVELOPMENT */
6694                     ) {
6695                         if ((SOCK_DOM(so) == PF_INET ||
6696                             SOCK_DOM(so) == PF_INET6) &&
6697                             so->so_type == SOCK_STREAM) {
6698                                 ret = tcp_notsent_lowat_check(so);
6699                         }
6700 #if MPTCP
6701                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6702                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6703                                 ret = mptcp_notsent_lowat_check(so);
6704                         }
6705 #endif
6706                         else {
6707                                 ret = 1;
6708                                 goto out;
6709                         }
6710                 } else {
6711                         ret = 1;
6712                 }
6713         }
6714         if (so_wait_for_if_feedback(so)) {
6715                 ret = 0;
6716         }
6717
6718 out:
6719         if (ret && kev) {
6720                 knote_fill_kevent(kn, kev, data);
6721         }
6722         return ret;
6723 }
6724
6725 static int
6726 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6727 {
6728         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6729
6730         /* socket locked */
6731         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6732                 so->so_snd.sb_flags |= SB_KNOTE;
6733         }
6734
6735         /* determine if its already fired */
6736         return filt_sowrite_common(kn, NULL, so);
6737 }
6738
6739 static void
6740 filt_sowdetach(struct knote *kn)
6741 {
6742         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6743         socket_lock(so, 1);
6744
6745         if (so->so_snd.sb_flags & SB_KNOTE) {
6746                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6747                         so->so_snd.sb_flags &= ~SB_KNOTE;
6748                 }
6749         }
6750         socket_unlock(so, 1);
6751 }
6752
6753 /*ARGSUSED*/
6754 static int
6755 filt_sowrite(struct knote *kn, long hint)
6756 {
6757         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6758         int ret;
6759
6760         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6761                 socket_lock(so, 1);
6762         }
6763
6764         ret = filt_sowrite_common(kn, NULL, so);
6765
6766         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6767                 socket_unlock(so, 1);
6768         }
6769
6770         return ret;
6771 }
6772
6773 static int
6774 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6775 {
6776         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6777         int ret;
6778
6779         socket_lock(so, 1);
6780
6781         /*save off the new input fflags and data */
6782         kn->kn_sfflags = kev->fflags;
6783         kn->kn_sdata = kev->data;
6784
6785         /* determine if these changes result in a triggered event */
6786         ret = filt_sowrite_common(kn, NULL, so);
6787
6788         socket_unlock(so, 1);
6789
6790         return ret;
6791 }
6792
6793 static int
6794 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6795 {
6796         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6797         int ret;
6798
6799         socket_lock(so, 1);
6800         ret = filt_sowrite_common(kn, kev, so);
6801         socket_unlock(so, 1);
6802
6803         return ret;
6804 }
6805
6806 static int
6807 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6808     struct socket *so, long ev_hint)
6809 {
6810         int ret = 0;
6811         int64_t data = 0;
6812         uint32_t level_trigger = 0;
6813
6814         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6815                 kn->kn_fflags |= NOTE_CONNRESET;
6816         }
6817         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6818                 kn->kn_fflags |= NOTE_TIMEOUT;
6819         }
6820         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6821                 kn->kn_fflags |= NOTE_NOSRCADDR;
6822         }
6823         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6824                 kn->kn_fflags |= NOTE_IFDENIED;
6825         }
6826         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6827                 kn->kn_fflags |= NOTE_KEEPALIVE;
6828         }
6829         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6830                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6831         }
6832         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6833                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6834         }
6835         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6836             (so->so_state & SS_ISCONNECTED)) {
6837                 kn->kn_fflags |= NOTE_CONNECTED;
6838                 level_trigger |= NOTE_CONNECTED;
6839         }
6840         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6841             (so->so_state & SS_ISDISCONNECTED)) {
6842                 kn->kn_fflags |= NOTE_DISCONNECTED;
6843                 level_trigger |= NOTE_DISCONNECTED;
6844         }
6845         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6846                 if (so->so_proto != NULL &&
6847                     (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6848                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6849                 }
6850         }
6851
6852         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6853             tcp_notify_ack_active(so)) {
6854                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6855         }
6856
6857         if ((so->so_state & SS_CANTRCVMORE)
6858 #if CONTENT_FILTER
6859             && cfil_sock_data_pending(&so->so_rcv) == 0
6860 #endif /* CONTENT_FILTER */
6861             ) {
6862                 kn->kn_fflags |= NOTE_READCLOSED;
6863                 level_trigger |= NOTE_READCLOSED;
6864         }
6865
6866         if (so->so_state & SS_CANTSENDMORE) {
6867                 kn->kn_fflags |= NOTE_WRITECLOSED;
6868                 level_trigger |= NOTE_WRITECLOSED;
6869         }
6870
6871         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6872             (so->so_flags & SOF_SUSPENDED)) {
6873                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6874
6875                 /* If resume event was delivered before, reset it */
6876                 kn->kn_hook32 &= ~NOTE_RESUME;
6877
6878                 kn->kn_fflags |= NOTE_SUSPEND;
6879                 level_trigger |= NOTE_SUSPEND;
6880         }
6881
6882         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6883             (so->so_flags & SOF_SUSPENDED) == 0) {
6884                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6885
6886                 /* If suspend event was delivered before, reset it */
6887                 kn->kn_hook32 &= ~NOTE_SUSPEND;
6888
6889                 kn->kn_fflags |= NOTE_RESUME;
6890                 level_trigger |= NOTE_RESUME;
6891         }
6892
6893         if (so->so_error != 0) {
6894                 ret = 1;
6895                 data = so->so_error;
6896                 kn->kn_flags |= EV_EOF;
6897         } else {
6898                 u_int32_t data32;
6899                 get_sockev_state(so, &data32);
6900                 data = data32;
6901         }
6902
6903         /* Reset any events that are not requested on this knote */
6904         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6905         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6906
6907         /* Find the level triggerred events that are already delivered */
6908         level_trigger &= kn->kn_hook32;
6909         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6910
6911         /* Do not deliver level triggerred events more than once */
6912         if ((kn->kn_fflags & ~level_trigger) != 0) {
6913                 ret = 1;
6914         }
6915
6916         if (ret && kev) {
6917                 /*
6918                  * Store the state of the events being delivered. This
6919                  * state can be used to deliver level triggered events
6920                  * ateast once and still avoid waking up the application
6921                  * multiple times as long as the event is active.
6922                  */
6923                 if (kn->kn_fflags != 0) {
6924                         kn->kn_hook32 |= (kn->kn_fflags &
6925                             EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6926                 }
6927
6928                 /*
6929                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6930                  * only one of them and remember the last one that was
6931                  * delivered last
6932                  */
6933                 if (kn->kn_fflags & NOTE_SUSPEND) {
6934                         kn->kn_hook32 &= ~NOTE_RESUME;
6935                 }
6936                 if (kn->kn_fflags & NOTE_RESUME) {
6937                         kn->kn_hook32 &= ~NOTE_SUSPEND;
6938                 }
6939
6940                 knote_fill_kevent(kn, kev, data);
6941         }
6942         return ret;
6943 }
6944
6945 static int
6946 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6947 {
6948         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6949
6950         /* socket locked */
6951         kn->kn_hook32 = 0;
6952         if (KNOTE_ATTACH(&so->so_klist, kn)) {
6953                 so->so_flags |= SOF_KNOTE;
6954         }
6955
6956         /* determine if event already fired */
6957         return filt_sockev_common(kn, NULL, so, 0);
6958 }
6959
6960 static void
6961 filt_sockdetach(struct knote *kn)
6962 {
6963         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6964         socket_lock(so, 1);
6965
6966         if ((so->so_flags & SOF_KNOTE) != 0) {
6967                 if (KNOTE_DETACH(&so->so_klist, kn)) {
6968                         so->so_flags &= ~SOF_KNOTE;
6969                 }
6970         }
6971         socket_unlock(so, 1);
6972 }
6973
6974 static int
6975 filt_sockev(struct knote *kn, long hint)
6976 {
6977         int ret = 0, locked = 0;
6978         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6979         long ev_hint = (hint & SO_FILT_HINT_EV);
6980
6981         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6982                 socket_lock(so, 1);
6983                 locked = 1;
6984         }
6985
6986         ret = filt_sockev_common(kn, NULL, so, ev_hint);
6987
6988         if (locked) {
6989                 socket_unlock(so, 1);
6990         }
6991
6992         return ret;
6993 }
6994
6995
6996
6997 /*
6998  *      filt_socktouch - update event state
6999  */
7000 static int
7001 filt_socktouch(
7002         struct knote *kn,
7003         struct kevent_qos_s *kev)
7004 {
7005         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7006         uint32_t changed_flags;
7007         int ret;
7008
7009         socket_lock(so, 1);
7010
7011         /* save off the [result] data and fflags */
7012         changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7013
7014         /* save off the new input fflags and data */
7015         kn->kn_sfflags = kev->fflags;
7016         kn->kn_sdata = kev->data;
7017
7018         /* restrict the current results to the (smaller?) set of new interest */
7019         /*
7020          * For compatibility with previous implementations, we leave kn_fflags
7021          * as they were before.
7022          */
7023         //kn->kn_fflags &= kev->fflags;
7024
7025         /*
7026          * Since we keep track of events that are already
7027          * delivered, if any of those events are not requested
7028          * anymore the state related to them can be reset
7029          */
7030         kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7031
7032         /* determine if we have events to deliver */
7033         ret = filt_sockev_common(kn, NULL, so, 0);
7034
7035         socket_unlock(so, 1);
7036
7037         return ret;
7038 }
7039
7040 /*
7041  *      filt_sockprocess - query event fired state and return data
7042  */
7043 static int
7044 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7045 {
7046         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7047         int ret = 0;
7048
7049         socket_lock(so, 1);
7050
7051         ret = filt_sockev_common(kn, kev, so, 0);
7052
7053         socket_unlock(so, 1);
7054
7055         return ret;
7056 }
7057
7058 void
7059 get_sockev_state(struct socket *so, u_int32_t *statep)
7060 {
7061         u_int32_t state = *(statep);
7062
7063         /*
7064          * If the state variable is already used by a previous event,
7065          * reset it.
7066          */
7067         if (state != 0) {
7068                 return;
7069         }
7070
7071         if (so->so_state & SS_ISCONNECTED) {
7072                 state |= SOCKEV_CONNECTED;
7073         } else {
7074                 state &= ~(SOCKEV_CONNECTED);
7075         }
7076         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7077         *(statep) = state;
7078 }
7079
7080 #define SO_LOCK_HISTORY_STR_LEN \
7081         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7082
7083 __private_extern__ const char *
7084 solockhistory_nr(struct socket *so)
7085 {
7086         size_t n = 0;
7087         int i;
7088         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7089
7090         bzero(lock_history_str, sizeof(lock_history_str));
7091         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7092                 n += scnprintf(lock_history_str + n,
7093                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7094                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7095                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7096         }
7097         return lock_history_str;
7098 }
7099
7100 lck_mtx_t *
7101 socket_getlock(struct socket *so, int flags)
7102 {
7103         if (so->so_proto->pr_getlock != NULL) {
7104                 return (*so->so_proto->pr_getlock)(so, flags);
7105         } else {
7106                 return so->so_proto->pr_domain->dom_mtx;
7107         }
7108 }
7109
7110 void
7111 socket_lock(struct socket *so, int refcount)
7112 {
7113         void *lr_saved;
7114
7115         lr_saved = __builtin_return_address(0);
7116
7117         if (so->so_proto->pr_lock) {
7118                 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7119         } else {
7120 #ifdef MORE_LOCKING_DEBUG
7121                 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7122                     LCK_MTX_ASSERT_NOTOWNED);
7123 #endif
7124                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7125                 if (refcount) {
7126                         so->so_usecount++;
7127                 }
7128                 so->lock_lr[so->next_lock_lr] = lr_saved;
7129                 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7130         }
7131 }
7132
7133 void
7134 socket_lock_assert_owned(struct socket *so)
7135 {
7136         lck_mtx_t *mutex_held;
7137
7138         if (so->so_proto->pr_getlock != NULL) {
7139                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7140         } else {
7141                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7142         }
7143
7144         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7145 }
7146
7147 int
7148 socket_try_lock(struct socket *so)
7149 {
7150         lck_mtx_t *mtx;
7151
7152         if (so->so_proto->pr_getlock != NULL) {
7153                 mtx = (*so->so_proto->pr_getlock)(so, 0);
7154         } else {
7155                 mtx = so->so_proto->pr_domain->dom_mtx;
7156         }
7157
7158         return lck_mtx_try_lock(mtx);
7159 }
7160
7161 void
7162 socket_unlock(struct socket *so, int refcount)
7163 {
7164         void *lr_saved;
7165         lck_mtx_t *mutex_held;
7166
7167         lr_saved = __builtin_return_address(0);
7168
7169         if (so == NULL || so->so_proto == NULL) {
7170                 panic("%s: null so_proto so=%p\n", __func__, so);
7171                 /* NOTREACHED */
7172         }
7173
7174         if (so->so_proto->pr_unlock) {
7175                 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7176         } else {
7177                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7178 #ifdef MORE_LOCKING_DEBUG
7179                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7180 #endif
7181                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7182                 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7183
7184                 if (refcount) {
7185                         if (so->so_usecount <= 0) {
7186                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7187                                     "lrh=%s", __func__, so->so_usecount, so,
7188                                     SOCK_DOM(so), so->so_type,
7189                                     SOCK_PROTO(so), solockhistory_nr(so));
7190                                 /* NOTREACHED */
7191                         }
7192
7193                         so->so_usecount--;
7194                         if (so->so_usecount == 0) {
7195                                 sofreelastref(so, 1);
7196                         }
7197                 }
7198                 lck_mtx_unlock(mutex_held);
7199         }
7200 }
7201
7202 /* Called with socket locked, will unlock socket */
7203 void
7204 sofree(struct socket *so)
7205 {
7206         lck_mtx_t *mutex_held;
7207
7208         if (so->so_proto->pr_getlock != NULL) {
7209                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7210         } else {
7211                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7212         }
7213         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7214
7215         sofreelastref(so, 0);
7216 }
7217
/*
 *      soreference - take one reference on a socket
 *
 * Implemented as a lock/unlock pair in which only the lock side is
 * asked to adjust the reference count.
 */
void
soreference(struct socket *so)
{
        /* lock and take one reference on the socket */
        socket_lock(so, 1);

        /* unlock only, the reference is kept */
        socket_unlock(so, 0);
}
7224
/*
 *      sodereference - drop one reference on a socket
 *
 * Implemented as a lock/unlock pair in which only the unlock side is
 * asked to adjust the reference count.
 */
void
sodereference(struct socket *so)
{
        /* lock only, no reference taken */
        socket_lock(so, 0);

        /* unlock and release one reference */
        socket_unlock(so, 1);
}
7231
7232 /*
7233  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7234  * possibility of using jumbo clusters.  Caller must ensure to hold
7235  * the socket lock.
7236  */
7237 void
7238 somultipages(struct socket *so, boolean_t set)
7239 {
7240         if (set) {
7241                 so->so_flags |= SOF_MULTIPAGES;
7242         } else {
7243                 so->so_flags &= ~SOF_MULTIPAGES;
7244         }
7245 }
7246
7247 void
7248 soif2kcl(struct socket *so, boolean_t set)
7249 {
7250         if (set) {
7251                 so->so_flags1 |= SOF1_IF_2KCL;
7252         } else {
7253                 so->so_flags1 &= ~SOF1_IF_2KCL;
7254         }
7255 }
7256
7257 int
7258 so_isdstlocal(struct socket *so)
7259 {
7260         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7261
7262         if (SOCK_DOM(so) == PF_INET) {
7263                 return inaddr_local(inp->inp_faddr);
7264         } else if (SOCK_DOM(so) == PF_INET6) {
7265                 return in6addr_local(&inp->in6p_faddr);
7266         }
7267
7268         return 0;
7269 }
7270
/*
 *      sosetdefunct - mark a socket as defunct
 *
 * Sets SOF_DEFUNCT on the socket, sets SB_DROP on both socket buffers
 * and releases any data they hold.  No locking is done here.
 *
 * Parameters:
 *      p        target process; used for logging and, when extended
 *               background idle is granted, to flag P_LXBKIDLEINPROG.
 *               May be PROC_NULL, which suppresses the optional logging.
 *      so       socket to mark
 *      level    only reported in log messages
 *      noforce  when TRUE, a socket flagged SOF_NODEFUNCT is left alone,
 *               and a socket that wants extended background idle may be
 *               granted it instead of being marked (p != PROC_NULL)
 *
 * Returns 0 when the socket is (or already was) marked defunct, or
 * EOPNOTSUPP when it was left alone because of noforce.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
        struct sockbuf *rcv, *snd;
        int err = 0, defunct;

        rcv = &so->so_rcv;
        snd = &so->so_snd;

        /* Already marked: only check the SB_DROP invariant and log */
        defunct = (so->so_flags & SOF_DEFUNCT);
        if (defunct) {
                if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
                        panic("%s: SB_DROP not set", __func__);
                        /* NOTREACHED */
                }
                goto done;
        }

        if (so->so_flags & SOF_NODEFUNCT) {
                /* Socket opted out of defunct: honor it unless forced */
                if (noforce) {
                        err = EOPNOTSUPP;
                        if (p != PROC_NULL) {
                                SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                                    "name %s level %d) so 0x%llx [%d,%d] "
                                    "is not eligible for defunct "
                                    "(%d)\n", __func__, proc_selfpid(),
                                    proc_best_name(current_proc()), proc_pid(p),
                                    proc_best_name(p), level,
                                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                                    SOCK_DOM(so), SOCK_TYPE(so), err);
                        }
                        return err;
                }
                /* Forced: drop the opt-out and continue to mark the socket */
                so->so_flags &= ~SOF_NODEFUNCT;
                if (p != PROC_NULL) {
                        SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                            "name %s level %d) so 0x%llx [%d,%d] "
                            "defunct by force "
                            "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                            SOCK_DOM(so), SOCK_TYPE(so), err);
                }
        } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
                struct inpcb *inp = (struct inpcb *)so->so_pcb;
                struct ifnet *ifp = inp->inp_last_outifp;

                /*
                 * Extended background idle is refused (and only counted)
                 * when the last output interface is cellular, the socket
                 * is delegated, or the configured idle time is zero.
                 */
                if (ifp && IFNET_IS_CELLULAR(ifp)) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
                } else if (so->so_flags & SOF_DELEGATED) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
                } else if (soextbkidlestat.so_xbkidle_time == 0) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
                } else if (noforce && p != PROC_NULL) {
                        /*
                         * Grant extended background idle instead of
                         * defuncting: record the start time, flag the
                         * process, schedule the lazy inpcb timer and
                         * return without marking the socket.
                         */
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

                        so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
                        so->so_extended_bk_start = net_uptime();
                        OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

                        inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

                        err = EOPNOTSUPP;
                        SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                            "name %s level %d) so 0x%llx [%d,%d] "
                            "extend bk idle "
                            "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                            SOCK_DOM(so), SOCK_TYPE(so), err);
                        return err;
                } else {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
                }
        }

        so->so_flags |= SOF_DEFUNCT;

        /* Prevent further data from being appended to the socket buffers */
        snd->sb_flags |= SB_DROP;
        rcv->sb_flags |= SB_DROP;

        /* Flush any existing data in the socket buffers */
        if (rcv->sb_cc != 0) {
                rcv->sb_flags &= ~SB_SEL;
                selthreadclear(&rcv->sb_sel);
                sbrelease(rcv);
        }
        if (snd->sb_cc != 0) {
                snd->sb_flags &= ~SB_SEL;
                selthreadclear(&snd->sb_sel);
                sbrelease(snd);
        }

done:
        /* "defunct" still holds the state found on entry */
        if (p != PROC_NULL) {
                SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
                    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
                    proc_selfpid(), proc_best_name(current_proc()),
                    proc_pid(p), proc_best_name(p), level,
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
                    SOCK_TYPE(so), defunct ? "is already" : "marked as",
                    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
                    " extbkidle" : "");
        }
        return err;
}
7380
/*
 *      sodefunct - tear down a socket that has been marked SOF_DEFUNCT
 *
 * Panics when the socket has not been marked with SOF_DEFUNCT; does
 * nothing when SS_DEFUNCT is already set.  Otherwise it wakes up and
 * unlocks both socket buffers, shuts down both directions, disconnects
 * the socket, sets so_error to EBADF if no error is pending, releases
 * any buffered data and finally sets SS_DEFUNCT.
 *
 * Parameters:
 *      p      target process, used only for logging; PROC_NULL
 *             suppresses the log message
 *      so     socket to defunct; the "keep socket locked" notes below
 *             suggest the socket lock is held by the caller
 *      level  only reported in log messages
 *
 * Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
        struct sockbuf *rcv, *snd;

        if (!(so->so_flags & SOF_DEFUNCT)) {
                panic("%s improperly called", __func__);
                /* NOTREACHED */
        }
        /* Already fully defunct: nothing left to do */
        if (so->so_state & SS_DEFUNCT) {
                goto done;
        }

        rcv = &so->so_rcv;
        snd = &so->so_snd;

        /* Logging only: Internet sockets get their addresses printed */
        if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
                char s[MAX_IPv6_STR_LEN];
                char d[MAX_IPv6_STR_LEN];
                struct inpcb *inp = sotoinpcb(so);

                if (p != PROC_NULL) {
                        SODEFUNCTLOG(
                                "%s[%d, %s]: (target pid %d name %s level %d) "
                                "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
                                "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
                                " snd_fl 0x%x]\n", __func__,
                                proc_selfpid(), proc_best_name(current_proc()),
                                proc_pid(p), proc_best_name(p), level,
                                (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                                (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
                                inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
                                (void *)&inp->inp_laddr.s_addr :
                                (void *)&inp->in6p_laddr),
                                s, sizeof(s)), ntohs(inp->in6p_lport),
                                inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
                                (void *)&inp->inp_faddr.s_addr :
                                (void *)&inp->in6p_faddr,
                                d, sizeof(d)), ntohs(inp->in6p_fport),
                                (uint32_t)rcv->sb_sel.si_flags,
                                (uint32_t)snd->sb_sel.si_flags,
                                rcv->sb_flags, snd->sb_flags);
                }
        } else if (p != PROC_NULL) {
                SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
                    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
                    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
                    proc_selfpid(), proc_best_name(current_proc()),
                    proc_pid(p), proc_best_name(p), level,
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                    SOCK_DOM(so), SOCK_TYPE(so),
                    (uint32_t)rcv->sb_sel.si_flags,
                    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
                    snd->sb_flags);
        }

        /*
         * Unwedge threads blocked on sbwait() and sb_lock().
         */
        sbwakeup(rcv);
        sbwakeup(snd);

        /* Flag the teardown as in progress before releasing sb locks */
        so->so_flags1 |= SOF1_DEFUNCTINPROG;
        if (rcv->sb_flags & SB_LOCK) {
                sbunlock(rcv, TRUE);    /* keep socket locked */
        }
        if (snd->sb_flags & SB_LOCK) {
                sbunlock(snd, TRUE);    /* keep socket locked */
        }
        /*
         * Flush the buffers and disconnect.  We explicitly call shutdown
         * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
         * states are set for the socket.  This would also flush out data
         * hanging off the receive list of this socket.
         */
        (void) soshutdownlock_final(so, SHUT_RD);
        (void) soshutdownlock_final(so, SHUT_WR);
        (void) sodisconnectlocked(so);

        /*
         * Explicitly handle connectionless-protocol disconnection
         * and release any remaining data in the socket buffers.
         */
        if (!(so->so_state & SS_ISDISCONNECTED)) {
                (void) soisdisconnected(so);
        }

        /* Report EBADF unless another error is already pending */
        if (so->so_error == 0) {
                so->so_error = EBADF;
        }

        if (rcv->sb_cc != 0) {
                rcv->sb_flags &= ~SB_SEL;
                selthreadclear(&rcv->sb_sel);
                sbrelease(rcv);
        }
        if (snd->sb_cc != 0) {
                snd->sb_flags &= ~SB_SEL;
                selthreadclear(&snd->sb_sel);
                sbrelease(snd);
        }
        /* Teardown complete: later calls return early on SS_DEFUNCT */
        so->so_state |= SS_DEFUNCT;
        OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
        return 0;
}
7488
7489 int
7490 soresume(struct proc *p, struct socket *so, int locked)
7491 {
7492         if (locked == 0) {
7493                 socket_lock(so, 1);
7494         }
7495
7496         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7497                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7498                     "[%d,%d] resumed from bk idle\n",
7499                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7500                     proc_pid(p), proc_best_name(p),
7501                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7502                     SOCK_DOM(so), SOCK_TYPE(so));
7503
7504                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7505                 so->so_extended_bk_start = 0;
7506                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7507
7508                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7509                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7510                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7511         }
7512         if (locked == 0) {
7513                 socket_unlock(so, 1);
7514         }
7515
7516         return 0;
7517 }
7518
7519 /*
7520  * Does not attempt to account for sockets that are delegated from
7521  * the current process
7522  */
7523 int
7524 so_set_extended_bk_idle(struct socket *so, int optval)
7525 {
7526         int error = 0;
7527
7528         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7529             SOCK_PROTO(so) != IPPROTO_TCP) {
7530                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7531                 error = EOPNOTSUPP;
7532         } else if (optval == 0) {
7533                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7534
7535                 soresume(current_proc(), so, 1);
7536         } else {
7537                 struct proc *p = current_proc();
7538                 int i;
7539                 struct filedesc *fdp;
7540                 int count = 0;
7541
7542                 /*
7543                  * Unlock socket to avoid lock ordering issue with
7544                  * the proc fd table lock
7545                  */
7546                 socket_unlock(so, 0);
7547
7548                 proc_fdlock(p);
7549
7550                 fdp = p->p_fd;
7551                 for (i = 0; i < fdp->fd_nfiles; i++) {
7552                         struct fileproc *fp = fdp->fd_ofiles[i];
7553                         struct socket *so2;
7554
7555                         if (fp == NULL ||
7556                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7557                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7558                                 continue;
7559                         }
7560
7561                         so2 = (struct socket *)fp->f_fglob->fg_data;
7562                         if (so != so2 &&
7563                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7564                                 count++;
7565                         }
7566                         if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7567                                 break;
7568                         }
7569                 }
7570                 proc_fdunlock(p);
7571
7572                 socket_lock(so, 0);
7573
7574                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7575                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7576                         error = EBUSY;
7577                 } else if (so->so_flags & SOF_DELEGATED) {
7578                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7579                         error = EBUSY;
7580                 } else {
7581                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7582                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7583                 }
7584                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7585                     "%s marked for extended bk idle\n",
7586                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7587                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7588                     SOCK_DOM(so), SOCK_TYPE(so),
7589                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7590                     "is" : "not");
7591         }
7592
7593         return error;
7594 }
7595
7596 static void
7597 so_stop_extended_bk_idle(struct socket *so)
7598 {
7599         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7600         so->so_extended_bk_start = 0;
7601
7602         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7603         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7604         /*
7605          * Force defunct
7606          */
7607         sosetdefunct(current_proc(), so,
7608             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7609         if (so->so_flags & SOF_DEFUNCT) {
7610                 sodefunct(current_proc(), so,
7611                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7612         }
7613 }
7614
7615 void
7616 so_drain_extended_bk_idle(struct socket *so)
7617 {
7618         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7619                 /*
7620                  * Only penalize sockets that have outstanding data
7621                  */
7622                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7623                         so_stop_extended_bk_idle(so);
7624
7625                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7626                 }
7627         }
7628 }
7629
7630 /*
7631  * Return values tells if socket is still in extended background idle
7632  */
7633 int
7634 so_check_extended_bk_idle_time(struct socket *so)
7635 {
7636         int ret = 1;
7637
7638         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7639                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7640                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7641                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7642                     SOCK_DOM(so), SOCK_TYPE(so));
7643                 if (net_uptime() - so->so_extended_bk_start >
7644                     soextbkidlestat.so_xbkidle_time) {
7645                         so_stop_extended_bk_idle(so);
7646
7647                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7648
7649                         ret = 0;
7650                 } else {
7651                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7652
7653                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7654                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7655                 }
7656         }
7657
7658         return ret;
7659 }
7660
7661 void
7662 resume_proc_sockets(proc_t p)
7663 {
7664         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7665                 struct filedesc *fdp;
7666                 int i;
7667
7668                 proc_fdlock(p);
7669                 fdp = p->p_fd;
7670                 for (i = 0; i < fdp->fd_nfiles; i++) {
7671                         struct fileproc *fp;
7672                         struct socket *so;
7673
7674                         fp = fdp->fd_ofiles[i];
7675                         if (fp == NULL ||
7676                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7677                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7678                                 continue;
7679                         }
7680
7681                         so = (struct socket *)fp->f_fglob->fg_data;
7682                         (void) soresume(p, so, 0);
7683                 }
7684                 proc_fdunlock(p);
7685
7686                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7687         }
7688 }
7689
7690 __private_extern__ int
7691 so_set_recv_anyif(struct socket *so, int optval)
7692 {
7693         int ret = 0;
7694
7695 #if INET6
7696         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7697 #else
7698         if (SOCK_DOM(so) == PF_INET) {
7699 #endif /* !INET6 */
7700                 if (optval) {
7701                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7702                 } else {
7703                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7704                 }
7705         }
7706
7707
7708         return ret;
7709 }
7710
7711 __private_extern__ int
7712 so_get_recv_anyif(struct socket *so)
7713 {
7714         int ret = 0;
7715
7716 #if INET6
7717         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7718 #else
7719         if (SOCK_DOM(so) == PF_INET) {
7720 #endif /* !INET6 */
7721                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7722         }
7723
7724         return ret;
7725 }
7726
7727 int
7728 so_set_restrictions(struct socket *so, uint32_t vals)
7729 {
7730         int nocell_old, nocell_new;
7731         int noexpensive_old, noexpensive_new;
7732         int noconstrained_old, noconstrained_new;
7733
7734         /*
7735          * Deny-type restrictions are trapdoors; once set they cannot be
7736          * unset for the lifetime of the socket.  This allows them to be
7737          * issued by a framework on behalf of the application without
7738          * having to worry that they can be undone.
7739          *
7740          * Note here that socket-level restrictions overrides any protocol
7741          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7742          * socket restriction issued on the socket has a higher precendence
7743          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7744          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7745          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7746          */
7747         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7748         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7749         noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7750         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7751             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7752             SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7753         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7754         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7755         noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7756
7757         /* we can only set, not clear restrictions */
7758         if ((nocell_new - nocell_old) == 0 &&
7759             (noexpensive_new - noexpensive_old) == 0 &&
7760             (noconstrained_new - noconstrained_old) == 0) {
7761                 return 0;
7762         }
7763 #if INET6
7764         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7765 #else
7766         if (SOCK_DOM(so) == PF_INET) {
7767 #endif /* !INET6 */
7768                 if (nocell_new - nocell_old != 0) {
7769                         /*
7770                          * if deny cellular is now set, do what's needed
7771                          * for INPCB
7772                          */
7773                         inp_set_nocellular(sotoinpcb(so));
7774                 }
7775                 if (noexpensive_new - noexpensive_old != 0) {
7776                         inp_set_noexpensive(sotoinpcb(so));
7777                 }
7778                 if (noconstrained_new - noconstrained_old != 0) {
7779                         inp_set_noconstrained(sotoinpcb(so));
7780                 }
7781         }
7782
7783         if (SOCK_DOM(so) == PF_MULTIPATH) {
7784                 mptcp_set_restrictions(so);
7785         }
7786
7787         return 0;
7788 }
7789
7790 uint32_t
7791 so_get_restrictions(struct socket *so)
7792 {
7793         return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7794                SO_RESTRICT_DENY_OUT |
7795                SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7796 }
7797
7798 int
7799 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7800 {
7801         struct proc *ep = PROC_NULL;
7802         int error = 0;
7803
7804         /* pid 0 is reserved for kernel */
7805         if (epid == 0) {
7806                 error = EINVAL;
7807                 goto done;
7808         }
7809
7810         /*
7811          * If this is an in-kernel socket, prevent its delegate
7812          * association from changing unless the socket option is
7813          * coming from within the kernel itself.
7814          */
7815         if (so->last_pid == 0 && p != kernproc) {
7816                 error = EACCES;
7817                 goto done;
7818         }
7819
7820         /*
7821          * If this is issued by a process that's recorded as the
7822          * real owner of the socket, or if the pid is the same as
7823          * the process's own pid, then proceed.  Otherwise ensure
7824          * that the issuing process has the necessary privileges.
7825          */
7826         if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7827                 if ((error = priv_check_cred(kauth_cred_get(),
7828                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7829                         error = EACCES;
7830                         goto done;
7831                 }
7832         }
7833
7834         /* Find the process that corresponds to the effective pid */
7835         if ((ep = proc_find(epid)) == PROC_NULL) {
7836                 error = ESRCH;
7837                 goto done;
7838         }
7839
7840         /*
7841          * If a process tries to delegate the socket to itself, then
7842          * there's really nothing to do; treat it as a way for the
7843          * delegate association to be cleared.  Note that we check
7844          * the passed-in proc rather than calling proc_selfpid(),
7845          * as we need to check the process issuing the socket option
7846          * which could be kernproc.  Given that we don't allow 0 for
7847          * effective pid, it means that a delegated in-kernel socket
7848          * stays delegated during its lifetime (which is probably OK.)
7849          */
7850         if (epid == proc_pid(p)) {
7851                 so->so_flags &= ~SOF_DELEGATED;
7852                 so->e_upid = 0;
7853                 so->e_pid = 0;
7854                 uuid_clear(so->e_uuid);
7855         } else {
7856                 so->so_flags |= SOF_DELEGATED;
7857                 so->e_upid = proc_uniqueid(ep);
7858                 so->e_pid = proc_pid(ep);
7859                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7860         }
7861         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7862                 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7863         }
7864 done:
7865         if (error == 0 && net_io_policy_log) {
7866                 uuid_string_t buf;
7867
7868                 uuid_unparse(so->e_uuid, buf);
7869                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7870                     "euuid %s%s\n", __func__, proc_name_address(p),
7871                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7872                     SOCK_DOM(so), SOCK_TYPE(so),
7873                     so->e_pid, proc_name_address(ep), buf,
7874                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7875         } else if (error != 0 && net_io_policy_log) {
7876                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7877                     "ERROR (%d)\n", __func__, proc_name_address(p),
7878                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7879                     SOCK_DOM(so), SOCK_TYPE(so),
7880                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7881                     proc_name_address(ep), error);
7882         }
7883
7884         /* Update this socket's policy upon success */
7885         if (error == 0) {
7886                 so->so_policy_gencnt *= -1;
7887                 so_update_policy(so);
7888 #if NECP
7889                 so_update_necp_policy(so, NULL, NULL);
7890 #endif /* NECP */
7891         }
7892
7893         if (ep != PROC_NULL) {
7894                 proc_rele(ep);
7895         }
7896
7897         return error;
7898 }
7899
7900 int
7901 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7902 {
7903         uuid_string_t buf;
7904         uuid_t uuid;
7905         int error = 0;
7906
7907         /* UUID must not be all-zeroes (reserved for kernel) */
7908         if (uuid_is_null(euuid)) {
7909                 error = EINVAL;
7910                 goto done;
7911         }
7912
7913         /*
7914          * If this is an in-kernel socket, prevent its delegate
7915          * association from changing unless the socket option is
7916          * coming from within the kernel itself.
7917          */
7918         if (so->last_pid == 0 && p != kernproc) {
7919                 error = EACCES;
7920                 goto done;
7921         }
7922
7923         /* Get the UUID of the issuing process */
7924         proc_getexecutableuuid(p, uuid, sizeof(uuid));
7925
7926         /*
7927          * If this is issued by a process that's recorded as the
7928          * real owner of the socket, or if the uuid is the same as
7929          * the process's own uuid, then proceed.  Otherwise ensure
7930          * that the issuing process has the necessary privileges.
7931          */
7932         if (check_cred &&
7933             (uuid_compare(euuid, so->last_uuid) != 0 ||
7934             uuid_compare(euuid, uuid) != 0)) {
7935                 if ((error = priv_check_cred(kauth_cred_get(),
7936                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7937                         error = EACCES;
7938                         goto done;
7939                 }
7940         }
7941
7942         /*
7943          * If a process tries to delegate the socket to itself, then
7944          * there's really nothing to do; treat it as a way for the
7945          * delegate association to be cleared.  Note that we check
7946          * the uuid of the passed-in proc rather than that of the
7947          * current process, as we need to check the process issuing
7948          * the socket option which could be kernproc itself.  Given
7949          * that we don't allow 0 for effective uuid, it means that
7950          * a delegated in-kernel socket stays delegated during its
7951          * lifetime (which is okay.)
7952          */
7953         if (uuid_compare(euuid, uuid) == 0) {
7954                 so->so_flags &= ~SOF_DELEGATED;
7955                 so->e_upid = 0;
7956                 so->e_pid = 0;
7957                 uuid_clear(so->e_uuid);
7958         } else {
7959                 so->so_flags |= SOF_DELEGATED;
7960                 /*
7961                  * Unlike so_set_effective_pid(), we only have the UUID
7962                  * here and the process ID is not known.  Inherit the
7963                  * real {pid,upid} of the socket.
7964                  */
7965                 so->e_upid = so->last_upid;
7966                 so->e_pid = so->last_pid;
7967                 uuid_copy(so->e_uuid, euuid);
7968         }
7969         /*
7970          * The following will clear the effective process name as it's the same
7971          * as the real process
7972          */
7973         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7974                 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7975         }
7976 done:
7977         if (error == 0 && net_io_policy_log) {
7978                 uuid_unparse(so->e_uuid, buf);
7979                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7980                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7981                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7982                     SOCK_TYPE(so), so->e_pid, buf,
7983                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7984         } else if (error != 0 && net_io_policy_log) {
7985                 uuid_unparse(euuid, buf);
7986                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7987                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7988                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7989                     SOCK_TYPE(so), buf, error);
7990         }
7991
7992         /* Update this socket's policy upon success */
7993         if (error == 0) {
7994                 so->so_policy_gencnt *= -1;
7995                 so_update_policy(so);
7996 #if NECP
7997                 so_update_necp_policy(so, NULL, NULL);
7998 #endif /* NECP */
7999         }
8000
8001         return error;
8002 }
8003
8004 void
8005 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8006     uint32_t ev_datalen)
8007 {
8008         struct kev_msg ev_msg;
8009
8010         /*
8011          * A netpolicy event always starts with a netpolicy_event_data
8012          * structure, but the caller can provide for a longer event
8013          * structure to post, depending on the event code.
8014          */
8015         VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8016
8017         bzero(&ev_msg, sizeof(ev_msg));
8018         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
8019         ev_msg.kev_class        = KEV_NETWORK_CLASS;
8020         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
8021         ev_msg.event_code       = ev_code;
8022
8023         ev_msg.dv[0].data_ptr   = ev_data;
8024         ev_msg.dv[0].data_length = ev_datalen;
8025
8026         kev_post_msg(&ev_msg);
8027 }
8028
8029 void
8030 socket_post_kev_msg(uint32_t ev_code,
8031     struct kev_socket_event_data *ev_data,
8032     uint32_t ev_datalen)
8033 {
8034         struct kev_msg ev_msg;
8035
8036         bzero(&ev_msg, sizeof(ev_msg));
8037         ev_msg.vendor_code = KEV_VENDOR_APPLE;
8038         ev_msg.kev_class = KEV_NETWORK_CLASS;
8039         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8040         ev_msg.event_code = ev_code;
8041
8042         ev_msg.dv[0].data_ptr = ev_data;
8043         ev_msg.dv[0].data_length = ev_datalen;
8044
8045         kev_post_msg(&ev_msg);
8046 }
8047
8048 void
8049 socket_post_kev_msg_closed(struct socket *so)
8050 {
8051         struct kev_socket_closed ev;
8052         struct sockaddr *socksa = NULL, *peersa = NULL;
8053         int err;
8054         bzero(&ev, sizeof(ev));
8055         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8056         if (err == 0) {
8057                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8058                     &peersa);
8059                 if (err == 0) {
8060                         memcpy(&ev.ev_data.kev_sockname, socksa,
8061                             min(socksa->sa_len,
8062                             sizeof(ev.ev_data.kev_sockname)));
8063                         memcpy(&ev.ev_data.kev_peername, peersa,
8064                             min(peersa->sa_len,
8065                             sizeof(ev.ev_data.kev_peername)));
8066                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
8067                             &ev.ev_data, sizeof(ev));
8068                 }
8069         }
8070         if (socksa != NULL) {
8071                 FREE(socksa, M_SONAME);
8072         }
8073         if (peersa != NULL) {
8074                 FREE(peersa, M_SONAME);
8075         }
8076 }