bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/net_api_stats.h>
 102 #include <net/ntstat.h>
 103 #include <net/content_filter.h>
 104 #include <netinet/in.h>
 105 #include <netinet/in_pcb.h>
 106 #include <netinet/in_tclass.h>
 107 #include <netinet/in_var.h>
 108 #include <netinet/tcp_var.h>
 109 #include <netinet/ip6.h>
 110 #include <netinet6/ip6_var.h>
 111 #include <netinet/flow_divert.h>
 112 #include <kern/zalloc.h>
 113 #include <kern/locks.h>
 114 #include <machine/limits.h>
 115 #include <libkern/OSAtomic.h>
 116 #include <pexpert/pexpert.h>
 117 #include <kern/assert.h>
 118 #include <kern/task.h>
 119 #include <kern/policy_internal.h>
 120
 121 #include <sys/kpi_mbuf.h>
 122 #include <sys/mcache.h>
 123 #include <sys/unpcb.h>
 124 #include <libkern/section_keywords.h>
 125
 126 #if CONFIG_MACF
 127 #include <security/mac_framework.h>
 128 #endif /* MAC */
 129
 130 #if MULTIPATH
 131 #include <netinet/mp_pcb.h>
 132 #include <netinet/mptcp_var.h>
 133 #endif /* MULTIPATH */
 134
 135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))      /* mask-based round-up of a; yields a multiple of b only when b is a power of two */
 136 /* DEBUG_KERNEL_ADDRPERM(_v): _v itself on DEBUG/DEVELOPMENT builds, VM_KERNEL_ADDRPERM(_v) otherwise. */
 137 #if DEBUG || DEVELOPMENT
 138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 139 #else
 140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 141 #endif
 142
 143 /* TODO: this should be in a header file somewhere */
 144 extern char *proc_name_address(void *p);
 145
 146 static u_int32_t        so_cache_hw;    /* highest value cached_sock_count has reached (see cached_sock_free) */
 147 static u_int32_t        so_cache_timeouts;      /* number of so_cache_timer() invocations */
 148 static u_int32_t        so_cache_max_freed;     /* times so_cache_timer() stopped at SO_CACHE_MAX_FREE_BATCH */
 149 static u_int32_t        cached_sock_count = 0; /* number of sockets currently on so_cache_head */
 150 STAILQ_HEAD(, socket)   so_cache_head; /* cached sockets: appended at the tail, reused/expired from the head */
 151 int     max_cached_sock_count = MAX_CACHED_SOCKETS; /* above this count cached_sock_free() returns sockets to the zone */
 152 static u_int32_t        so_cache_time; /* net_uptime() sampled by the latest cached_sock_free()/so_cache_timer() */
 153 static int              socketinit_done; /* set to 1 by the first socketinit() call */
 154 static struct zone      *so_cache_zone; /* zone for cached sockets, created in socketinit() */
 155 /* Lock group, attributes and mutex for the socket cache; all allocated in socketinit(). */
 156 static lck_grp_t        *so_cache_mtx_grp;
 157 static lck_attr_t       *so_cache_mtx_attr;
 158 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 159 static lck_mtx_t        *so_cache_mtx; /* held while so_cache_head and cached_sock_count are manipulated */
 160
 161 #include <machine/limits.h>
 162
 163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
 164 static void     filt_sordetach(struct knote *kn);
 165 static int      filt_soread(struct knote *kn, long hint);
 166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
 167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
 168
 169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
 170 static void     filt_sowdetach(struct knote *kn);
 171 static int      filt_sowrite(struct knote *kn, long hint);
 172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
 173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
 174
 175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
 176 static void     filt_sockdetach(struct knote *kn);
 177 static int      filt_sockev(struct knote *kn, long hint);
 178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
 179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
 180
 181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 183
 184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = { /* filter ops wired to the filt_sor* handlers declared above */
 185         .f_isfd = 1,
 186         .f_attach = filt_sorattach,
 187         .f_detach = filt_sordetach,
 188         .f_event = filt_soread,
 189         .f_touch = filt_sortouch,
 190         .f_process = filt_sorprocess,
 191 };
 192 /* Filter ops wired to the filt_sow* handlers. */
 193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
 194         .f_isfd = 1,
 195         .f_attach = filt_sowattach,
 196         .f_detach = filt_sowdetach,
 197         .f_event = filt_sowrite,
 198         .f_touch = filt_sowtouch,
 199         .f_process = filt_sowprocess,
 200 };
 201 /* Filter ops wired to the filt_sock* handlers. */
 202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
 203         .f_isfd = 1,
 204         .f_attach = filt_sockattach,
 205         .f_detach = filt_sockdetach,
 206         .f_event = filt_sockev,
 207         .f_touch = filt_socktouch,
 208         .f_process = filt_sockprocess,
 209 };
 210 /* soexcept_filtops reuses exactly the same filt_sor* handlers as soread_filtops. */
 211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
 212         .f_isfd = 1,
 213         .f_attach = filt_sorattach,
 214         .f_detach = filt_sordetach,
 215         .f_event = filt_soread,
 216         .f_touch = filt_sortouch,
 217         .f_process = filt_sorprocess,
 218 };
 219
 220 SYSCTL_DECL(_kern_ipc);
 221
 222 #define EVEN_MORE_LOCKING_DEBUG 0
 223
 224 int socket_debug = 0;
 225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 227
 228 static unsigned long sodefunct_calls = 0;
 229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 230     &sodefunct_calls, "");
 231
 232 ZONE_DECLARE(socket_zone, "socket", sizeof(struct socket), ZC_ZFREE_CLEARMEM);
 233 so_gen_t        so_gencnt;      /* generation count for sockets */
 234
 235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 237
 238 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 239 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 240 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 241 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 242 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 243 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 244 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 245 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 246 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 247
 248 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 249
 250 int somaxconn = SOMAXCONN;
 251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 252     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 253
 254 /* Should we get a maximum also ??? */
 255 static int sosendmaxchain = 65536;
 256 static int sosendminchain = 16384;
 257 static int sorecvmincopy  = 16384;
 258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 259     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 261     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 262
 263 /*
 264  * Set to enable jumbo clusters (if available) for large writes when
 265  * the socket is marked with SOF_MULTIPAGES; see below.
 266  */
 267 int sosendjcl = 1;
 268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 269     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 270
 271 /*
 272  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 273  * writes on the socket for all protocols on any network interfaces,
 274  * depending upon sosendjcl above.  Be extra careful when setting this
 275  * to 1, because sending down packets that cross physical pages down to
 276  * broken drivers (those that falsely assume that the physical pages
 277  * are contiguous) might lead to system panics or silent data corruption.
 278  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 279  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 280  * capable.  Set this to 1 only for testing/debugging purposes.
 281  */
 282 int sosendjcl_ignore_capab = 0;
 283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 284     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 285
 286 /*
 287  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 288  * writes on the socket for all protocols on any network interfaces.
 289  * Be extra careful when setting this to 1, because sending down packets with
 290  * clusters larger than 2 KB might lead to system panics or data corruption.
 291  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 292  * on the outgoing interface.
 293  * Set this to 1 for testing/debugging purposes only.
 294  */
 295 int sosendbigcl_ignore_capab = 0;
 296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 297     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 298
 299 int sodefunctlog = 0;
 300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 301     &sodefunctlog, 0, "");
 302
 303 int sothrottlelog = 0;
 304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 305     &sothrottlelog, 0, "");
 306
 307 int sorestrictrecv = 1;
 308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 309     &sorestrictrecv, 0, "Enable inbound interface restrictions");
 310
 311 int sorestrictsend = 1;
 312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 313     &sorestrictsend, 0, "Enable outbound interface restrictions");
 314
 315 int soreserveheadroom = 1;
 316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 317     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 318
 319 #if (DEBUG || DEVELOPMENT)
 320 int so_notsent_lowat_check = 1;
 321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
 322     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 323 #endif /* DEBUG || DEVELOPMENT */
 324
 325 int so_accept_list_waits = 0;
 326 #if (DEBUG || DEVELOPMENT)
 327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
 328     &so_accept_list_waits, 0, "number of waits for listener incomp list");
 329 #endif /* DEBUG || DEVELOPMENT */
 330
 331 extern struct inpcbinfo tcbinfo;
 332
 333 /* TODO: these should be in header file */
 334 extern int get_inpcb_str_size(void);
 335 extern int get_tcp_str_size(void);
 336
 337 vm_size_t       so_cache_zone_element_size;
 338
 339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 340     user_ssize_t *);
 341 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
 342 static void cached_sock_free(struct socket *);
 343
 344 /*
 345  * Maximum of extended background idle sockets per process
 346  * Set to zero to disable further setting of the option
 347  */
 348
 349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 350 #define SO_IDLE_BK_IDLE_TIME            600
 351 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 352
 353 struct soextbkidlestat soextbkidlestat;
 354
 355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 356     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 357     "Maximum of extended background idle sockets per process");
 358
 359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 360     &soextbkidlestat.so_xbkidle_time, 0,
 361     "Time in seconds to keep extended background idle sockets");
 362
 363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 364     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 365     "High water mark for extended background idle sockets");
 366
 367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 368     &soextbkidlestat, soextbkidlestat, "");
 369
 370 int so_set_extended_bk_idle(struct socket *, int);
 371
 372
 373 /*
 374  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 375  * setting the DSCP code on the packet based on the service class; see
 376  * <rdar://problem/11277343> for details.
 377  */
 378 __private_extern__ u_int32_t sotcdb = 0;
 379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 380     &sotcdb, 0, "");
 381
 382 void
 383 socketinit(void)
 384 {
 385         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 386         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 387
 388 #ifdef __LP64__
 389         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 393         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 394         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 395 #else
 396         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 400         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 401         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 402 #endif
 403
 404         if (socketinit_done) {
 405                 printf("socketinit: already called...\n");
 406                 return;
 407         }
 408         socketinit_done = 1;
 409
 410         PE_parse_boot_argn("socket_debug", &socket_debug,
 411             sizeof(socket_debug));
 412
 413         /*
 414          * allocate lock group attribute and group for socket cache mutex
 415          */
 416         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 417         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 418             so_cache_mtx_grp_attr);
 419
 420         /*
 421          * allocate the lock attribute for socket cache mutex
 422          */
 423         so_cache_mtx_attr = lck_attr_alloc_init();
 424
 425         /* cached sockets mutex */
 426         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 427         if (so_cache_mtx == NULL) {
 428                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 429                 /* NOTREACHED */
 430         }
 431         STAILQ_INIT(&so_cache_head);
 432
 433         so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
 434             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 435
 436         so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
 437             ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);
 438
 439         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 440         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 441         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 442         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 443
 444         in_pcbinit();
 445         sflt_init();
 446         socket_tclass_init();
 447 #if MULTIPATH
 448         mp_pcbinit();
 449 #endif /* MULTIPATH */
 450 }
 451
 452 static void
 453 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
 454 {
 455         caddr_t temp;
 456         uintptr_t offset;
 457
 458         lck_mtx_lock(so_cache_mtx);
 459
 460         if (!STAILQ_EMPTY(&so_cache_head)) {
 461                 VERIFY(cached_sock_count > 0);
 462
 463                 *so = STAILQ_FIRST(&so_cache_head);
 464                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 465                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 466
 467                 cached_sock_count--;
 468                 lck_mtx_unlock(so_cache_mtx);
 469
 470                 temp = (*so)->so_saved_pcb;
 471                 bzero((caddr_t)*so, sizeof(struct socket));
 472
 473                 (*so)->so_saved_pcb = temp;
 474         } else {
 475                 lck_mtx_unlock(so_cache_mtx);
 476
 477                 *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
 478
 479                 /*
 480                  * Define offsets for extra structures into our
 481                  * single block of memory. Align extra structures
 482                  * on longword boundaries.
 483                  */
 484
 485                 offset = (uintptr_t)*so;
 486                 offset += sizeof(struct socket);
 487
 488                 offset = ALIGN(offset);
 489
 490                 (*so)->so_saved_pcb = (caddr_t)offset;
 491                 offset += get_inpcb_str_size();
 492
 493                 offset = ALIGN(offset);
 494
 495                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 496                     (caddr_t)offset;
 497         }
 498
 499         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 500 }
 501
 502 static void
 503 cached_sock_free(struct socket *so)
 504 {
 505         lck_mtx_lock(so_cache_mtx);
 506
 507         so_cache_time = net_uptime();
 508         if (++cached_sock_count > max_cached_sock_count) {
 509                 --cached_sock_count;
 510                 lck_mtx_unlock(so_cache_mtx);
 511                 zfree(so_cache_zone, so);
 512         } else {
 513                 if (so_cache_hw < cached_sock_count) {
 514                         so_cache_hw = cached_sock_count;
 515                 }
 516
 517                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 518
 519                 so->cache_timestamp = so_cache_time;
 520                 lck_mtx_unlock(so_cache_mtx);
 521         }
 522 }
 523
 524 void
 525 so_update_last_owner_locked(struct socket *so, proc_t self)
 526 {
 527         if (so->last_pid != 0) {
 528                 /*
 529                  * last_pid and last_upid should remain zero for sockets
 530                  * created using sock_socket. The check above achieves that
 531                  */
 532                 if (self == PROC_NULL) {
 533                         self = current_proc();
 534                 }
 535
 536                 if (so->last_upid != proc_uniqueid(self) ||
 537                     so->last_pid != proc_pid(self)) {
 538                         so->last_upid = proc_uniqueid(self);
 539                         so->last_pid = proc_pid(self);
 540                         proc_getexecutableuuid(self, so->last_uuid,
 541                             sizeof(so->last_uuid));
 542                         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
 543                                 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
 544                         }
 545                 }
 546                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 547         }
 548 }
 549
 550 void
 551 so_update_policy(struct socket *so)
 552 {
 553         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 554                 (void) inp_update_policy(sotoinpcb(so));
 555         }
 556 }
 557
 558 #if NECP
 559 static void
 560 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 561     struct sockaddr *override_remote_addr)
 562 {
 563         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 564                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 565                     override_remote_addr, 0);
 566         }
 567 }
 568 #endif /* NECP */
 569
 570 boolean_t
 571 so_cache_timer(void)
 572 {
 573         struct socket   *p;
 574         int             n_freed = 0;
 575         boolean_t rc = FALSE;
 576
 577         lck_mtx_lock(so_cache_mtx);
 578         so_cache_timeouts++;
 579         so_cache_time = net_uptime();
 580
 581         while (!STAILQ_EMPTY(&so_cache_head)) {
 582                 VERIFY(cached_sock_count > 0);
 583                 p = STAILQ_FIRST(&so_cache_head);
 584                 if ((so_cache_time - p->cache_timestamp) <
 585                     SO_CACHE_TIME_LIMIT) {
 586                         break;
 587                 }
 588
 589                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 590                 --cached_sock_count;
 591
 592                 zfree(so_cache_zone, p);
 593
 594                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 595                         so_cache_max_freed++;
 596                         break;
 597                 }
 598         }
 599
 600         /* Schedule again if there is more to cleanup */
 601         if (!STAILQ_EMPTY(&so_cache_head)) {
 602                 rc = TRUE;
 603         }
 604
 605         lck_mtx_unlock(so_cache_mtx);
 606         return rc;
 607 }
 608
 609 /*
 610  * Get a socket structure from our zone, and initialize it.
 611  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 612  * Note that it would probably be better to allocate socket
 613  * and PCB at the same time, but I'm not convinced that all
 614  * the protocols can be easily modified to do this.
 615  */
 616 struct socket *
 617 soalloc(int waitok, int dom, int type)
 618 {
 619         zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
 620         struct socket *so;
 621
 622         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 623                 cached_sock_alloc(&so, how);
 624         } else {
 625                 so = zalloc_flags(socket_zone, how | Z_ZERO);
 626         }
 627         if (so != NULL) {
 628                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 629
 630                 /*
 631                  * Increment the socket allocation statistics
 632                  */
 633                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
 634         }
 635
 636         return so;
 637 }
 638
 639 int
 640 socreate_internal(int dom, struct socket **aso, int type, int proto,
 641     struct proc *p, uint32_t flags, struct proc *ep)
 642 {
 643         struct protosw *prp;
 644         struct socket *so;
 645         int error = 0;
 646 #if defined(XNU_TARGET_OS_OSX)
 647         pid_t rpid = -1;
 648 #endif
 649
 650 #if TCPDEBUG
 651         extern int tcpconsdebug;
 652 #endif
 653
 654         VERIFY(aso != NULL);
 655         *aso = NULL;
 656
 657         if (proto != 0) {
 658                 prp = pffindproto(dom, proto, type);
 659         } else {
 660                 prp = pffindtype(dom, type);
 661         }
 662
 663         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 664                 if (pffinddomain(dom) == NULL) {
 665                         return EAFNOSUPPORT;
 666                 }
 667                 if (proto != 0) {
 668                         if (pffindprotonotype(dom, proto) != NULL) {
 669                                 return EPROTOTYPE;
 670                         }
 671                 }
 672                 return EPROTONOSUPPORT;
 673         }
 674         if (prp->pr_type != type) {
 675                 return EPROTOTYPE;
 676         }
 677         so = soalloc(1, dom, type);
 678         if (so == NULL) {
 679                 return ENOBUFS;
 680         }
 681
 682         switch (dom) {
 683         case PF_LOCAL:
 684                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
 685                 break;
 686         case PF_INET:
 687                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
 688                 if (type == SOCK_STREAM) {
 689                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
 690                 } else {
 691                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
 692                 }
 693                 break;
 694         case PF_ROUTE:
 695                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
 696                 break;
 697         case PF_NDRV:
 698                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
 699                 break;
 700         case PF_KEY:
 701                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
 702                 break;
 703         case PF_INET6:
 704                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
 705                 if (type == SOCK_STREAM) {
 706                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
 707                 } else {
 708                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
 709                 }
 710                 break;
 711         case PF_SYSTEM:
 712                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
 713                 break;
 714         case PF_MULTIPATH:
 715                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
 716                 break;
 717         default:
 718                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
 719                 break;
 720         }
 721
 722         if (flags & SOCF_MPTCP) {
 723                 so->so_state |= SS_NBIO;
 724         }
 725
 726         TAILQ_INIT(&so->so_incomp);
 727         TAILQ_INIT(&so->so_comp);
 728         so->so_type = type;
 729         so->last_upid = proc_uniqueid(p);
 730         so->last_pid = proc_pid(p);
 731         proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
 732         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 733
 734         if (ep != PROC_NULL && ep != p) {
 735                 so->e_upid = proc_uniqueid(ep);
 736                 so->e_pid = proc_pid(ep);
 737                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
 738                 so->so_flags |= SOF_DELEGATED;
 739 #if defined(XNU_TARGET_OS_OSX)
 740                 if (ep->p_responsible_pid != so->e_pid) {
 741                         rpid = ep->p_responsible_pid;
 742                 }
 743 #endif
 744         }
 745
 746 #if defined(XNU_TARGET_OS_OSX)
 747         if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
 748                 rpid = p->p_responsible_pid;
 749         }
 750
 751         so->so_rpid = -1;
 752         uuid_clear(so->so_ruuid);
 753         if (rpid >= 0) {
 754                 proc_t rp = proc_find(rpid);
 755                 if (rp != PROC_NULL) {
 756                         proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
 757                         so->so_rpid = rpid;
 758                         proc_rele(rp);
 759                 }
 760         }
 761 #endif
 762
 763         so->so_cred = kauth_cred_proc_ref(p);
 764         if (!suser(kauth_cred_get(), NULL)) {
 765                 so->so_state |= SS_PRIV;
 766         }
 767
 768         so->so_proto = prp;
 769         so->so_rcv.sb_flags |= SB_RECV;
 770         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 771         so->next_lock_lr = 0;
 772         so->next_unlock_lr = 0;
 773
 774         /*
 775          * Attachment will create the per pcb lock if necessary and
 776          * increase refcount for creation, make sure it's done before
 777          * socket is inserted in lists.
 778          */
 779         so->so_usecount++;
 780
 781         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 782         if (error != 0) {
 783                 /*
 784                  * Warning:
 785                  * If so_pcb is not zero, the socket will be leaked,
 786                  * so protocol attachment handler must be coded carefully
 787                  */
 788                 so->so_state |= SS_NOFDREF;
 789                 VERIFY(so->so_usecount > 0);
 790                 so->so_usecount--;
 791                 sofreelastref(so, 1);   /* will deallocate the socket */
 792                 return error;
 793         }
 794
 795         /*
 796          * Note: needs so_pcb to be set after pru_attach
 797          */
 798         if (prp->pr_update_last_owner != NULL) {
 799                 (*prp->pr_update_last_owner)(so, p, ep);
 800         }
 801
 802         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 803
 804         /* Attach socket filters for this protocol */
 805         sflt_initsock(so);
 806 #if TCPDEBUG
 807         if (tcpconsdebug == 2) {
 808                 so->so_options |= SO_DEBUG;
 809         }
 810 #endif
 811         so_set_default_traffic_class(so);
 812
 813         /*
 814          * If this thread or task is marked to create backgrounded sockets,
 815          * mark the socket as background.
 816          */
 817         if (!(flags & SOCF_MPTCP) &&
 818             proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
 819                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 820                 so->so_background_thread = current_thread();
 821         }
 822
 823         switch (dom) {
 824         /*
 825          * Don't mark Unix domain or system
 826          * eligible for defunct by default.
 827          */
 828         case PF_LOCAL:
 829         case PF_SYSTEM:
 830                 so->so_flags |= SOF_NODEFUNCT;
 831                 break;
 832         default:
 833                 break;
 834         }
 835
 836         /*
 837          * Entitlements can't be checked at socket creation time except if the
 838          * application requested a feature guarded by a privilege (c.f., socket
 839          * delegation).
 840          * The priv(9) and the Sandboxing APIs are designed with the idea that
 841          * a privilege check should only be triggered by a userland request.
 842          * A privilege check at socket creation time is time consuming and
 843          * could trigger many authorisation error messages from the security
 844          * APIs.
 845          */
 846
 847         *aso = so;
 848
 849         return 0;
 850 }
 851
 852 /*
 853  * Returns:     0                       Success
 854  *              EAFNOSUPPORT
 855  *              EPROTOTYPE
 856  *              EPROTONOSUPPORT
 857  *              ENOBUFS
 858  *      <pru_attach>:ENOBUFS[AF_UNIX]
 859  *      <pru_attach>:ENOBUFS[TCP]
 860  *      <pru_attach>:ENOMEM[TCP]
 861  *      <pru_attach>:???                [other protocol families, IPSEC]
 862  */
 863 int
 864 socreate(int dom, struct socket **aso, int type, int proto)
 865 {
 866         return socreate_internal(dom, aso, type, proto, current_proc(), 0,
 867                    PROC_NULL);
 868 }
 869
 870 int
 871 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 872 {
 873         int error = 0;
 874         struct proc *ep = PROC_NULL;
 875
 876         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 877                 error = ESRCH;
 878                 goto done;
 879         }
 880
 881         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 882
 883         /*
 884          * It might not be wise to hold the proc reference when calling
 885          * socreate_internal since it calls soalloc with M_WAITOK
 886          */
 887 done:
 888         if (ep != PROC_NULL) {
 889                 proc_rele(ep);
 890         }
 891
 892         return error;
 893 }
 894
 895 /*
 896  * Returns:     0                       Success
 897  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 898  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 899  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 900  *      <pru_bind>:EINVAL               Invalid argument
 901  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 902  *      <pru_bind>:EACCES               Permission denied
 903  *      <pru_bind>:EADDRINUSE           Address in use
 904  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 905  *      <pru_bind>:EPERM                Operation not permitted
 906  *      <pru_bind>:???
 907  *      <sf_bind>:???
 908  *
 909  * Notes:       It's not possible to fully enumerate the return codes above,
 910  *              since socket filter authors and protocol family authors may
 911  *              not choose to limit their error returns to those listed, even
 912  *              though this may result in some software operating incorrectly.
 913  *
 914  *              The error codes which are enumerated above are those known to
 915  *              be returned by the tcp_usr_bind function supplied.
 916  */
 917 int
 918 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 919 {
 920         struct proc *p = current_proc();
 921         int error = 0;
 922
 923         if (dolock) {
 924                 socket_lock(so, 1);
 925         }
 926
 927         so_update_last_owner_locked(so, p);
 928         so_update_policy(so);
 929
 930 #if NECP
 931         so_update_necp_policy(so, nam, NULL);
 932 #endif /* NECP */
 933
 934         /*
 935          * If this is a bind request on a socket that has been marked
 936          * as inactive, reject it now before we go any further.
 937          */
 938         if (so->so_flags & SOF_DEFUNCT) {
 939                 error = EINVAL;
 940                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 941                     __func__, proc_pid(p), proc_best_name(p),
 942                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 943                     SOCK_DOM(so), SOCK_TYPE(so), error);
 944                 goto out;
 945         }
 946
 947         /* Socket filter */
 948         error = sflt_bind(so, nam);
 949
 950         if (error == 0) {
 951                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 952         }
 953 out:
 954         if (dolock) {
 955                 socket_unlock(so, 1);
 956         }
 957
 958         if (error == EJUSTRETURN) {
 959                 error = 0;
 960         }
 961
 962         return error;
 963 }
 964
 965 void
 966 sodealloc(struct socket *so)
 967 {
 968         kauth_cred_unref(&so->so_cred);
 969
 970         /* Remove any filters */
 971         sflt_termsock(so);
 972
 973 #if CONTENT_FILTER
 974         cfil_sock_detach(so);
 975 #endif /* CONTENT_FILTER */
 976
 977         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 978
 979         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 980                 cached_sock_free(so);
 981         } else {
 982                 zfree(socket_zone, so);
 983         }
 984 }
 985
 986 /*
 987  * Returns:     0                       Success
 988  *              EINVAL
 989  *              EOPNOTSUPP
 990  *      <pru_listen>:EINVAL[AF_UNIX]
 991  *      <pru_listen>:EINVAL[TCP]
 992  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 993  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 994  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 995  *      <pru_listen>:EACCES[TCP]        Permission denied
 996  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 997  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 998  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 999  *      <sf_listen>:???
1000  *
1001  * Notes:       Other <pru_listen> returns depend on the protocol family; all
1002  *              <sf_listen> returns depend on what the filter author causes
1003  *              their filter to return.
1004  */
1005 int
1006 solisten(struct socket *so, int backlog)
1007 {
1008         struct proc *p = current_proc();
1009         int error = 0;
1010
1011         socket_lock(so, 1);
1012
1013         so_update_last_owner_locked(so, p);
1014         so_update_policy(so);
1015
1016 #if NECP
1017         so_update_necp_policy(so, NULL, NULL);
1018 #endif /* NECP */
1019
1020         if (so->so_proto == NULL) {
1021                 error = EINVAL;
1022                 goto out;
1023         }
1024         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1025                 error = EOPNOTSUPP;
1026                 goto out;
1027         }
1028
1029         /*
1030          * If the listen request is made on a socket that is not fully
1031          * disconnected, or on a socket that has been marked as inactive,
1032          * reject the request now.
1033          */
1034         if ((so->so_state &
1035             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1036             (so->so_flags & SOF_DEFUNCT)) {
1037                 error = EINVAL;
1038                 if (so->so_flags & SOF_DEFUNCT) {
1039                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1040                             "(%d)\n", __func__, proc_pid(p),
1041                             proc_best_name(p),
1042                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1043                             SOCK_DOM(so), SOCK_TYPE(so), error);
1044                 }
1045                 goto out;
1046         }
1047
1048         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1049                 error = EPERM;
1050                 goto out;
1051         }
1052
1053         error = sflt_listen(so);
1054         if (error == 0) {
1055                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1056         }
1057
1058         if (error) {
1059                 if (error == EJUSTRETURN) {
1060                         error = 0;
1061                 }
1062                 goto out;
1063         }
1064
1065         if (TAILQ_EMPTY(&so->so_comp)) {
1066                 so->so_options |= SO_ACCEPTCONN;
1067         }
1068         /*
1069          * POSIX: The implementation may have an upper limit on the length of
1070          * the listen queue-either global or per accepting socket. If backlog
1071          * exceeds this limit, the length of the listen queue is set to the
1072          * limit.
1073          *
1074          * If listen() is called with a backlog argument value that is less
1075          * than 0, the function behaves as if it had been called with a backlog
1076          * argument value of 0.
1077          *
1078          * A backlog argument of 0 may allow the socket to accept connections,
1079          * in which case the length of the listen queue may be set to an
1080          * implementation-defined minimum value.
1081          */
1082         if (backlog <= 0 || backlog > somaxconn) {
1083                 backlog = somaxconn;
1084         }
1085
1086         so->so_qlimit = backlog;
1087 out:
1088         socket_unlock(so, 1);
1089         return error;
1090 }
1091
1092 /*
1093  * The "accept list lock" protects the fields related to the listener queues
1094  * because we can unlock a socket to respect the lock ordering between
1095  * the listener socket and its clients sockets. The lock ordering is first to
1096  * acquire the client socket before the listener socket.
1097  *
1098  * The accept list lock serializes access to the following fields:
1099  * - of the listener socket:
1100  *   - so_comp
1101  *   - so_incomp
1102  *   - so_qlen
1103  *   - so_inqlen
1104  * - of client sockets that are in so_comp or so_incomp:
1105  *   - so_head
1106  *   - so_list
1107  *
1108  * As one can see the accept list lock protects the consistent of the
1109  * linkage of the client sockets.
1110  *
1111  * Note that those fields may be read without holding the accept list lock
1112  * for a preflight provided the accept list lock is taken when committing
1113  * to take an action based on the result of the preflight. The preflight
1114  * saves the cost of doing the unlock/lock dance.
1115  */
1116 void
1117 so_acquire_accept_list(struct socket *head, struct socket *so)
1118 {
1119         lck_mtx_t *mutex_held;
1120
1121         if (head->so_proto->pr_getlock == NULL) {
1122                 return;
1123         }
1124         mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1125         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1126
1127         if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1128                 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1129                 return;
1130         }
1131         if (so != NULL) {
1132                 socket_unlock(so, 0);
1133         }
1134         while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1135                 so_accept_list_waits += 1;
1136                 msleep((caddr_t)&head->so_incomp, mutex_held,
1137                     PSOCK | PCATCH, __func__, NULL);
1138         }
1139         head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1140         if (so != NULL) {
1141                 socket_unlock(head, 0);
1142                 socket_lock(so, 0);
1143                 socket_lock(head, 0);
1144         }
1145 }
1146
1147 void
1148 so_release_accept_list(struct socket *head)
1149 {
1150         if (head->so_proto->pr_getlock != NULL) {
1151                 lck_mtx_t *mutex_held;
1152
1153                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1154                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1155
1156                 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1157                 wakeup((caddr_t)&head->so_incomp);
1158         }
1159 }
1160
1161 void
1162 sofreelastref(struct socket *so, int dealloc)
1163 {
1164         struct socket *head = so->so_head;
1165
1166         /* Assume socket is locked */
1167
1168         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1169                 selthreadclear(&so->so_snd.sb_sel);
1170                 selthreadclear(&so->so_rcv.sb_sel);
1171                 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1172                 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1173                 so->so_event = sonullevent;
1174                 return;
1175         }
1176         if (head != NULL) {
1177                 /*
1178                  * Need to lock the listener when the protocol has
1179                  * per socket locks
1180                  */
1181                 if (head->so_proto->pr_getlock != NULL) {
1182                         socket_lock(head, 1);
1183                         so_acquire_accept_list(head, so);
1184                 }
1185                 if (so->so_state & SS_INCOMP) {
1186                         so->so_state &= ~SS_INCOMP;
1187                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1188                         head->so_incqlen--;
1189                         head->so_qlen--;
1190                         so->so_head = NULL;
1191
1192                         if (head->so_proto->pr_getlock != NULL) {
1193                                 so_release_accept_list(head);
1194                                 socket_unlock(head, 1);
1195                         }
1196                 } else if (so->so_state & SS_COMP) {
1197                         if (head->so_proto->pr_getlock != NULL) {
1198                                 so_release_accept_list(head);
1199                                 socket_unlock(head, 1);
1200                         }
1201                         /*
1202                          * We must not decommission a socket that's
1203                          * on the accept(2) queue.  If we do, then
1204                          * accept(2) may hang after select(2) indicated
1205                          * that the listening socket was ready.
1206                          */
1207                         selthreadclear(&so->so_snd.sb_sel);
1208                         selthreadclear(&so->so_rcv.sb_sel);
1209                         so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1210                         so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1211                         so->so_event = sonullevent;
1212                         return;
1213                 } else {
1214                         if (head->so_proto->pr_getlock != NULL) {
1215                                 so_release_accept_list(head);
1216                                 socket_unlock(head, 1);
1217                         }
1218                         printf("sofree: not queued\n");
1219                 }
1220         }
1221         sowflush(so);
1222         sorflush(so);
1223
1224 #if FLOW_DIVERT
1225         if (so->so_flags & SOF_FLOW_DIVERT) {
1226                 flow_divert_detach(so);
1227         }
1228 #endif  /* FLOW_DIVERT */
1229
1230         /* 3932268: disable upcall */
1231         so->so_rcv.sb_flags &= ~SB_UPCALL;
1232         so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1233         so->so_event = sonullevent;
1234
1235         if (dealloc) {
1236                 sodealloc(so);
1237         }
1238 }
1239
1240 void
1241 soclose_wait_locked(struct socket *so)
1242 {
1243         lck_mtx_t *mutex_held;
1244
1245         if (so->so_proto->pr_getlock != NULL) {
1246                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1247         } else {
1248                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1249         }
1250         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1251
1252         /*
1253          * Double check here and return if there's no outstanding upcall;
1254          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1255          */
1256         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1257                 return;
1258         }
1259         so->so_rcv.sb_flags &= ~SB_UPCALL;
1260         so->so_snd.sb_flags &= ~SB_UPCALL;
1261         so->so_flags |= SOF_CLOSEWAIT;
1262
1263         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1264             "soclose_wait_locked", NULL);
1265         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1266         so->so_flags &= ~SOF_CLOSEWAIT;
1267 }
1268
1269 /*
1270  * Close a socket on last file table reference removal.
1271  * Initiate disconnect if connected.
1272  * Free socket when disconnect complete.
1273  */
1274 int
1275 soclose_locked(struct socket *so)
1276 {
1277         int error = 0;
1278         struct timespec ts;
1279
1280         if (so->so_usecount == 0) {
1281                 panic("soclose: so=%p refcount=0\n", so);
1282                 /* NOTREACHED */
1283         }
1284
1285         sflt_notify(so, sock_evt_closing, NULL);
1286
1287         if (so->so_upcallusecount) {
1288                 soclose_wait_locked(so);
1289         }
1290
1291 #if CONTENT_FILTER
1292         /*
1293          * We have to wait until the content filters are done
1294          */
1295         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1296                 cfil_sock_close_wait(so);
1297                 cfil_sock_is_closed(so);
1298                 cfil_sock_detach(so);
1299         }
1300 #endif /* CONTENT_FILTER */
1301
1302         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1303                 soresume(current_proc(), so, 1);
1304                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1305         }
1306
1307         if ((so->so_options & SO_ACCEPTCONN)) {
1308                 struct socket *sp, *sonext;
1309                 int persocklock = 0;
1310                 int incomp_overflow_only;
1311
1312                 /*
1313                  * We do not want new connection to be added
1314                  * to the connection queues
1315                  */
1316                 so->so_options &= ~SO_ACCEPTCONN;
1317
1318                 /*
1319                  * We can drop the lock on the listener once
1320                  * we've acquired the incoming list
1321                  */
1322                 if (so->so_proto->pr_getlock != NULL) {
1323                         persocklock = 1;
1324                         so_acquire_accept_list(so, NULL);
1325                         socket_unlock(so, 0);
1326                 }
1327 again:
1328                 incomp_overflow_only = 1;
1329
1330                 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1331                         /*
1332                          * Radar 5350314
1333                          * skip sockets thrown away by tcpdropdropblreq
1334                          * they will get cleanup by the garbage collection.
1335                          * otherwise, remove the incomp socket from the queue
1336                          * and let soabort trigger the appropriate cleanup.
1337                          */
1338                         if (sp->so_flags & SOF_OVERFLOW) {
1339                                 continue;
1340                         }
1341
1342                         if (persocklock != 0) {
1343                                 socket_lock(sp, 1);
1344                         }
1345
1346                         /*
1347                          * Radar 27945981
1348                          * The extra reference for the list insure the
1349                          * validity of the socket pointer when we perform the
1350                          * unlock of the head above
1351                          */
1352                         if (sp->so_state & SS_INCOMP) {
1353                                 sp->so_state &= ~SS_INCOMP;
1354                                 sp->so_head = NULL;
1355                                 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1356                                 so->so_incqlen--;
1357                                 so->so_qlen--;
1358
1359                                 (void) soabort(sp);
1360                         } else {
1361                                 panic("%s sp %p in so_incomp but !SS_INCOMP",
1362                                     __func__, sp);
1363                         }
1364
1365                         if (persocklock != 0) {
1366                                 socket_unlock(sp, 1);
1367                         }
1368                 }
1369
1370                 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1371                         /* Dequeue from so_comp since sofree() won't do it */
1372                         if (persocklock != 0) {
1373                                 socket_lock(sp, 1);
1374                         }
1375
1376                         if (sp->so_state & SS_COMP) {
1377                                 sp->so_state &= ~SS_COMP;
1378                                 sp->so_head = NULL;
1379                                 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1380                                 so->so_qlen--;
1381
1382                                 (void) soabort(sp);
1383                         } else {
1384                                 panic("%s sp %p in so_comp but !SS_COMP",
1385                                     __func__, sp);
1386                         }
1387
1388                         if (persocklock) {
1389                                 socket_unlock(sp, 1);
1390                         }
1391                 }
1392
1393                 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1394 #if (DEBUG | DEVELOPMENT)
1395                         panic("%s head %p so_comp not empty\n", __func__, so);
1396 #endif /* (DEVELOPMENT || DEBUG) */
1397
1398                         goto again;
1399                 }
1400
1401                 if (!TAILQ_EMPTY(&so->so_comp)) {
1402 #if (DEBUG | DEVELOPMENT)
1403                         panic("%s head %p so_comp not empty\n", __func__, so);
1404 #endif /* (DEVELOPMENT || DEBUG) */
1405
1406                         goto again;
1407                 }
1408
1409                 if (persocklock) {
1410                         socket_lock(so, 0);
1411                         so_release_accept_list(so);
1412                 }
1413         }
1414         if (so->so_pcb == NULL) {
1415                 /* 3915887: mark the socket as ready for dealloc */
1416                 so->so_flags |= SOF_PCBCLEARING;
1417                 goto discard;
1418         }
1419         if (so->so_state & SS_ISCONNECTED) {
1420                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1421                         error = sodisconnectlocked(so);
1422                         if (error) {
1423                                 goto drop;
1424                         }
1425                 }
1426                 if (so->so_options & SO_LINGER) {
1427                         lck_mtx_t *mutex_held;
1428
1429                         if ((so->so_state & SS_ISDISCONNECTING) &&
1430                             (so->so_state & SS_NBIO)) {
1431                                 goto drop;
1432                         }
1433                         if (so->so_proto->pr_getlock != NULL) {
1434                                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1435                         } else {
1436                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1437                         }
1438                         while (so->so_state & SS_ISCONNECTED) {
1439                                 ts.tv_sec = (so->so_linger / 100);
1440                                 ts.tv_nsec = (so->so_linger % 100) *
1441                                     NSEC_PER_USEC * 1000 * 10;
1442                                 error = msleep((caddr_t)&so->so_timeo,
1443                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1444                                 if (error) {
1445                                         /*
1446                                          * It's OK when the time fires,
1447                                          * don't report an error
1448                                          */
1449                                         if (error == EWOULDBLOCK) {
1450                                                 error = 0;
1451                                         }
1452                                         break;
1453                                 }
1454                         }
1455                 }
1456         }
1457 drop:
1458         if (so->so_usecount == 0) {
1459                 panic("soclose: usecount is zero so=%p\n", so);
1460                 /* NOTREACHED */
1461         }
1462         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1463                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1464                 if (error == 0) {
1465                         error = error2;
1466                 }
1467         }
1468         if (so->so_usecount <= 0) {
1469                 panic("soclose: usecount is zero so=%p\n", so);
1470                 /* NOTREACHED */
1471         }
1472 discard:
1473         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1474             (so->so_state & SS_NOFDREF)) {
1475                 panic("soclose: NOFDREF");
1476                 /* NOTREACHED */
1477         }
1478         so->so_state |= SS_NOFDREF;
1479
1480         if ((so->so_flags & SOF_KNOTE) != 0) {
1481                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1482         }
1483
1484         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1485
1486         VERIFY(so->so_usecount > 0);
1487         so->so_usecount--;
1488         sofree(so);
1489         return error;
1490 }
1491
1492 int
1493 soclose(struct socket *so)
1494 {
1495         int error = 0;
1496         socket_lock(so, 1);
1497
1498         if (so->so_retaincnt == 0) {
1499                 error = soclose_locked(so);
1500         } else {
1501                 /*
1502                  * if the FD is going away, but socket is
1503                  * retained in kernel remove its reference
1504                  */
1505                 so->so_usecount--;
1506                 if (so->so_usecount < 2) {
1507                         panic("soclose: retaincnt non null and so=%p "
1508                             "usecount=%d\n", so, so->so_usecount);
1509                 }
1510         }
1511         socket_unlock(so, 1);
1512         return error;
1513 }
1514
1515 /*
1516  * Must be called at splnet...
1517  */
1518 /* Should already be locked */
1519 int
1520 soabort(struct socket *so)
1521 {
1522         int error;
1523
1524 #ifdef MORE_LOCKING_DEBUG
1525         lck_mtx_t *mutex_held;
1526
1527         if (so->so_proto->pr_getlock != NULL) {
1528                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1529         } else {
1530                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1531         }
1532         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1533 #endif
1534
1535         if ((so->so_flags & SOF_ABORTED) == 0) {
1536                 so->so_flags |= SOF_ABORTED;
1537                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1538                 if (error) {
1539                         sofree(so);
1540                         return error;
1541                 }
1542         }
1543         return 0;
1544 }
1545
1546 int
1547 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1548 {
1549         int error;
1550
1551         if (dolock) {
1552                 socket_lock(so, 1);
1553         }
1554
1555         so_update_last_owner_locked(so, PROC_NULL);
1556         so_update_policy(so);
1557 #if NECP
1558         so_update_necp_policy(so, NULL, NULL);
1559 #endif /* NECP */
1560
1561         if ((so->so_state & SS_NOFDREF) == 0) {
1562                 panic("soaccept: !NOFDREF");
1563         }
1564         so->so_state &= ~SS_NOFDREF;
1565         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1566
1567         if (dolock) {
1568                 socket_unlock(so, 1);
1569         }
1570         return error;
1571 }
1572
/*
 * soaccept
 *
 * Locking variant of soacceptlock(): same operation, with the socket
 * lock taken by the callee.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	const int take_lock = 1;

	return soacceptlock(so, nam, take_lock);
}
1578
1579 int
1580 soacceptfilter(struct socket *so, struct socket *head)
1581 {
1582         struct sockaddr *local = NULL, *remote = NULL;
1583         int error = 0;
1584
1585         /*
1586          * Hold the lock even if this socket has not been made visible
1587          * to the filter(s).  For sockets with global locks, this protects
1588          * against the head or peer going away
1589          */
1590         socket_lock(so, 1);
1591         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1592             sogetaddr_locked(so, &local, 0) != 0) {
1593                 so->so_state &= ~SS_NOFDREF;
1594                 socket_unlock(so, 1);
1595                 soclose(so);
1596                 /* Out of resources; try it again next time */
1597                 error = ECONNABORTED;
1598                 goto done;
1599         }
1600
1601         error = sflt_accept(head, so, local, remote);
1602
1603         /*
1604          * If we get EJUSTRETURN from one of the filters, mark this socket
1605          * as inactive and return it anyway.  This newly accepted socket
1606          * will be disconnected later before we hand it off to the caller.
1607          */
1608         if (error == EJUSTRETURN) {
1609                 error = 0;
1610                 (void) sosetdefunct(current_proc(), so,
1611                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1612         }
1613
1614         if (error != 0) {
1615                 /*
1616                  * This may seem like a duplication to the above error
1617                  * handling part when we return ECONNABORTED, except
1618                  * the following is done while holding the lock since
1619                  * the socket has been exposed to the filter(s) earlier.
1620                  */
1621                 so->so_state &= ~SS_NOFDREF;
1622                 socket_unlock(so, 1);
1623                 soclose(so);
1624                 /* Propagate socket filter's error code to the caller */
1625         } else {
1626                 socket_unlock(so, 1);
1627         }
1628 done:
1629         /* Callee checks for NULL pointer */
1630         sock_freeaddr(remote);
1631         sock_freeaddr(local);
1632         return error;
1633 }
1634
1635 /*
1636  * Returns:     0                       Success
1637  *              EOPNOTSUPP              Operation not supported on socket
1638  *              EISCONN                 Socket is connected
1639  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1640  *      <pru_connect>:EINVAL            Invalid argument
1641  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1642  *      <pru_connect>:EACCES            Permission denied
1643  *      <pru_connect>:EADDRINUSE        Address in use
1644  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1645  *      <pru_connect>:EPERM             Operation not permitted
1646  *      <sf_connect_out>:???            [anything a filter writer might set]
1647  */
1648 int
1649 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1650 {
1651         int error;
1652         struct proc *p = current_proc();
1653
1654         if (dolock) {
1655                 socket_lock(so, 1);
1656         }
1657
1658         so_update_last_owner_locked(so, p);
1659         so_update_policy(so);
1660
1661 #if NECP
1662         so_update_necp_policy(so, NULL, nam);
1663 #endif /* NECP */
1664
1665         /*
1666          * If this is a listening socket or if this is a previously-accepted
1667          * socket that has been marked as inactive, reject the connect request.
1668          */
1669         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1670                 error = EOPNOTSUPP;
1671                 if (so->so_flags & SOF_DEFUNCT) {
1672                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1673                             "(%d)\n", __func__, proc_pid(p),
1674                             proc_best_name(p),
1675                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1676                             SOCK_DOM(so), SOCK_TYPE(so), error);
1677                 }
1678                 if (dolock) {
1679                         socket_unlock(so, 1);
1680                 }
1681                 return error;
1682         }
1683
1684         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1685                 if (dolock) {
1686                         socket_unlock(so, 1);
1687                 }
1688                 return EPERM;
1689         }
1690
1691         /*
1692          * If protocol is connection-based, can only connect once.
1693          * Otherwise, if connected, try to disconnect first.
1694          * This allows user to disconnect by connecting to, e.g.,
1695          * a null address.
1696          */
1697         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1698             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1699             (error = sodisconnectlocked(so)))) {
1700                 error = EISCONN;
1701         } else {
1702                 /*
1703                  * Run connect filter before calling protocol:
1704                  *  - non-blocking connect returns before completion;
1705                  */
1706                 error = sflt_connectout(so, nam);
1707                 if (error != 0) {
1708                         if (error == EJUSTRETURN) {
1709                                 error = 0;
1710                         }
1711                 } else {
1712                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1713                             (so, nam, p);
1714                         if (error != 0) {
1715                                 so->so_state &= ~SS_ISCONNECTING;
1716                         }
1717                 }
1718         }
1719         if (dolock) {
1720                 socket_unlock(so, 1);
1721         }
1722         return error;
1723 }
1724
/*
 * Connect with dolock = 1: forwards to soconnectlock(), which then
 * takes and releases the socket lock itself, and returns its result.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
        int error;

        error = soconnectlock(so, nam, 1);
        return error;
}
1730
1731 /*
1732  * Returns:     0                       Success
1733  *      <pru_connect2>:EINVAL[AF_UNIX]
1734  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1735  *      <pru_connect2>:???              [other protocol families]
1736  *
1737  * Notes:       <pru_connect2> is not supported by [TCP].
1738  */
1739 int
1740 soconnect2(struct socket *so1, struct socket *so2)
1741 {
1742         int error;
1743
1744         socket_lock(so1, 1);
1745         if (so2->so_proto->pr_lock) {
1746                 socket_lock(so2, 1);
1747         }
1748
1749         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1750
1751         socket_unlock(so1, 1);
1752         if (so2->so_proto->pr_lock) {
1753                 socket_unlock(so2, 1);
1754         }
1755         return error;
1756 }
1757
1758 int
1759 soconnectxlocked(struct socket *so, struct sockaddr *src,
1760     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1761     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1762     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1763 {
1764         int error;
1765
1766         so_update_last_owner_locked(so, p);
1767         so_update_policy(so);
1768
1769         /*
1770          * If this is a listening socket or if this is a previously-accepted
1771          * socket that has been marked as inactive, reject the connect request.
1772          */
1773         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1774                 error = EOPNOTSUPP;
1775                 if (so->so_flags & SOF_DEFUNCT) {
1776                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1777                             "(%d)\n", __func__, proc_pid(p),
1778                             proc_best_name(p),
1779                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1780                             SOCK_DOM(so), SOCK_TYPE(so), error);
1781                 }
1782                 return error;
1783         }
1784
1785         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1786                 return EPERM;
1787         }
1788
1789         /*
1790          * If protocol is connection-based, can only connect once
1791          * unless PR_MULTICONN is set.  Otherwise, if connected,
1792          * try to disconnect first.  This allows user to disconnect
1793          * by connecting to, e.g., a null address.
1794          */
1795         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1796             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1797             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1798             (error = sodisconnectlocked(so)) != 0)) {
1799                 error = EISCONN;
1800         } else {
1801                 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1802                     (flags & CONNECT_DATA_IDEMPOTENT)) {
1803                         so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1804
1805                         if (flags & CONNECT_DATA_AUTHENTICATED) {
1806                                 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1807                         }
1808                 }
1809
1810                 /*
1811                  * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1812                  * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1813                  * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1814                  * Case 3 allows user to combine write with connect even if they have
1815                  * no use for TFO (such as regular TCP, and UDP).
1816                  * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1817                  */
1818                 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1819                     ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1820                         so->so_flags1 |= SOF1_PRECONNECT_DATA;
1821                 }
1822
1823                 /*
1824                  * If a user sets data idempotent and does not pass an uio, or
1825                  * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
1826                  * SOF1_DATA_IDEMPOTENT.
1827                  */
1828                 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1829                     (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1830                         /* We should return EINVAL instead perhaps. */
1831                         so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1832                 }
1833
1834                 /*
1835                  * Run connect filter before calling protocol:
1836                  *  - non-blocking connect returns before completion;
1837                  */
1838                 error = sflt_connectout(so, dst);
1839                 if (error != 0) {
1840                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1841                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1842                         if (error == EJUSTRETURN) {
1843                                 error = 0;
1844                         }
1845                 } else {
1846                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1847                             (so, src, dst, p, ifscope, aid, pcid,
1848                             flags, arg, arglen, auio, bytes_written);
1849                         if (error != 0) {
1850                                 so->so_state &= ~SS_ISCONNECTING;
1851                                 if (error != EINPROGRESS) {
1852                                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1853                                 }
1854                         }
1855                 }
1856         }
1857
1858         return error;
1859 }
1860
1861 int
1862 sodisconnectlocked(struct socket *so)
1863 {
1864         int error;
1865
1866         if ((so->so_state & SS_ISCONNECTED) == 0) {
1867                 error = ENOTCONN;
1868                 goto bad;
1869         }
1870         if (so->so_state & SS_ISDISCONNECTING) {
1871                 error = EALREADY;
1872                 goto bad;
1873         }
1874
1875         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1876         if (error == 0) {
1877                 sflt_notify(so, sock_evt_disconnected, NULL);
1878         }
1879
1880 bad:
1881         return error;
1882 }
1883
/*
 * Locking version of sodisconnectlocked(): takes the socket lock,
 * performs the disconnect, drops the lock and returns the result.
 */
int
sodisconnect(struct socket *so)
{
        int rc;

        socket_lock(so, 1);
        rc = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return rc;
}
1895
1896 int
1897 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1898 {
1899         int error;
1900
1901         /*
1902          * Call the protocol disconnectx handler; let it handle all
1903          * matters related to the connection state of this session.
1904          */
1905         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1906         if (error == 0) {
1907                 /*
1908                  * The event applies only for the session, not for
1909                  * the disconnection of individual subflows.
1910                  */
1911                 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1912                         sflt_notify(so, sock_evt_disconnected, NULL);
1913                 }
1914         }
1915         return error;
1916 }
1917
1918 int
1919 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1920 {
1921         int error;
1922
1923         socket_lock(so, 1);
1924         error = sodisconnectxlocked(so, aid, cid);
1925         socket_unlock(so, 1);
1926         return error;
1927 }
1928
/*
 * Map message flags "f" to the wait argument handed to sblock():
 * 0 when MSG_DONTWAIT is set, SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1930
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Parameters:
 *      so              Socket whose send buffer (so_snd) is checked
 *      addr            Destination address; only tested against 0 when
 *                      the socket is not connected and the protocol is
 *                      not PR_CONNREQUIRED
 *      resid           Number of data bytes the caller wants to send
 *      clen            Length of the control data
 *      atomic          Nonzero if the data must go all at once
 *      flags           MSG_* flags; MSG_DONTWAIT, MSG_OOB and MSG_NBIO
 *                      are examined here
 *      sblocked        In/out.  If 0 on entry, the send buffer is locked
 *                      with sblock() and *sblocked is set to 1.  It is
 *                      reset to 0 whenever the buffer is unlocked here
 *                      in order to wait for space.  It is left at 0 when
 *                      the call is recognized as a recursive call from a
 *                      send filter thread.
 *
 * Returns:     0                       Success
 *              EPIPE
 *              ENOTCONN
 *              EDESTADDRREQ
 *              EMSGSIZE
 *              EWOULDBLOCK
 *      sblock:EWOULDBLOCK
 *      sblock:EINTR
 *      sbwait:EBADF
 *      sbwait:EINTR
 *      [so_error]:???
 *
 * Notes:       Error returns do not unlock the send buffer; *sblocked
 *              tells the caller whether it is still locked.
 *              NOTE(review): presumably the caller unlocks based on
 *              *sblocked -- confirm against callers.
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
        int     error = 0;
        int32_t space;
        int     assumelock = 0;

restart:
        if (*sblocked == 0) {
                if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
                    so->so_send_filt_thread != 0 &&
                    so->so_send_filt_thread == current_thread()) {
                        /*
                         * We're being called recursively from a filter,
                         * allow this to continue. Radar 4150520.
                         * Don't set sblocked because we don't want
                         * to perform an unlock later.
                         */
                        assumelock = 1;
                } else {
                        error = sblock(&so->so_snd, SBLOCKWAIT(flags));
                        if (error) {
                                /*
                                 * A defunct socket reports EPIPE rather
                                 * than the sblock() error.
                                 */
                                if (so->so_flags & SOF_DEFUNCT) {
                                        goto defunct;
                                }
                                return error;
                        }
                        *sblocked = 1;
                }
        }

        /*
         * If a send attempt is made on a socket that has been marked
         * as inactive (disconnected), reject the request.
         */
        if (so->so_flags & SOF_DEFUNCT) {
                /*
                 * This label sits inside the if-body so that the sblock()
                 * and sbwait() failure paths share the logging and the
                 * EPIPE return.
                 */
defunct:
                error = EPIPE;
                SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
                    __func__, proc_selfpid(), proc_best_name(current_proc()),
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                    SOCK_DOM(so), SOCK_TYPE(so), error);
                return error;
        }

        if (so->so_state & SS_CANTSENDMORE) {
                /*
                 * With CONTENT_FILTER enabled, the trailing "else" below
                 * binds to the "return EPIPE" after the #endif; without
                 * it, EPIPE is returned unconditionally.
                 */
#if CONTENT_FILTER
                /*
                 * Can re-inject data of half closed connections
                 */
                if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
                    so->so_snd.sb_cfil_thread == current_thread() &&
                    cfil_sock_data_pending(&so->so_snd) != 0) {
                        CFIL_LOG(LOG_INFO,
                            "so %llx ignore SS_CANTSENDMORE",
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
                } else
#endif /* CONTENT_FILTER */
                return EPIPE;
        }
        /* A pending socket error is reported once and then cleared. */
        if (so->so_error) {
                error = so->so_error;
                so->so_error = 0;
                return error;
        }

        if ((so->so_state & SS_ISCONNECTED) == 0) {
                if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
                        /*
                         * Not connected on a connection-based protocol:
                         * ENOTCONN unless the socket is confirming, the
                         * request is control-only (resid == 0 with
                         * clen != 0), or pre-connect data is enabled.
                         */
                        if (((so->so_state & SS_ISCONFIRMING) == 0) &&
                            (resid != 0 || clen == 0) &&
                            !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
                                return ENOTCONN;
                        }
                } else if (addr == 0) {
                        /*
                         * NOTE(review): PR_CONNREQUIRED is known to be
                         * clear on this branch, so this expression can
                         * only evaluate to EDESTADDRREQ.
                         */
                        return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
                               ENOTCONN : EDESTADDRREQ;
                }
        }

        space = sbspace(&so->so_snd);

        /* Out-of-band sends are allowed 1024 bytes beyond sbspace(). */
        if (flags & MSG_OOB) {
                space += 1024;
        }
        /*
         * An atomic send or control data larger than the high-water
         * mark is rejected with EMSGSIZE.
         */
        if ((atomic && resid > so->so_snd.sb_hiwat) ||
            clen > so->so_snd.sb_hiwat) {
                return EMSGSIZE;
        }

        /*
         * Not enough room when the request does not fit and either it
         * is atomic, space is below the low-water mark, or the control
         * data alone does not fit.  Stream sockets additionally take
         * this path when so_wait_for_if_feedback() is nonzero.
         */
        if ((space < resid + clen &&
            (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
            space < clen)) ||
            (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
                /*
                 * don't block the connectx call when there's more data
                 * than can be copied.
                 */
                if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
                        if (space == 0) {
                                return EWOULDBLOCK;
                        }
                        if (space < (int32_t)so->so_snd.sb_lowat) {
                                return 0;
                        }
                }
                /*
                 * Non-blocking requests, and recursive calls from a send
                 * filter (which did not take the sockbuf lock here),
                 * return EWOULDBLOCK instead of waiting.
                 */
                if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
                    assumelock) {
                        return EWOULDBLOCK;
                }
                sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
                *sblocked = 0;
                error = sbwait(&so->so_snd);
                if (error) {
                        /*
                         * A defunct socket reports EPIPE rather than
                         * the sbwait() error.
                         */
                        if (so->so_flags & SOF_DEFUNCT) {
                                goto defunct;
                        }
                        return error;
                }
                /* Re-lock the send buffer and repeat every check. */
                goto restart;
        }
        return 0;
}
2066
2067 /*
2068  * Send on a socket.
2069  * If send must go all at once and message is larger than
2070  * send buffering, then hard error.
2071  * Lock against other senders.
2072  * If must go all at once and not enough room now, then
2073  * inform user that this would block and do nothing.
2074  * Otherwise, if nonblocking, send as much as possible.
2075  * The data to be sent is described by "uio" if nonzero,
2076  * otherwise by the mbuf chain "top" (which must be null
2077  * if uio is not).  Data provided in mbuf chain must be small
2078  * enough to send all at once.
2079  *
2080  * Returns nonzero on error, timeout or signal; callers
2081  * must check for short counts if EINTR/ERESTART are returned.
2082  * Data and control buffers are freed on return.
2083  *
2084  * Returns:     0                       Success
2085  *              EOPNOTSUPP
2086  *              EINVAL
2087  *              ENOBUFS
2088  *      uiomove:EFAULT
2089  *      sosendcheck:EPIPE
2090  *      sosendcheck:EWOULDBLOCK
2091  *      sosendcheck:EINTR
2092  *      sosendcheck:EBADF
2093  *      sosendcheck:EINTR
2094  *      sosendcheck:???                 [value from so_error]
2095  *      <pru_send>:ECONNRESET[TCP]
2096  *      <pru_send>:EINVAL[TCP]
2097  *      <pru_send>:ENOBUFS[TCP]
2098  *      <pru_send>:EADDRINUSE[TCP]
2099  *      <pru_send>:EADDRNOTAVAIL[TCP]
2100  *      <pru_send>:EAFNOSUPPORT[TCP]
2101  *      <pru_send>:EACCES[TCP]
2102  *      <pru_send>:EAGAIN[TCP]
2103  *      <pru_send>:EPERM[TCP]
2104  *      <pru_send>:EMSGSIZE[TCP]
2105  *      <pru_send>:EHOSTUNREACH[TCP]
2106  *      <pru_send>:ENETUNREACH[TCP]
2107  *      <pru_send>:ENETDOWN[TCP]
2108  *      <pru_send>:ENOMEM[TCP]
2109  *      <pru_send>:ENOBUFS[TCP]
2110  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
2111  *      <pru_send>:EINVAL[AF_UNIX]
2112  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
2113  *      <pru_send>:EPIPE[AF_UNIX]
2114  *      <pru_send>:ENOTCONN[AF_UNIX]
2115  *      <pru_send>:EISCONN[AF_UNIX]
2116  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
2117  *      <sf_data_out>:???               [whatever a filter author chooses]
2118  *
2119  * Notes:       Other <pru_send> returns depend on the protocol family; all
2120  *              <sf_data_out> returns depend on what the filter author causes
2121  *              their filter to return.
2122  */
2123 int
2124 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2125     struct mbuf *top, struct mbuf *control, int flags)
2126 {
2127         struct mbuf **mp;
2128         struct mbuf *m, *freelist = NULL;
2129         user_ssize_t space, len, resid, orig_resid;
2130         int clen = 0, error, dontroute, mlen, sendflags;
2131         int atomic = sosendallatonce(so) || top;
2132         int sblocked = 0;
2133         struct proc *p = current_proc();
2134         uint16_t headroom = 0;
2135         boolean_t en_tracing = FALSE;
2136
2137         if (uio != NULL) {
2138                 resid = uio_resid(uio);
2139         } else {
2140                 resid = top->m_pkthdr.len;
2141         }
2142
2143         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2144             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2145
2146         socket_lock(so, 1);
2147
        /*
         * Trace only if tracing is enabled, the socket is a network
         * (vs. unix) socket, and the interface is non-loopback.
         */
2152         if (ENTR_SHOULDTRACE &&
2153             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2154                 struct inpcb *inp = sotoinpcb(so);
2155                 if (inp->inp_last_outifp != NULL &&
2156                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2157                         en_tracing = TRUE;
2158                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2159                             VM_KERNEL_ADDRPERM(so),
2160                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2161                             (int64_t)resid);
2162                         orig_resid = resid;
2163                 }
2164         }
2165
2166         /*
2167          * Re-injection should not affect process accounting
2168          */
2169         if ((flags & MSG_SKIPCFIL) == 0) {
2170                 so_update_last_owner_locked(so, p);
2171                 so_update_policy(so);
2172
2173 #if NECP
2174                 so_update_necp_policy(so, NULL, addr);
2175 #endif /* NECP */
2176         }
2177
2178         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2179                 error = EOPNOTSUPP;
2180                 goto out_locked;
2181         }
2182
2183         /*
2184          * In theory resid should be unsigned.
2185          * However, space must be signed, as it might be less than 0
2186          * if we over-committed, and we must use a signed comparison
2187          * of space and resid.  On the other hand, a negative resid
2188          * causes us to loop sending 0-length segments to the protocol.
2189          *
2190          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2191          *
2192          * Note: We limit resid to be a positive int value as we use
2193          * imin() to set bytes_to_copy -- radr://14558484
2194          */
2195         if (resid < 0 || resid > INT_MAX ||
2196             (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2197                 error = EINVAL;
2198                 goto out_locked;
2199         }
2200
2201         dontroute = (flags & MSG_DONTROUTE) &&
2202             (so->so_options & SO_DONTROUTE) == 0 &&
2203             (so->so_proto->pr_flags & PR_ATOMIC);
2204         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2205
2206         if (control != NULL) {
2207                 clen = control->m_len;
2208         }
2209
2210         if (soreserveheadroom != 0) {
2211                 headroom = so->so_pktheadroom;
2212         }
2213
2214         do {
2215                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2216                     &sblocked);
2217                 if (error) {
2218                         goto out_locked;
2219                 }
2220
2221                 mp = &top;
2222                 space = sbspace(&so->so_snd) - clen;
2223                 space += ((flags & MSG_OOB) ? 1024 : 0);
2224
2225                 do {
2226                         if (uio == NULL) {
2227                                 /*
2228                                  * Data is prepackaged in "top".
2229                                  */
2230                                 resid = 0;
2231                                 if (flags & MSG_EOR) {
2232                                         top->m_flags |= M_EOR;
2233                                 }
2234                         } else {
2235                                 int chainlength;
2236                                 int bytes_to_copy;
2237                                 boolean_t jumbocl;
2238                                 boolean_t bigcl;
2239                                 int bytes_to_alloc;
2240
2241                                 bytes_to_copy = imin(resid, space);
2242
2243                                 bytes_to_alloc = bytes_to_copy;
2244                                 if (top == NULL) {
2245                                         bytes_to_alloc += headroom;
2246                                 }
2247
2248                                 if (sosendminchain > 0) {
2249                                         chainlength = 0;
2250                                 } else {
2251                                         chainlength = sosendmaxchain;
2252                                 }
2253
2254                                 /*
2255                                  * Use big 4 KB cluster when the outgoing interface
2256                                  * does not prefer 2 KB clusters
2257                                  */
2258                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2259                                     sosendbigcl_ignore_capab;
2260
2261                                 /*
2262                                  * Attempt to use larger than system page-size
2263                                  * clusters for large writes only if there is
2264                                  * a jumbo cluster pool and if the socket is
2265                                  * marked accordingly.
2266                                  */
2267                                 jumbocl = sosendjcl && njcl > 0 &&
2268                                     ((so->so_flags & SOF_MULTIPAGES) ||
2269                                     sosendjcl_ignore_capab) &&
2270                                     bigcl;
2271
2272                                 socket_unlock(so, 0);
2273
2274                                 do {
2275                                         int num_needed;
2276                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2277
                                        /*
                                         * Try to maintain a local cache of mbuf
                                         * clusters needed to complete this write.
                                         * The list is further limited to the
                                         * number currently needed to fill the
                                         * socket.  This mechanism allows a large
                                         * number of mbufs/clusters to be grabbed
                                         * under a single mbuf lock.  If we can't
                                         * get any clusters, then fall back to
                                         * trying for mbufs.  If we fail early (or
                                         * miscalculate the number needed), make
                                         * sure to release any clusters we haven't
                                         * yet consumed.
                                         */
2293                                         if (freelist == NULL &&
2294                                             bytes_to_alloc > MBIGCLBYTES &&
2295                                             jumbocl) {
2296                                                 num_needed =
2297                                                     bytes_to_alloc / M16KCLBYTES;
2298
2299                                                 if ((bytes_to_alloc -
2300                                                     (num_needed * M16KCLBYTES))
2301                                                     >= MINCLSIZE) {
2302                                                         num_needed++;
2303                                                 }
2304
2305                                                 freelist =
2306                                                     m_getpackets_internal(
2307                                                         (unsigned int *)&num_needed,
2308                                                         hdrs_needed, M_WAIT, 0,
2309                                                         M16KCLBYTES);
2310                                                 /*
2311                                                  * Fall back to 4K cluster size
2312                                                  * if allocation failed
2313                                                  */
2314                                         }
2315
2316                                         if (freelist == NULL &&
2317                                             bytes_to_alloc > MCLBYTES &&
2318                                             bigcl) {
2319                                                 num_needed =
2320                                                     bytes_to_alloc / MBIGCLBYTES;
2321
2322                                                 if ((bytes_to_alloc -
2323                                                     (num_needed * MBIGCLBYTES)) >=
2324                                                     MINCLSIZE) {
2325                                                         num_needed++;
2326                                                 }
2327
2328                                                 freelist =
2329                                                     m_getpackets_internal(
2330                                                         (unsigned int *)&num_needed,
2331                                                         hdrs_needed, M_WAIT, 0,
2332                                                         MBIGCLBYTES);
2333                                                 /*
2334                                                  * Fall back to cluster size
2335                                                  * if allocation failed
2336                                                  */
2337                                         }
2338
                                        /*
                                         * Allocate a cluster, as we want to
                                         * avoid splitting the data into more
                                         * than one segment, and using MINCLSIZE
                                         * would lead us to allocate two mbufs.
                                         */
2345                                         if (soreserveheadroom != 0 &&
2346                                             freelist == NULL &&
2347                                             ((top == NULL &&
2348                                             bytes_to_alloc > _MHLEN) ||
2349                                             bytes_to_alloc > _MLEN)) {
2350                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2351                                                     MCLBYTES;
2352                                                 freelist =
2353                                                     m_getpackets_internal(
2354                                                         (unsigned int *)&num_needed,
2355                                                         hdrs_needed, M_WAIT, 0,
2356                                                         MCLBYTES);
2357                                                 /*
2358                                                  * Fall back to a single mbuf
2359                                                  * if allocation failed
2360                                                  */
2361                                         } else if (freelist == NULL &&
2362                                             bytes_to_alloc > MINCLSIZE) {
2363                                                 num_needed =
2364                                                     bytes_to_alloc / MCLBYTES;
2365
2366                                                 if ((bytes_to_alloc -
2367                                                     (num_needed * MCLBYTES)) >=
2368                                                     MINCLSIZE) {
2369                                                         num_needed++;
2370                                                 }
2371
2372                                                 freelist =
2373                                                     m_getpackets_internal(
2374                                                         (unsigned int *)&num_needed,
2375                                                         hdrs_needed, M_WAIT, 0,
2376                                                         MCLBYTES);
2377                                                 /*
2378                                                  * Fall back to a single mbuf
2379                                                  * if allocation failed
2380                                                  */
2381                                         }
2382                                         /*
2383                                          * For datagram protocols, leave
2384                                          * headroom for protocol headers
2385                                          * in the first cluster of the chain
2386                                          */
2387                                         if (freelist != NULL && atomic &&
2388                                             top == NULL && headroom > 0) {
2389                                                 freelist->m_data += headroom;
2390                                         }
2391
2392                                         /*
2393                                          * Fall back to regular mbufs without
2394                                          * reserving the socket headroom
2395                                          */
2396                                         if (freelist == NULL) {
2397                                                 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2398                                                         if (top == NULL) {
2399                                                                 MGETHDR(freelist,
2400                                                                     M_WAIT, MT_DATA);
2401                                                         } else {
2402                                                                 MGET(freelist,
2403                                                                     M_WAIT, MT_DATA);
2404                                                         }
2405                                                 }
2406
2407                                                 if (freelist == NULL) {
2408                                                         error = ENOBUFS;
2409                                                         socket_lock(so, 0);
2410                                                         goto out_locked;
2411                                                 }
2412                                                 /*
2413                                                  * For datagram protocols,
2414                                                  * leave room for protocol
2415                                                  * headers in first mbuf.
2416                                                  */
2417                                                 if (atomic && top == NULL &&
2418                                                     bytes_to_copy < MHLEN) {
2419                                                         MH_ALIGN(freelist,
2420                                                             bytes_to_copy);
2421                                                 }
2422                                         }
2423                                         m = freelist;
2424                                         freelist = m->m_next;
2425                                         m->m_next = NULL;
2426
2427                                         if ((m->m_flags & M_EXT)) {
2428                                                 mlen = m->m_ext.ext_size -
2429                                                     M_LEADINGSPACE(m);
2430                                         } else if ((m->m_flags & M_PKTHDR)) {
2431                                                 mlen =
2432                                                     MHLEN - M_LEADINGSPACE(m);
2433                                         } else {
2434                                                 mlen = MLEN - M_LEADINGSPACE(m);
2435                                         }
2436                                         len = imin(mlen, bytes_to_copy);
2437
2438                                         chainlength += len;
2439
2440                                         space -= len;
2441
2442                                         error = uiomove(mtod(m, caddr_t),
2443                                             len, uio);
2444
2445                                         resid = uio_resid(uio);
2446
2447                                         m->m_len = len;
2448                                         *mp = m;
2449                                         top->m_pkthdr.len += len;
2450                                         if (error) {
2451                                                 break;
2452                                         }
2453                                         mp = &m->m_next;
2454                                         if (resid <= 0) {
2455                                                 if (flags & MSG_EOR) {
2456                                                         top->m_flags |= M_EOR;
2457                                                 }
2458                                                 break;
2459                                         }
2460                                         bytes_to_copy = min(resid, space);
2461                                 } while (space > 0 &&
2462                                     (chainlength < sosendmaxchain || atomic ||
2463                                     resid < MINCLSIZE));
2464
2465                                 socket_lock(so, 0);
2466
2467                                 if (error) {
2468                                         goto out_locked;
2469                                 }
2470                         }
2471
2472                         if (dontroute) {
2473                                 so->so_options |= SO_DONTROUTE;
2474                         }
2475
2476                         /*
2477                          * Compute flags here, for pru_send and NKEs
2478                          *
2479                          * If the user set MSG_EOF, the protocol
2480                          * understands this flag and nothing left to
2481                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2482                          */
2483                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2484                             ((flags & MSG_EOF) &&
2485                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2486                             (resid <= 0)) ? PRUS_EOF :
2487                             /* If there is more to send set PRUS_MORETOCOME */
2488                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2489
2490                         if ((flags & MSG_SKIPCFIL) == 0) {
2491                                 /*
2492                                  * Socket filter processing
2493                                  */
2494                                 error = sflt_data_out(so, addr, &top,
2495                                     &control, (sendflags & MSG_OOB) ?
2496                                     sock_data_filt_flag_oob : 0);
2497                                 if (error) {
2498                                         if (error == EJUSTRETURN) {
2499                                                 error = 0;
2500                                                 goto packet_consumed;
2501                                         }
2502                                         goto out_locked;
2503                                 }
2504 #if CONTENT_FILTER
2505                                 /*
2506                                  * Content filter processing
2507                                  */
2508                                 error = cfil_sock_data_out(so, addr, top,
2509                                     control, sendflags);
2510                                 if (error) {
2511                                         if (error == EJUSTRETURN) {
2512                                                 error = 0;
2513                                                 clen = 0;
2514                                                 control = NULL;
2515                                                 top = NULL;
2516                                         }
2517                                         goto out_locked;
2518                                 }
2519 #endif /* CONTENT_FILTER */
2520                         }
2521                         error = (*so->so_proto->pr_usrreqs->pru_send)
2522                             (so, sendflags, top, addr, control, p);
2523
2524 packet_consumed:
2525                         if (dontroute) {
2526                                 so->so_options &= ~SO_DONTROUTE;
2527                         }
2528
2529                         clen = 0;
2530                         control = NULL;
2531                         top = NULL;
2532                         mp = &top;
2533                         if (error) {
2534                                 goto out_locked;
2535                         }
2536                 } while (resid && space > 0);
2537         } while (resid);
2538
2539 out_locked:
2540         if (sblocked) {
2541                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2542         } else {
2543                 socket_unlock(so, 1);
2544         }
2545         if (top != NULL) {
2546                 m_freem(top);
2547         }
2548         if (control != NULL) {
2549                 m_freem(control);
2550         }
2551         if (freelist != NULL) {
2552                 m_freem_list(freelist);
2553         }
2554
2555         soclearfastopen(so);
2556
2557         if (en_tracing) {
2558                 /* resid passed here is the bytes left in uio */
2559                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2560                     VM_KERNEL_ADDRPERM(so),
2561                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2562                     (int64_t)(orig_resid - resid));
2563         }
2564         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2565             so->so_snd.sb_cc, space, error);
2566
2567         return error;
2568 }
2569
2570 int
2571 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2572 {
2573         struct mbuf *cur, *ctl_tail = NULL;
2574
2575         /*
2576          * Pass a chain to the protocol's pru_send and return its result.
2577          * When control is NULL and top carries no packet header, the
2578          * chain is scanned for MT_CONTROL mbufs and the packet header
2579          * so that control and data can be handed over separately.
2580          */
2581         socket_lock_assert_owned(so);
2582
2583         /* A chain is mandatory; explicit control requires a packet header */
2584         VERIFY(top != NULL &&
2585             (control == NULL || top->m_flags & M_PKTHDR));
2586
2587         if (control == NULL && !(top->m_flags & M_PKTHDR)) {
2588                 cur = top;
2589                 while (cur != NULL) {
2590                         /* The first packet header becomes the new top */
2591                         if (cur->m_flags & M_PKTHDR) {
2592                                 top = cur;
2593                                 break;
2594                         }
2595                         if (cur->m_type == MT_CONTROL) {
2596                                 /* First control mbuf seen: start of control */
2597                                 if (control == NULL) {
2598                                         control = cur;
2599                                 }
2600                                 /* Control mbuf followed by a non-control one */
2601                                 if (cur->m_next != NULL && cur->m_next->m_type != MT_CONTROL) {
2602                                         ctl_tail = cur;
2603                                 }
2604                         }
2605                         cur = cur->m_next;
2606                 }
2607                 /* Cut the chain after the last control mbuf found */
2608                 if (ctl_tail != NULL) {
2609                         ctl_tail->m_next = NULL;
2610                 }
2611         }
2612         return (*so->so_proto->pr_usrreqs->pru_send)
2613             (so, sendflags, top, addr, control, current_proc());
2614 }
2615
2616 /*
2617  * Send each non-empty uio of uioarray as its own packet through pru_send_list; returns 0 or an errno value.
2618  * Supported only connected sockets (no address) without ancillary data (control mbuf) for atomic protocols
2619  */
2620 int
2621 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2622 {
2623         struct mbuf *m, *freelist = NULL;
2624         user_ssize_t len, resid;
2625         int error, dontroute, mlen;
2626         int atomic = sosendallatonce(so);
2627         int sblocked = 0;
2628         struct proc *p = current_proc();
2629         u_int uiofirst = 0;     /* index of the first uio of the current batch */
2630         u_int uiolast = 0;      /* index one past the last uio of the current batch */
2631         struct mbuf *top = NULL;
2632         uint16_t headroom = 0;
2633         boolean_t bigcl;
2634
2635         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2636             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2637
2638         if (so->so_type != SOCK_DGRAM) {
2639                 error = EINVAL;
2640                 goto out;
2641         }
2642         if (atomic == 0) {
2643                 error = EINVAL;
2644                 goto out;
2645         }
2646         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2647                 error = EPROTONOSUPPORT;
2648                 goto out;
2649         }
2650         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2651                 error = EINVAL;
2652                 goto out;
2653         }
2654         resid = uio_array_resid(uioarray, uiocnt);
2655
2656         /*
2657          * In theory resid should be unsigned.
2658          * However, space must be signed, as it might be less than 0
2659          * if we over-committed, and we must use a signed comparison
2660          * of space and resid.  On the other hand, a negative resid
2661          * causes us to loop sending 0-length segments to the protocol.
2662          *
2663          * Note: We limit resid to be a positive int value as we use
2664          * imin() to set bytes_to_copy -- radr://14558484
2665          */
2666         if (resid < 0 || resid > INT_MAX) {
2667                 error = EINVAL;
2668                 goto out;
2669         }
2670
2671         socket_lock(so, 1);
2672         so_update_last_owner_locked(so, p);
2673         so_update_policy(so);
2674
2675 #if NECP
2676         so_update_necp_policy(so, NULL, NULL);
2677 #endif /* NECP */
2678
2679         dontroute = (flags & MSG_DONTROUTE) &&
2680             (so->so_options & SO_DONTROUTE) == 0 &&
2681             (so->so_proto->pr_flags & PR_ATOMIC);
2682         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2683
2684         error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2685         if (error) {
2686                 goto release;
2687         }
2688
2689         /*
2690          * Use big 4 KB clusters when the outgoing interface does not prefer
2691          * 2 KB clusters
2692          */
2693         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2694
2695         if (soreserveheadroom != 0) {
2696                 headroom = so->so_pktheadroom;
2697         }
2698
2699         do {
2700                 int i;
2701                 int num_needed = 0;
2702                 int chainlength;
2703                 size_t maxpktlen = 0;
2704                 int bytes_to_alloc;
2705
2706                 if (sosendminchain > 0) {
2707                         chainlength = 0;
2708                 } else {
2709                         chainlength = sosendmaxchain;
2710                 }
2711
2712                 socket_unlock(so, 0);
2713
2714                 /*
2715                  * Find a set of uio that fit in a reasonable number
2716                  * of mbuf packets
2717                  */
2718                 for (i = uiofirst; i < uiocnt; i++) {
2719                         struct uio *auio = uioarray[i];
2720
2721                         len = uio_resid(auio);
2722
2723                         /* Do nothing for empty messages */
2724                         if (len == 0) {
2725                                 continue;
2726                         }
2727                         /* uiolast is an index bound, so it must also step over the empty uios skipped above */
2728                         num_needed += 1;
2729                         uiolast = (u_int)i + 1;
2730
2731                         if (len > maxpktlen) {
2732                                 maxpktlen = len;
2733                         }
2734
2735                         chainlength += len;
2736                         if (chainlength > sosendmaxchain) {
2737                                 break;
2738                         }
2739                 }
2740                 /*
2741                  * Nothing left to send
2742                  */
2743                 if (num_needed == 0) {
2744                         socket_lock(so, 0);
2745                         break;
2746                 }
2747                 /*
2748                  * Allocate buffer large enough to include headroom space for
2749                  * network and link header
2750                  *
2751                  */
2752                 bytes_to_alloc = maxpktlen + headroom;
2753
2754                 /*
2755                  * Allocate a single contiguous buffer of the smallest available
2756                  * size when possible
2757                  */
2758                 if (bytes_to_alloc > MCLBYTES &&
2759                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2760                         freelist = m_getpackets_internal(
2761                                 (unsigned int *)&num_needed,
2762                                 num_needed, M_WAIT, 1,
2763                                 MBIGCLBYTES);
2764                 } else if (bytes_to_alloc > _MHLEN &&
2765                     bytes_to_alloc <= MCLBYTES) {
2766                         freelist = m_getpackets_internal(
2767                                 (unsigned int *)&num_needed,
2768                                 num_needed, M_WAIT, 1,
2769                                 MCLBYTES);
2770                 } else {
2771                         freelist = m_allocpacket_internal(
2772                                 (unsigned int *)&num_needed,
2773                                 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2774                 }
2775
2776                 if (freelist == NULL) {
2777                         socket_lock(so, 0);
2778                         error = ENOMEM;
2779                         goto release;
2780                 }
2781                 /*
2782                  * Copy each uio of the set into its own mbuf packet
2783                  */
2784                 for (i = uiofirst, m = freelist;
2785                     i < uiolast && m != NULL;
2786                     i++) {
2787                         int bytes_to_copy;
2788                         struct mbuf *n;
2789                         struct uio *auio = uioarray[i];
2790
2791                         bytes_to_copy = uio_resid(auio);
2792
2793                         /* Do nothing for empty messages */
2794                         if (bytes_to_copy == 0) {
2795                                 continue;
2796                         }
2797                         /*
2798                          * Leave headroom for protocol headers
2799                          * in the first mbuf of the chain
2800                          */
2801                         m->m_data += headroom;
2802                         /* Size every segment from its own buffer; only the head m carries the headroom */
2803                         for (n = m; n != NULL; n = n->m_next) {
2804                                 if ((n->m_flags & M_EXT)) {
2805                                         mlen = n->m_ext.ext_size -
2806                                             M_LEADINGSPACE(n);
2807                                 } else if ((n->m_flags & M_PKTHDR)) {
2808                                         mlen =
2809                                             MHLEN - M_LEADINGSPACE(n);
2810                                 } else {
2811                                         mlen = MLEN - M_LEADINGSPACE(n);
2812                                 }
2813                                 len = imin(mlen, bytes_to_copy);
2814
2815                                 /*
2816                                  * Note: uiomove() decrements the iovec
2817                                  * length
2818                                  */
2819                                 error = uiomove(mtod(n, caddr_t),
2820                                     len, auio);
2821                                 if (error != 0) {
2822                                         break;
2823                                 }
2824                                 n->m_len = len;
2825                                 m->m_pkthdr.len += len;
2826
2827                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2828
2829                                 bytes_to_copy -= len;
2830                                 resid -= len;
2831                         }
2832                         if (m->m_pkthdr.len == 0) {
2833                                 printf(
2834                                         "%s:%d so %llx pkt %llx type %u len null\n",
2835                                         __func__, __LINE__,
2836                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2837                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2838                                         m->m_type);
2839                         }
2840                         if (error != 0) {
2841                                 break;
2842                         }
2843                         m = m->m_nextpkt;
2844                 }
2845
2846                 socket_lock(so, 0);
2847
2848                 if (error) {
2849                         goto release;
2850                 }
2851                 top = freelist;
2852                 freelist = NULL;
2853
2854                 if (dontroute) {
2855                         so->so_options |= SO_DONTROUTE;
2856                 }
2857
2858                 if ((flags & MSG_SKIPCFIL) == 0) {
2859                         struct mbuf **prevnextp = NULL;
2860
2861                         for (i = uiofirst, m = top;
2862                             i < uiolast && m != NULL;
2863                             i++) {
2864                                 struct mbuf *nextpkt = m->m_nextpkt;
2865
2866                                 /*
2867                                  * Socket filter processing
2868                                  */
2869                                 error = sflt_data_out(so, NULL, &m,
2870                                     NULL, 0);
2871                                 if (error != 0 && error != EJUSTRETURN) {
2872                                         goto release;
2873                                 }
2874
2875 #if CONTENT_FILTER
2876                                 if (error == 0) {
2877                                         /*
2878                                          * Content filter processing
2879                                          */
2880                                         error = cfil_sock_data_out(so, NULL, m,
2881                                             NULL, 0);
2882                                         if (error != 0 && error != EJUSTRETURN) {
2883                                                 goto release;
2884                                         }
2885                                 }
2886 #endif /* CONTENT_FILTER */
2887                                 /*
2888                                  * Remove packet from the list when swallowed by a filter;
2889                                  * a packet that is kept becomes the predecessor for later unlinks
2890                                  */
2891                                 if (error == EJUSTRETURN) {
2892                                         error = 0;
2893                                         if (prevnextp != NULL) {
2894                                                 *prevnextp = nextpkt;
2895                                         } else {
2896                                                 top = nextpkt;
2897                                         }
2898                                 } else {
2899                                         prevnextp = &m->m_nextpkt;
2900                                 }
2901
2902                                 /* Continue with the successor saved before the filters ran */
2903                                 m = nextpkt;
2904                         }
2905                 }
2906                 if (top != NULL) {
2907                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2908                             (so, 0, top, NULL, NULL, p);
2909                 }
2910
2911                 if (dontroute) {
2912                         so->so_options &= ~SO_DONTROUTE;
2913                 }
2914
2915                 top = NULL;
2916                 uiofirst = uiolast;
2917         } while (resid > 0 && error == 0);
2918 release:
2919         if (sblocked) {
2920                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2921         } else {
2922                 socket_unlock(so, 1);
2923         }
2924 out:
2925         if (top != NULL) {
2926                 m_freem_list(top);      /* top is a list of packets when a filter failed */
2927         }
2928         if (freelist != NULL) {
2929                 m_freem_list(freelist);
2930         }
2931
2932         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2933             so->so_snd.sb_cc, 0, error);
2934
2935         return error;
2936 }
2937
2938 /*
2939  * Process the MT_SONAME mbuf at *mp, duplicating the address into *psa when psa is non-NULL.  May return ERESTART when packet is dropped by MAC policy check
2940  */
2941 static int
2942 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2943     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2944 {
2945         int error = 0;
2946         struct mbuf *m = *mp;
2947         struct mbuf *nextrecord = *nextrecordp;
2948         /* error may also become ENOTCONN or EWOULDBLOCK below; *mp and *nextrecordp are rewritten at done */
2949         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2950 #if CONFIG_MACF_SOCKET_SUBSET
2951         /*
2952          * Call the MAC framework for policy checking if we're in
2953          * the user process context and the socket isn't connected.
2954          */
2955         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2956                 struct mbuf *m0 = m;    /* first mbuf of the record, restored into m after each walk */
2957                 /*
2958                  * Dequeue this record (temporarily) from the receive
2959                  * list since we're about to drop the socket's lock
2960                  * where a new record may arrive and be appended to
2961                  * the list.  Upon MAC policy failure, the record
2962                  * will be freed.  Otherwise, we'll add it back to
2963                  * the head of the list.  We cannot rely on SB_LOCK
2964                  * because append operation uses the socket's lock.
2965                  */
2966                 do {
2967                         m->m_nextpkt = NULL;
2968                         sbfree(&so->so_rcv, m);
2969                         m = m->m_next;
2970                 } while (m != NULL);
2971                 m = m0;
2972                 so->so_rcv.sb_mb = nextrecord;  /* the receive list now starts at the following record */
2973                 SB_EMPTY_FIXUP(&so->so_rcv);
2974                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2975                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2976                 socket_unlock(so, 0);
2977
2978                 if (mac_socket_check_received(proc_ucred(p), so,
2979                     mtod(m, struct sockaddr *)) != 0) {
2980                         /*
2981                          * MAC policy failure; free this record and
2982                          * process the next record (or block until
2983                          * one is available).  We have adjusted sb_cc
2984                          * and sb_mbcnt above so there is no need to
2985                          * call sbfree() again.
2986                          */
2987                         m_freem(m);
2988                         /*
2989                          * Clear SB_LOCK but don't unlock the socket.
2990                          * Process the next record or wait for one.  NOTE(review): *mp is set to the freed m at done.
2991                          */
2992                         socket_lock(so, 0);
2993                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2994                         error = ERESTART;
2995                         goto done;
2996                 }
2997                 socket_lock(so, 0);
2998                 /*
2999                  * If the socket has been defunct'd, drop it.  NOTE(review): as above, *mp receives the freed m at done.
3000                  */
3001                 if (so->so_flags & SOF_DEFUNCT) {
3002                         m_freem(m);
3003                         error = ENOTCONN;
3004                         goto done;
3005                 }
3006                 /*
3007                  * Re-adjust the socket receive list and re-enqueue
3008                  * the record in front of any packets which may have
3009                  * been appended while we dropped the lock.
3010                  */
3011                 for (m = m0; m->m_next != NULL; m = m->m_next) {
3012                         sballoc(&so->so_rcv, m);
3013                 }
3014                 sballoc(&so->so_rcv, m);        /* the loop above stops on the last mbuf without charging it */
3015                 if (so->so_rcv.sb_mb == NULL) { /* nothing else queued: this record is also the last one */
3016                         so->so_rcv.sb_lastrecord = m0;
3017                         so->so_rcv.sb_mbtail = m;
3018                 }
3019                 m = m0;
3020                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3021                 so->so_rcv.sb_mb = m;
3022                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3023                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3024         }
3025 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3026         if (psa != NULL) {      /* hand the caller a duplicate of the address held in m */
3027                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3028                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {   /* no duplicate although MSG_NEEDSA is set */
3029                         error = EWOULDBLOCK;
3030                         goto done;
3031                 }
3032         }
3033         if (flags & MSG_PEEK) { /* peek: leave the address mbuf queued and step past it */
3034                 m = m->m_next;
3035         } else {                /* consume: uncharge and free the address mbuf */
3036                 sbfree(&so->so_rcv, m);
3037                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3038                         panic("%s: about to create invalid socketbuf",
3039                             __func__);
3040                         /* NOTREACHED */
3041                 }
3042                 MFREE(m, so->so_rcv.sb_mb);     /* NOTE(review): presumably stores m's successor in sb_mb - confirm */
3043                 m = so->so_rcv.sb_mb;
3044                 if (m != NULL) {
3045                         m->m_nextpkt = nextrecord;      /* the new head mbuf now links to the next record */
3046                 } else {
3047                         so->so_rcv.sb_mb = nextrecord;
3048                         SB_EMPTY_FIXUP(&so->so_rcv);
3049                 }
3050         }
3051 done:
3052         *mp = m;
3053         *nextrecordp = nextrecord;
3054
3055         return error;
3056 }
3057
3058 /*
3059  * Process one or more MT_CONTROL mbufs present before any data mbufs
3060  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3061  * just copy the data; if !MSG_PEEK, we call into the protocol to
3062  * perform externalization.
      *
      * Parameters:
      *      so              Socket whose receive buffer (so_rcv) is processed.
      *                      The socket lock is dropped and re-taken around
      *                      the call to dom_externalize.
      *      controlp        If non-NULL, receives the chain of control mbufs:
      *                      copies made with m_copy() under MSG_PEEK, the
      *                      unlinked originals otherwise.  If NULL and
      *                      MSG_PEEK is not set, the unlinked control mbufs
      *                      are freed.
      *      flags           Receive flags; only MSG_PEEK is examined here.
      *      mp              In: the first mbuf to process (it is handled
      *                      without checking its type, so the caller is
      *                      expected to pass an MT_CONTROL mbuf).
      *                      Out: the first mbuf following the control mbufs,
      *                      or NULL if there is none.
      *      nextrecordp     In/out: pointer to the record following the
      *                      current one.
      *
      * Returns:     0               Success
      *              ENOBUFS         m_copy() failed under MSG_PEEK
      *      <pr_domain->dom_externalize>:???
3063  */
3064 static int
3065 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3066     struct mbuf **mp, struct mbuf **nextrecordp)
3067 {
3068         int error = 0;
3069         struct mbuf *cm = NULL, *cmn;   /* private list of unlinked control mbufs */
3070         struct mbuf **cme = &cm;        /* tail link of the cm list */
3071         struct sockbuf *sb_rcv = &so->so_rcv;
3072         struct mbuf **msgpcm = NULL;    /* anchor of the MSG_PEEK copies */
3073         struct mbuf *m = *mp;
3074         struct mbuf *nextrecord = *nextrecordp;
3075         struct protosw *pr = so->so_proto;
3076
3077         /*
3078          * Externalizing the control messages would require us to
3079          * drop the socket's lock below.  Once we re-acquire the
3080          * lock, the mbuf chain might change.  In order to preserve
3081          * consistency, we unlink all control messages from the
3082          * first mbuf chain in one shot and link them separately
3083          * onto a different chain.
3084          */
3085         do {
3086                 if (flags & MSG_PEEK) {
3087                         if (controlp != NULL) {
                                     /*
                                      * Remember where the copied chain is
                                      * anchored so that it can be released
                                      * if a later copy fails.
                                      * NOTE(review): the test is on
                                      * *controlp, not on msgpcm.  Once
                                      * controlp has been advanced to the
                                      * previous copy's m_next it presumably
                                      * reads NULL again, in which case
                                      * msgpcm is re-pointed at the tail and
                                      * the m_freem() below frees nothing --
                                      * confirm.
                                      */
3088                                 if (*controlp == NULL) {
3089                                         msgpcm = controlp;
3090                                 }
3091                                 *controlp = m_copy(m, 0, m->m_len);
3092
3093                                 /*
3094                                  * If we failed to allocate an mbuf,
3095                                  * release any previously allocated
3096                                  * mbufs for control data. Return
3097                                  * an error. Keep the mbufs in the
3098                                  * socket as this is using
3099                                  * MSG_PEEK flag.
3100                                  */
3101                                 if (*controlp == NULL) {
3102                                         m_freem(*msgpcm);
3103                                         error = ENOBUFS;
3104                                         goto done;
3105                                 }
3106                                 controlp = &(*controlp)->m_next;
3107                         }
                             /* Peeking: step past this mbuf, leave it queued. */
3108                         m = m->m_next;
3109                 } else {
                             /*
                              * Detach m from the head of the receive buffer:
                              * call sbfree() on it, advance sb_mb to its
                              * successor and append it to the cm list via cme.
                              */
3110                         m->m_nextpkt = NULL;
3111                         sbfree(sb_rcv, m);
3112                         sb_rcv->sb_mb = m->m_next;
3113                         m->m_next = NULL;
3114                         *cme = m;
3115                         cme = &(*cme)->m_next;
3116                         m = sb_rcv->sb_mb;
3117                 }
3118         } while (m != NULL && m->m_type == MT_CONTROL);
3119
             /*
              * The control mbufs have been unlinked; restore the link to the
              * next record on whatever now heads the buffer.  If nothing of
              * this record is left, the next record itself becomes the head.
              */
3120         if (!(flags & MSG_PEEK)) {
3121                 if (sb_rcv->sb_mb != NULL) {
3122                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
3123                 } else {
3124                         sb_rcv->sb_mb = nextrecord;
3125                         SB_EMPTY_FIXUP(sb_rcv);
3126                 }
                     /* No record follows: m (possibly NULL) is the last one. */
3127                 if (nextrecord == NULL) {
3128                         sb_rcv->sb_lastrecord = m;
3129                 }
3130         }
3131
3132         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3133         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3134
             /*
              * Walk the private list one mbuf at a time.  Each mbuf is cut
              * loose from its successor before it is handed to the protocol
              * or to the caller.
              */
3135         while (cm != NULL) {
3136                 int cmsg_type;
3137
3138                 cmn = cm->m_next;
3139                 cm->m_next = NULL;
3140                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3141
3142                 /*
3143                  * Call the protocol to externalize SCM_RIGHTS message
3144                  * and return the modified message to the caller upon
3145                  * success.  Otherwise, all other control messages are
3146                  * returned unmodified to the caller.  Note that we
3147                  * only get into this loop if MSG_PEEK is not set.
                      *
                      * NOTE(review): error is assigned on every iteration,
                      * so the value returned reflects only the last control
                      * mbuf processed; a failure reported for an earlier
                      * mbuf is overwritten if more control mbufs follow --
                      * confirm that this is intended.
3148                  */
3149                 if (pr->pr_domain->dom_externalize != NULL &&
3150                     cmsg_type == SCM_RIGHTS) {
3151                         /*
3152                          * Release socket lock: see 3903171.  This
3153                          * would also allow more records to be appended
3154                          * to the socket buffer.  We still have SB_LOCK
3155                          * set on it, so we can be sure that the head
3156                          * of the mbuf chain won't change.
3157                          */
3158                         socket_unlock(so, 0);
3159                         error = (*pr->pr_domain->dom_externalize)(cm);
3160                         socket_lock(so, 0);
3161                 } else {
3162                         error = 0;
3163                 }
3164
                     /*
                      * Append the mbuf to the caller's control chain, or free
                      * it when the caller did not ask for control data or the
                      * externalization failed.
                      */
3165                 if (controlp != NULL && error == 0) {
3166                         *controlp = cm;
3167                         controlp = &(*controlp)->m_next;
3168                 } else {
3169                         (void) m_free(cm);
3170                 }
3171                 cm = cmn;
3172         }
3173         /*
3174          * Update the value of nextrecord in case we received new
3175          * records when the socket was unlocked above for
3176          * externalizing SCM_RIGHTS.
3177          */
3178         if (m != NULL) {
3179                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3180         } else {
3181                 nextrecord = sb_rcv->sb_mb;
3182         }
3183
             /*
              * Publish the results.  Also reached directly from the MSG_PEEK
              * ENOBUFS path, in which case nextrecord is passed back unchanged
              * and m is the mbuf whose copy failed.
              */
3184 done:
3185         *mp = m;
3186         *nextrecordp = nextrecord;
3187
3188         return error;
3189 }
3190
3191 /*
3192  * If we have less data than requested, block awaiting more
3193  * (subject to any timeout) if:
3194  *   1. the current count is less than the low water mark, or
3195  *   2. MSG_WAITALL is set, and it is possible to do the entire
3196  *      receive operation at once if we block (resid <= hiwat).
3197  *   3. MSG_DONTWAIT is not set
3198  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3199  * we have to do the receive in sections, and thus risk returning
3200  * a short count if a timeout or signal occurs after we start.
      *
      * This routine only evaluates the condition; it does not sleep.
      *
      * Parameters:
      *      so      Socket whose receive buffer counters (so_rcv.sb_cc,
      *              sb_lowat, sb_hiwat) and protocol flags are consulted.
      *      uio     Supplies the number of bytes still requested, through
      *              uio_resid().
      *      m       Head of the receive queue as seen by the caller, or NULL.
      *      flags   Receive flags; MSG_DONTWAIT and MSG_WAITALL are examined.
      *
      * Returns true when m is NULL (regardless of flags), or when all of the
      * following hold:
      *   - MSG_DONTWAIT is not set,
      *   - sb_cc is smaller than the requested residual,
      *   - m->m_nextpkt is NULL,
      *   - the protocol is not PR_ATOMIC,
      *   - and either sb_cc is below sb_lowat, or MSG_WAITALL is set and the
      *     residual does not exceed sb_hiwat.
      * Returns false otherwise.
3201  */
3202 static boolean_t
3203 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3204 {
3205         struct protosw *pr = so->so_proto;
3206
3207         /* No mbufs in the receive-queue? Wait! */
3208         if (m == NULL) {
3209                 return true;
3210         }
3211
3212         /* Not enough data in the receive socket-buffer - we may have to wait */
             /*
              * The m_nextpkt test restricts waiting to the case where no
              * further record is linked behind m, and PR_ATOMIC protocols
              * never wait here.
              */
3213         if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3214             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3215                 /*
3216                  * Application did set the lowater-mark, so we should wait for
3217                  * this data to be present.
3218                  */
3219                 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3220                         return true;
3221                 }
3222
3223                 /*
3224                  * Application wants all the data - so let's try to do the
3225                  * receive-operation at once by waiting for everything to
3226                  * be there.
3227                  */
3228                 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3229                         return true;
3230                 }
3231         }
3232
             /* Enough data is queued, or waiting is not applicable. */
3233         return false;
3234 }
3235
3236 /*
3237  * Implement receive operations on a socket.
3238  * We depend on the way that records are added to the sockbuf
3239  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3240  * must begin with an address if the protocol so specifies,
3241  * followed by an optional mbuf or mbufs containing ancillary data,
3242  * and then zero or more mbufs of data.
3243  * In order to avoid blocking network interrupts for the entire time here,
3244  * we splx() while doing the actual copy to user space.
3245  * Although the sockbuf is locked, new data may still be appended,
3246  * and thus we must maintain consistency of the sockbuf during that time.
3247  *
3248  * The caller may receive the data as a single mbuf chain by supplying
3249  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3250  * only for the count in uio_resid.
3251  *
3252  * Returns:     0                       Success
3253  *              ENOBUFS
3254  *              ENOTCONN
3255  *              EWOULDBLOCK
3256  *      uiomove:EFAULT
3257  *      sblock:EWOULDBLOCK
3258  *      sblock:EINTR
3259  *      sbwait:EBADF
3260  *      sbwait:EINTR
3261  *      sodelayed_copy:EFAULT
3262  *      <pru_rcvoob>:EINVAL[TCP]
3263  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
3264  *      <pru_rcvoob>:???
3265  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3266  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3267  *      <pr_domain->dom_externalize>:???
3268  *
3269  * Notes:       Additional return values from calls through <pru_rcvoob> and
3270  *              <pr_domain->dom_externalize> depend on protocols other than
3271  *              TCP or AF_UNIX, which are documented above.
3272  */
3273 int
3274 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3275     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3276 {
3277         struct mbuf *m, **mp, *ml = NULL;
3278         struct mbuf *nextrecord, *free_list;
3279         int flags, error, offset;
3280         user_ssize_t len;
3281         struct protosw *pr = so->so_proto;
3282         int moff, type = 0;
3283         user_ssize_t orig_resid = uio_resid(uio);
3284         user_ssize_t delayed_copy_len;
3285         int can_delay;
3286         struct proc *p = current_proc();
3287         boolean_t en_tracing = FALSE;
3288
3289         /*
3290          * Sanity check on the length passed by caller as we are making 'int'
3291          * comparisons
3292          */
3293         if (orig_resid < 0 || orig_resid > INT_MAX) {
3294                 return EINVAL;
3295         }
3296
3297         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3298             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3299             so->so_rcv.sb_hiwat);
3300
3301         socket_lock(so, 1);
3302         so_update_last_owner_locked(so, p);
3303         so_update_policy(so);
3304
3305 #ifdef MORE_LOCKING_DEBUG
3306         if (so->so_usecount == 1) {
3307                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3308                 /* NOTREACHED */
3309         }
3310 #endif
3311         mp = mp0;
3312         if (psa != NULL) {
3313                 *psa = NULL;
3314         }
3315         if (controlp != NULL) {
3316                 *controlp = NULL;
3317         }
3318         if (flagsp != NULL) {
3319                 flags = *flagsp & ~MSG_EOR;
3320         } else {
3321                 flags = 0;
3322         }
3323
3324         /*
3325          * If a recv attempt is made on a previously-accepted socket
3326          * that has been marked as inactive (disconnected), reject
3327          * the request.
3328          */
3329         if (so->so_flags & SOF_DEFUNCT) {
3330                 struct sockbuf *sb = &so->so_rcv;
3331
3332                 error = ENOTCONN;
3333                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3334                     __func__, proc_pid(p), proc_best_name(p),
3335                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3336                     SOCK_DOM(so), SOCK_TYPE(so), error);
3337                 /*
3338                  * This socket should have been disconnected and flushed
3339                  * prior to being returned from sodefunct(); there should
3340                  * be no data on its receive list, so panic otherwise.
3341                  */
3342                 if (so->so_state & SS_DEFUNCT) {
3343                         sb_empty_assert(sb, __func__);
3344                 }
3345                 socket_unlock(so, 1);
3346                 return error;
3347         }
3348
3349         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3350             pr->pr_usrreqs->pru_preconnect) {
3351                 /*
3352                  * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3353                  * call write() right after this. *If* the app calls a read
3354                  * we do not want to block this read indefinitely. Thus,
3355                  * we trigger a connect so that the session gets initiated.
3356                  */
3357                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3358
3359                 if (error) {
3360                         socket_unlock(so, 1);
3361                         return error;
3362                 }
3363         }
3364
3365         if (ENTR_SHOULDTRACE &&
3366             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3367                 /*
3368                  * enable energy tracing for inet sockets that go over
3369                  * non-loopback interfaces only.
3370                  */
3371                 struct inpcb *inp = sotoinpcb(so);
3372                 if (inp->inp_last_outifp != NULL &&
3373                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3374                         en_tracing = TRUE;
3375                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3376                             VM_KERNEL_ADDRPERM(so),
3377                             ((so->so_state & SS_NBIO) ?
3378                             kEnTrFlagNonBlocking : 0),
3379                             (int64_t)orig_resid);
3380                 }
3381         }
3382
3383         /*
3384          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3385          * regardless of the flags argument. Here is the case where
3386          * out-of-band data is not inline.
3387          */
3388         if ((flags & MSG_OOB) ||
3389             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3390             (so->so_options & SO_OOBINLINE) == 0 &&
3391             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3392                 m = m_get(M_WAIT, MT_DATA);
3393                 if (m == NULL) {
3394                         socket_unlock(so, 1);
3395                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3396                             ENOBUFS, 0, 0, 0, 0);
3397                         return ENOBUFS;
3398                 }
3399                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3400                 if (error) {
3401                         goto bad;
3402                 }
3403                 socket_unlock(so, 0);
3404                 do {
3405                         error = uiomove(mtod(m, caddr_t),
3406                             imin(uio_resid(uio), m->m_len), uio);
3407                         m = m_free(m);
3408                 } while (uio_resid(uio) && error == 0 && m != NULL);
3409                 socket_lock(so, 0);
3410 bad:
3411                 if (m != NULL) {
3412                         m_freem(m);
3413                 }
3414
3415                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3416                         if (error == EWOULDBLOCK || error == EINVAL) {
3417                                 /*
3418                                  * Let's try to get normal data:
3419                                  * EWOULDBLOCK: out-of-band data not
3420                                  * received yet. EINVAL: out-of-band data
3421                                  * already read.
3422                                  */
3423                                 error = 0;
3424                                 goto nooob;
3425                         } else if (error == 0 && flagsp != NULL) {
3426                                 *flagsp |= MSG_OOB;
3427                         }
3428                 }
3429                 socket_unlock(so, 1);
3430                 if (en_tracing) {
3431                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3432                             VM_KERNEL_ADDRPERM(so), 0,
3433                             (int64_t)(orig_resid - uio_resid(uio)));
3434                 }
3435                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3436                     0, 0, 0, 0);
3437
3438                 return error;
3439         }
3440 nooob:
3441         if (mp != NULL) {
3442                 *mp = NULL;
3443         }
3444
3445         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3446                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3447         }
3448
3449         free_list = NULL;
3450         delayed_copy_len = 0;
3451 restart:
3452 #ifdef MORE_LOCKING_DEBUG
3453         if (so->so_usecount <= 1) {
3454                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3455                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3456         }
3457 #endif
3458         /*
3459          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3460          * and if so just return to the caller.  This could happen when
3461          * soreceive() is called by a socket upcall function during the
3462          * time the socket is freed.  The socket buffer would have been
3463          * locked across the upcall, therefore we cannot put this thread
3464          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3465          * we may livelock), because the lock on the socket buffer will
3466          * only be released when the upcall routine returns to its caller.
3467          * Because the socket has been officially closed, there can be
3468          * no further read on it.
3469          *
3470          * A multipath subflow socket would have its SS_NOFDREF set by
3471          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3472          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3473          */
3474         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3475             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3476                 socket_unlock(so, 1);
3477                 return 0;
3478         }
3479
3480         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3481         if (error) {
3482                 socket_unlock(so, 1);
3483                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3484                     0, 0, 0, 0);
3485                 if (en_tracing) {
3486                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3487                             VM_KERNEL_ADDRPERM(so), 0,
3488                             (int64_t)(orig_resid - uio_resid(uio)));
3489                 }
3490                 return error;
3491         }
3492
3493         m = so->so_rcv.sb_mb;
3494         if (so_should_wait(so, uio, m, flags)) {
3495                 /*
3496                  * Panic if we notice inconsistencies in the socket's
3497                  * receive list; both sb_mb and sb_cc should correctly
3498                  * reflect the contents of the list, otherwise we may
3499                  * end up with false positives during select() or poll()
3500                  * which could put the application in a bad state.
3501                  */
3502                 SB_MB_CHECK(&so->so_rcv);
3503
3504                 if (so->so_error) {
3505                         if (m != NULL) {
3506                                 goto dontblock;
3507                         }
3508                         error = so->so_error;
3509                         if ((flags & MSG_PEEK) == 0) {
3510                                 so->so_error = 0;
3511                         }
3512                         goto release;
3513                 }
3514                 if (so->so_state & SS_CANTRCVMORE) {
3515 #if CONTENT_FILTER
3516                         /*
3517                          * Deal with half closed connections
3518                          */
3519                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3520                             cfil_sock_data_pending(&so->so_rcv) != 0) {
3521                                 CFIL_LOG(LOG_INFO,
3522                                     "so %llx ignore SS_CANTRCVMORE",
3523                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3524                         } else
3525 #endif /* CONTENT_FILTER */
3526                         if (m != NULL) {
3527                                 goto dontblock;
3528                         } else {
3529                                 goto release;
3530                         }
3531                 }
3532                 for (; m != NULL; m = m->m_next) {
3533                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3534                                 m = so->so_rcv.sb_mb;
3535                                 goto dontblock;
3536                         }
3537                 }
3538                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3539                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3540                         error = ENOTCONN;
3541                         goto release;
3542                 }
3543                 if (uio_resid(uio) == 0) {
3544                         goto release;
3545                 }
3546
3547                 if ((so->so_state & SS_NBIO) ||
3548                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3549                         error = EWOULDBLOCK;
3550                         goto release;
3551                 }
3552                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3553                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3554                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3555 #if EVEN_MORE_LOCKING_DEBUG
3556                 if (socket_debug) {
3557                         printf("Waiting for socket data\n");
3558                 }
3559 #endif
3560
3561                 /*
3562                  * Depending on the protocol (e.g. TCP), the following
3563                  * might cause the socket lock to be dropped and later
3564                  * be reacquired, and more data could have arrived and
3565                  * have been appended to the receive socket buffer by
3566                  * the time it returns.  Therefore, we only sleep in
3567                  * sbwait() below if and only if the wait-condition is still
3568                  * true.
3569                  */
3570                 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3571                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3572                 }
3573
3574                 error = 0;
3575                 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3576                         error = sbwait(&so->so_rcv);
3577                 }
3578
3579 #if EVEN_MORE_LOCKING_DEBUG
3580                 if (socket_debug) {
3581                         printf("SORECEIVE - sbwait returned %d\n", error);
3582                 }
3583 #endif
3584                 if (so->so_usecount < 1) {
3585                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3586                             __func__, so, so->so_usecount);
3587                         /* NOTREACHED */
3588                 }
3589                 if (error) {
3590                         socket_unlock(so, 1);
3591                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3592                             0, 0, 0, 0);
3593                         if (en_tracing) {
3594                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3595                                     VM_KERNEL_ADDRPERM(so), 0,
3596                                     (int64_t)(orig_resid - uio_resid(uio)));
3597                         }
3598                         return error;
3599                 }
3600                 goto restart;
3601         }
3602 dontblock:
3603         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3604         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3605         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3606         nextrecord = m->m_nextpkt;
3607
3608         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3609                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3610                     mp0 == NULL);
3611                 if (error == ERESTART) {
3612                         goto restart;
3613                 } else if (error != 0) {
3614                         goto release;
3615                 }
3616                 orig_resid = 0;
3617         }
3618
3619         /*
3620          * Process one or more MT_CONTROL mbufs present before any data mbufs
3621          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3622          * just copy the data; if !MSG_PEEK, we call into the protocol to
3623          * perform externalization.
3624          */
3625         if (m != NULL && m->m_type == MT_CONTROL) {
3626                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3627                 if (error != 0) {
3628                         goto release;
3629                 }
3630                 orig_resid = 0;
3631         }
3632
3633         if (m != NULL) {
3634                 if (!(flags & MSG_PEEK)) {
3635                         /*
3636                          * We get here because m points to an mbuf following
3637                          * any MT_SONAME or MT_CONTROL mbufs which have been
3638                          * processed above.  In any case, m should be pointing
3639                          * to the head of the mbuf chain, and the nextrecord
3640                          * should be either NULL or equal to m->m_nextpkt.
3641                          * See comments above about SB_LOCK.
3642                          */
3643                         if (m != so->so_rcv.sb_mb ||
3644                             m->m_nextpkt != nextrecord) {
3645                                 panic("%s: post-control !sync so=%p m=%p "
3646                                     "nextrecord=%p\n", __func__, so, m,
3647                                     nextrecord);
3648                                 /* NOTREACHED */
3649                         }
3650                         if (nextrecord == NULL) {
3651                                 so->so_rcv.sb_lastrecord = m;
3652                         }
3653                 }
3654                 type = m->m_type;
3655                 if (type == MT_OOBDATA) {
3656                         flags |= MSG_OOB;
3657                 }
3658         } else {
3659                 if (!(flags & MSG_PEEK)) {
3660                         SB_EMPTY_FIXUP(&so->so_rcv);
3661                 }
3662         }
3663         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3664         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3665
3666         moff = 0;
3667         offset = 0;
3668
3669         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3670                 can_delay = 1;
3671         } else {
3672                 can_delay = 0;
3673         }
3674
3675         while (m != NULL &&
3676             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3677                 if (m->m_type == MT_OOBDATA) {
3678                         if (type != MT_OOBDATA) {
3679                                 break;
3680                         }
3681                 } else if (type == MT_OOBDATA) {
3682                         break;
3683                 }
3684                 /*
3685                  * Make sure to always set MSG_OOB event when getting
3686                  * out of band data inline.
3687                  */
3688                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3689                     (so->so_options & SO_OOBINLINE) != 0 &&
3690                     (so->so_state & SS_RCVATMARK) != 0) {
3691                         flags |= MSG_OOB;
3692                 }
3693                 so->so_state &= ~SS_RCVATMARK;
3694                 len = uio_resid(uio) - delayed_copy_len;
3695                 if (so->so_oobmark && len > so->so_oobmark - offset) {
3696                         len = so->so_oobmark - offset;
3697                 }
3698                 if (len > m->m_len - moff) {
3699                         len = m->m_len - moff;
3700                 }
3701                 /*
3702                  * If mp is set, just pass back the mbufs.
3703                  * Otherwise copy them out via the uio, then free.
3704                  * Sockbuf must be consistent here (points to current mbuf,
3705                  * it points to next record) when we drop priority;
3706                  * we must note any additions to the sockbuf when we
3707                  * block interrupts again.
3708                  */
3709                 if (mp == NULL) {
3710                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3711                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3712                         if (can_delay && len == m->m_len) {
3713                                 /*
3714                                  * only delay the copy if we're consuming the
3715                                  * mbuf and we're NOT in MSG_PEEK mode
3716                                  * and we have enough data to make it worthwhile
3717                                  * to drop and retake the lock... can_delay
3718                                  * reflects the state of the 2 latter
3719                                  * constraints moff should always be zero
3720                                  * in these cases
3721                                  */
3722                                 delayed_copy_len += len;
3723                         } else {
3724                                 if (delayed_copy_len) {
3725                                         error = sodelayed_copy(so, uio,
3726                                             &free_list, &delayed_copy_len);
3727
3728                                         if (error) {
3729                                                 goto release;
3730                                         }
3731                                         /*
3732                                          * can only get here if MSG_PEEK is not
3733                                          * set therefore, m should point at the
3734                                          * head of the rcv queue; if it doesn't,
3735                                          * it means something drastically
3736                                          * changed while we were out from behind
3737                                          * the lock in sodelayed_copy. perhaps
3738                                          * a RST on the stream. in any event,
3739                                          * the stream has been interrupted. it's
3740                                          * probably best just to return whatever
3741                                          * data we've moved and let the caller
3742                                          * sort it out...
3743                                          */
3744                                         if (m != so->so_rcv.sb_mb) {
3745                                                 break;
3746                                         }
3747                                 }
3748                                 socket_unlock(so, 0);
3749                                 error = uiomove(mtod(m, caddr_t) + moff,
3750                                     (int)len, uio);
3751                                 socket_lock(so, 0);
3752
3753                                 if (error) {
3754                                         goto release;
3755                                 }
3756                         }
3757                 } else {
3758                         uio_setresid(uio, (uio_resid(uio) - len));
3759                 }
3760                 if (len == m->m_len - moff) {
3761                         if (m->m_flags & M_EOR) {
3762                                 flags |= MSG_EOR;
3763                         }
3764                         if (flags & MSG_PEEK) {
3765                                 m = m->m_next;
3766                                 moff = 0;
3767                         } else {
3768                                 nextrecord = m->m_nextpkt;
3769                                 sbfree(&so->so_rcv, m);
3770                                 m->m_nextpkt = NULL;
3771
3772                                 if (mp != NULL) {
3773                                         *mp = m;
3774                                         mp = &m->m_next;
3775                                         so->so_rcv.sb_mb = m = m->m_next;
3776                                         *mp = NULL;
3777                                 } else {
3778                                         if (free_list == NULL) {
3779                                                 free_list = m;
3780                                         } else {
3781                                                 ml->m_next = m;
3782                                         }
3783                                         ml = m;
3784                                         so->so_rcv.sb_mb = m = m->m_next;
3785                                         ml->m_next = NULL;
3786                                 }
3787                                 if (m != NULL) {
3788                                         m->m_nextpkt = nextrecord;
3789                                         if (nextrecord == NULL) {
3790                                                 so->so_rcv.sb_lastrecord = m;
3791                                         }
3792                                 } else {
3793                                         so->so_rcv.sb_mb = nextrecord;
3794                                         SB_EMPTY_FIXUP(&so->so_rcv);
3795                                 }
3796                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3797                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3798                         }
3799                 } else {
3800                         if (flags & MSG_PEEK) {
3801                                 moff += len;
3802                         } else {
3803                                 if (mp != NULL) {
3804                                         int copy_flag;
3805
3806                                         if (flags & MSG_DONTWAIT) {
3807                                                 copy_flag = M_DONTWAIT;
3808                                         } else {
3809                                                 copy_flag = M_WAIT;
3810                                         }
3811                                         *mp = m_copym(m, 0, len, copy_flag);
3812                                         /*
3813                                          * Failed to allocate an mbuf?
3814                                          * Adjust uio_resid back, it was
3815                                          * adjusted down by len bytes which
3816                                          * we didn't copy over.
3817                                          */
3818                                         if (*mp == NULL) {
3819                                                 uio_setresid(uio,
3820                                                     (uio_resid(uio) + len));
3821                                                 break;
3822                                         }
3823                                 }
3824                                 m->m_data += len;
3825                                 m->m_len -= len;
3826                                 so->so_rcv.sb_cc -= len;
3827                         }
3828                 }
3829                 if (so->so_oobmark) {
3830                         if ((flags & MSG_PEEK) == 0) {
3831                                 so->so_oobmark -= len;
3832                                 if (so->so_oobmark == 0) {
3833                                         so->so_state |= SS_RCVATMARK;
3834                                         break;
3835                                 }
3836                         } else {
3837                                 offset += len;
3838                                 if (offset == so->so_oobmark) {
3839                                         break;
3840                                 }
3841                         }
3842                 }
3843                 if (flags & MSG_EOR) {
3844                         break;
3845                 }
3846                 /*
3847                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3848                  * (for non-atomic socket), we must not quit until
3849                  * "uio->uio_resid == 0" or an error termination.
3850                  * If a signal/timeout occurs, return with a short
3851                  * count but without error.  Keep sockbuf locked
3852                  * against other readers.
3853                  */
3854                 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3855                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3856                     !sosendallatonce(so) && !nextrecord) {
3857                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3858 #if CONTENT_FILTER
3859                             && cfil_sock_data_pending(&so->so_rcv) == 0
3860 #endif /* CONTENT_FILTER */
3861                             )) {
3862                                 goto release;
3863                         }
3864
3865                         /*
3866                          * Depending on the protocol (e.g. TCP), the following
3867                          * might cause the socket lock to be dropped and later
3868                          * be reacquired, and more data could have arrived and
3869                          * have been appended to the receive socket buffer by
3870                          * the time it returns.  Therefore, we only sleep in
3871                          * sbwait() below if and only if the socket buffer is
3872                          * empty, in order to avoid a false sleep.
3873                          */
3874                         if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3875                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3876                         }
3877
3878                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3879                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3880
3881                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3882                                 error = 0;
3883                                 goto release;
3884                         }
3885                         /*
3886                          * have to wait until after we get back from the sbwait
3887                          * to do the copy because we will drop the lock if we
3888                          * have enough data that has been delayed... by dropping
3889                          * the lock we open up a window allowing the netisr
3890                          * thread to process the incoming packets and to change
3891                          * the state of this socket... we're issuing the sbwait
3892                          * because the socket is empty and we're expecting the
3893                          * netisr thread to wake us up when more packets arrive;
3894                          * if we allow that processing to happen and then sbwait
3895                          * we could stall forever with packets sitting in the
3896                          * socket if no further packets arrive from the remote
3897                          * side.
3898                          *
3899                          * we want to copy before we've collected all the data
3900                          * to satisfy this request to allow the copy to overlap
3901                          * the incoming packet processing on an MP system
3902                          */
3903                         if (delayed_copy_len > sorecvmincopy &&
3904                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3905                                 error = sodelayed_copy(so, uio,
3906                                     &free_list, &delayed_copy_len);
3907
3908                                 if (error) {
3909                                         goto release;
3910                                 }
3911                         }
3912                         m = so->so_rcv.sb_mb;
3913                         if (m != NULL) {
3914                                 nextrecord = m->m_nextpkt;
3915                         }
3916                         SB_MB_CHECK(&so->so_rcv);
3917                 }
3918         }
3919 #ifdef MORE_LOCKING_DEBUG
3920         if (so->so_usecount <= 1) {
3921                 panic("%s: after big while so=%p ref=%d on socket\n",
3922                     __func__, so, so->so_usecount);
3923                 /* NOTREACHED */
3924         }
3925 #endif
3926
3927         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3928                 if (so->so_options & SO_DONTTRUNC) {
3929                         flags |= MSG_RCVMORE;
3930                 } else {
3931                         flags |= MSG_TRUNC;
3932                         if ((flags & MSG_PEEK) == 0) {
3933                                 (void) sbdroprecord(&so->so_rcv);
3934                         }
3935                 }
3936         }
3937
3938         /*
3939          * pru_rcvd below (for TCP) may cause more data to be received
3940          * if the socket lock is dropped prior to sending the ACK; some
3941          * legacy OpenTransport applications don't handle this well
3942          * (if it receives less data than requested while MSG_HAVEMORE
3943          * is set), and so we set the flag now based on what we know
3944          * prior to calling pru_rcvd.
3945          */
3946         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3947                 flags |= MSG_HAVEMORE;
3948         }
3949
3950         if ((flags & MSG_PEEK) == 0) {
3951                 if (m == NULL) {
3952                         so->so_rcv.sb_mb = nextrecord;
3953                         /*
3954                          * First part is an inline SB_EMPTY_FIXUP().  Second
3955                          * part makes sure sb_lastrecord is up-to-date if
3956                          * there is still data in the socket buffer.
3957                          */
3958                         if (so->so_rcv.sb_mb == NULL) {
3959                                 so->so_rcv.sb_mbtail = NULL;
3960                                 so->so_rcv.sb_lastrecord = NULL;
3961                         } else if (nextrecord->m_nextpkt == NULL) {
3962                                 so->so_rcv.sb_lastrecord = nextrecord;
3963                         }
3964                         SB_MB_CHECK(&so->so_rcv);
3965                 }
3966                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3967                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3968                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3969                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3970                 }
3971         }
3972
3973         if (delayed_copy_len) {
3974                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3975                 if (error) {
3976                         goto release;
3977                 }
3978         }
3979         if (free_list != NULL) {
3980                 m_freem_list(free_list);
3981                 free_list = NULL;
3982         }
3983
3984         if (orig_resid == uio_resid(uio) && orig_resid &&
3985             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3986                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3987                 goto restart;
3988         }
3989
3990         if (flagsp != NULL) {
3991                 *flagsp |= flags;
3992         }
3993 release:
3994 #ifdef MORE_LOCKING_DEBUG
3995         if (so->so_usecount <= 1) {
3996                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3997                     so, so->so_usecount);
3998                 /* NOTREACHED */
3999         }
4000 #endif
4001         if (delayed_copy_len) {
4002                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4003         }
4004
4005         if (free_list != NULL) {
4006                 m_freem_list(free_list);
4007         }
4008
4009         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4010
4011         if (en_tracing) {
4012                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4013                     VM_KERNEL_ADDRPERM(so),
4014                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4015                     (int64_t)(orig_resid - uio_resid(uio)));
4016         }
4017         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4018             so->so_rcv.sb_cc, 0, error);
4019
4020         return error;
4021 }
4022
4023 /*
4024  * Returns:     0                       Success
4025  *      uiomove:EFAULT
4026  */
4027 static int
4028 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4029     user_ssize_t *resid)
4030 {
4031         int error = 0;
4032         struct mbuf *m;
4033
4034         m = *free_list;
4035
4036         socket_unlock(so, 0);
4037
4038         while (m != NULL && error == 0) {
4039                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4040                 m = m->m_next;
4041         }
4042         m_freem_list(*free_list);
4043
4044         *free_list = NULL;
4045         *resid = 0;
4046
4047         socket_lock(so, 0);
4048
4049         return error;
4050 }
4051
4052 static int
4053 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4054     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4055 {
4056 #pragma unused(so)
4057         int error = 0;
4058         struct mbuf *ml, *m;
4059         int i = 0;
4060         struct uio *auio;
4061
4062         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4063             ml = ml->m_nextpkt, i++) {
4064                 auio = msgarray[i].uio;
4065                 for (m = ml; m != NULL; m = m->m_next) {
4066                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4067                         if (error != 0) {
4068                                 goto out;
4069                         }
4070                 }
4071         }
4072 out:
4073         m_freem_list(*free_list);
4074
4075         *free_list = NULL;
4076         *resid = 0;
4077
4078         return error;
4079 }
4080
4081 int
4082 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4083     int *flagsp)
4084 {
4085         struct mbuf *m;
4086         struct mbuf *nextrecord;
4087         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4088         int error;
4089         user_ssize_t len, pktlen, delayed_copy_len = 0;
4090         struct protosw *pr = so->so_proto;
4091         user_ssize_t resid;
4092         struct proc *p = current_proc();
4093         struct uio *auio = NULL;
4094         int npkts = 0;
4095         int sblocked = 0;
4096         struct sockaddr **psa = NULL;
4097         struct mbuf **controlp = NULL;
4098         int can_delay;
4099         int flags;
4100         struct mbuf *free_others = NULL;
4101
4102         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4103             so, uiocnt,
4104             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4105
4106         /*
4107          * Sanity checks:
4108          * - Only supports don't wait flags
4109          * - Only support datagram sockets (could be extended to raw)
4110          * - Must be atomic
4111          * - Protocol must support packet chains
4112          * - The uio array is NULL (should we panic?)
4113          */
4114         if (flagsp != NULL) {
4115                 flags = *flagsp;
4116         } else {
4117                 flags = 0;
4118         }
4119         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4120             MSG_NBIO)) {
4121                 printf("%s invalid flags 0x%x\n", __func__, flags);
4122                 error = EINVAL;
4123                 goto out;
4124         }
4125         if (so->so_type != SOCK_DGRAM) {
4126                 error = EINVAL;
4127                 goto out;
4128         }
4129         if (sosendallatonce(so) == 0) {
4130                 error = EINVAL;
4131                 goto out;
4132         }
4133         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4134                 error = EPROTONOSUPPORT;
4135                 goto out;
4136         }
4137         if (msgarray == NULL) {
4138                 printf("%s uioarray is NULL\n", __func__);
4139                 error = EINVAL;
4140                 goto out;
4141         }
4142         if (uiocnt == 0) {
4143                 printf("%s uiocnt is 0\n", __func__);
4144                 error = EINVAL;
4145                 goto out;
4146         }
4147         /*
4148          * Sanity check on the length passed by caller as we are making 'int'
4149          * comparisons
4150          */
4151         resid = recv_msg_array_resid(msgarray, uiocnt);
4152         if (resid < 0 || resid > INT_MAX) {
4153                 error = EINVAL;
4154                 goto out;
4155         }
4156
4157         if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4158                 can_delay = 1;
4159         } else {
4160                 can_delay = 0;
4161         }
4162
4163         socket_lock(so, 1);
4164         so_update_last_owner_locked(so, p);
4165         so_update_policy(so);
4166
4167 #if NECP
4168         so_update_necp_policy(so, NULL, NULL);
4169 #endif /* NECP */
4170
4171         /*
4172          * If a recv attempt is made on a previously-accepted socket
4173          * that has been marked as inactive (disconnected), reject
4174          * the request.
4175          */
4176         if (so->so_flags & SOF_DEFUNCT) {
4177                 struct sockbuf *sb = &so->so_rcv;
4178
4179                 error = ENOTCONN;
4180                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4181                     __func__, proc_pid(p), proc_best_name(p),
4182                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4183                     SOCK_DOM(so), SOCK_TYPE(so), error);
4184                 /*
4185                  * This socket should have been disconnected and flushed
4186                  * prior to being returned from sodefunct(); there should
4187                  * be no data on its receive list, so panic otherwise.
4188                  */
4189                 if (so->so_state & SS_DEFUNCT) {
4190                         sb_empty_assert(sb, __func__);
4191                 }
4192                 goto release;
4193         }
4194
4195 next:
4196         /*
4197          * The uio may be empty
4198          */
4199         if (npkts >= uiocnt) {
4200                 error = 0;
4201                 goto release;
4202         }
4203 restart:
4204         /*
4205          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4206          * and if so just return to the caller.  This could happen when
4207          * soreceive() is called by a socket upcall function during the
4208          * time the socket is freed.  The socket buffer would have been
4209          * locked across the upcall, therefore we cannot put this thread
4210          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4211          * we may livelock), because the lock on the socket buffer will
4212          * only be released when the upcall routine returns to its caller.
4213          * Because the socket has been officially closed, there can be
4214          * no further read on it.
4215          */
4216         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4217             (SS_NOFDREF | SS_CANTRCVMORE)) {
4218                 error = 0;
4219                 goto release;
4220         }
4221
4222         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4223         if (error) {
4224                 goto release;
4225         }
4226         sblocked = 1;
4227
4228         m = so->so_rcv.sb_mb;
4229         /*
4230          * Block awaiting more datagram if needed
4231          */
4232         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4233             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4234             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4235                 /*
4236                  * Panic if we notice inconsistencies in the socket's
4237                  * receive list; both sb_mb and sb_cc should correctly
4238                  * reflect the contents of the list, otherwise we may
4239                  * end up with false positives during select() or poll()
4240                  * which could put the application in a bad state.
4241                  */
4242                 SB_MB_CHECK(&so->so_rcv);
4243
4244                 if (so->so_error) {
4245                         error = so->so_error;
4246                         if ((flags & MSG_PEEK) == 0) {
4247                                 so->so_error = 0;
4248                         }
4249                         goto release;
4250                 }
4251                 if (so->so_state & SS_CANTRCVMORE) {
4252                         goto release;
4253                 }
4254                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4255                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4256                         error = ENOTCONN;
4257                         goto release;
4258                 }
4259                 if ((so->so_state & SS_NBIO) ||
4260                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4261                         error = EWOULDBLOCK;
4262                         goto release;
4263                 }
4264                 /*
4265                  * Do not block if we got some data
4266                  */
4267                 if (free_list != NULL) {
4268                         error = 0;
4269                         goto release;
4270                 }
4271
4272                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4273                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4274
4275                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4276                 sblocked = 0;
4277
4278                 error = sbwait(&so->so_rcv);
4279                 if (error) {
4280                         goto release;
4281                 }
4282                 goto restart;
4283         }
4284
4285         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4286         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4287         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4288
4289         /*
4290          * Consume the current uio index as we have a datagram
4291          */
4292         auio = msgarray[npkts].uio;
4293         resid = uio_resid(auio);
4294         msgarray[npkts].which |= SOCK_MSG_DATA;
4295         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4296             &msgarray[npkts].psa : NULL;
4297         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4298             &msgarray[npkts].controlp : NULL;
4299         npkts += 1;
4300         nextrecord = m->m_nextpkt;
4301
4302         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4303                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4304                 if (error == ERESTART) {
4305                         goto restart;
4306                 } else if (error != 0) {
4307                         goto release;
4308                 }
4309         }
4310
4311         if (m != NULL && m->m_type == MT_CONTROL) {
4312                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4313                 if (error != 0) {
4314                         goto release;
4315                 }
4316         }
4317
4318         if (m->m_pkthdr.len == 0) {
4319                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4320                     __func__, __LINE__,
4321                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4322                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4323                     m->m_type);
4324         }
4325
4326         /*
4327          * Loop to copy the mbufs of the current record
4328          * Support zero length packets
4329          */
4330         ml = NULL;
4331         pktlen = 0;
4332         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4333                 if (m->m_len == 0) {
4334                         panic("%p m_len zero", m);
4335                 }
4336                 if (m->m_type == 0) {
4337                         panic("%p m_type zero", m);
4338                 }
4339                 /*
4340                  * Clip to the residual length
4341                  */
4342                 if (len > m->m_len) {
4343                         len = m->m_len;
4344                 }
4345                 pktlen += len;
4346                 /*
4347                  * Copy the mbufs via the uio or delay the copy
4348                  * Sockbuf must be consistent here (points to current mbuf,
4349                  * it points to next record) when we drop priority;
4350                  * we must note any additions to the sockbuf when we
4351                  * block interrupts again.
4352                  */
4353                 if (len > 0 && can_delay == 0) {
4354                         socket_unlock(so, 0);
4355                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4356                         socket_lock(so, 0);
4357                         if (error) {
4358                                 goto release;
4359                         }
4360                 } else {
4361                         delayed_copy_len += len;
4362                 }
4363
4364                 if (len == m->m_len) {
4365                         /*
4366                          * m was entirely copied
4367                          */
4368                         sbfree(&so->so_rcv, m);
4369                         nextrecord = m->m_nextpkt;
4370                         m->m_nextpkt = NULL;
4371
4372                         /*
4373                          * Set the first packet to the head of the free list
4374                          */
4375                         if (free_list == NULL) {
4376                                 free_list = m;
4377                         }
4378                         /*
4379                          * Link current packet to tail of free list
4380                          */
4381                         if (ml == NULL) {
4382                                 if (free_tail != NULL) {
4383                                         free_tail->m_nextpkt = m;
4384                                 }
4385                                 free_tail = m;
4386                         }
4387                         /*
4388                          * Link current mbuf to last mbuf of current packet
4389                          */
4390                         if (ml != NULL) {
4391                                 ml->m_next = m;
4392                         }
4393                         ml = m;
4394
4395                         /*
4396                          * Move next buf to head of socket buffer
4397                          */
4398                         so->so_rcv.sb_mb = m = ml->m_next;
4399                         ml->m_next = NULL;
4400
4401                         if (m != NULL) {
4402                                 m->m_nextpkt = nextrecord;
4403                                 if (nextrecord == NULL) {
4404                                         so->so_rcv.sb_lastrecord = m;
4405                                 }
4406                         } else {
4407                                 so->so_rcv.sb_mb = nextrecord;
4408                                 SB_EMPTY_FIXUP(&so->so_rcv);
4409                         }
4410                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4411                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4412                 } else {
4413                         /*
4414                          * Stop the loop on partial copy
4415                          */
4416                         break;
4417                 }
4418         }
4419 #ifdef MORE_LOCKING_DEBUG
4420         if (so->so_usecount <= 1) {
4421                 panic("%s: after big while so=%llx ref=%d on socket\n",
4422                     __func__,
4423                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4424                 /* NOTREACHED */
4425         }
4426 #endif
4427         /*
4428          * Tell the caller we made a partial copy
4429          */
4430         if (m != NULL) {
4431                 if (so->so_options & SO_DONTTRUNC) {
4432                         /*
4433                          * Copyout first the freelist then the partial mbuf
4434                          */
4435                         socket_unlock(so, 0);
4436                         if (delayed_copy_len) {
4437                                 error = sodelayed_copy_list(so, msgarray,
4438                                     uiocnt, &free_list, &delayed_copy_len);
4439                         }
4440
4441                         if (error == 0) {
4442                                 error = uiomove(mtod(m, caddr_t), (int)len,
4443                                     auio);
4444                         }
4445                         socket_lock(so, 0);
4446                         if (error) {
4447                                 goto release;
4448                         }
4449
4450                         m->m_data += len;
4451                         m->m_len -= len;
4452                         so->so_rcv.sb_cc -= len;
4453                         flags |= MSG_RCVMORE;
4454                 } else {
4455                         (void) sbdroprecord(&so->so_rcv);
4456                         nextrecord = so->so_rcv.sb_mb;
4457                         m = NULL;
4458                         flags |= MSG_TRUNC;
4459                 }
4460         }
4461
4462         if (m == NULL) {
4463                 so->so_rcv.sb_mb = nextrecord;
4464                 /*
4465                  * First part is an inline SB_EMPTY_FIXUP().  Second
4466                  * part makes sure sb_lastrecord is up-to-date if
4467                  * there is still data in the socket buffer.
4468                  */
4469                 if (so->so_rcv.sb_mb == NULL) {
4470                         so->so_rcv.sb_mbtail = NULL;
4471                         so->so_rcv.sb_lastrecord = NULL;
4472                 } else if (nextrecord->m_nextpkt == NULL) {
4473                         so->so_rcv.sb_lastrecord = nextrecord;
4474                 }
4475                 SB_MB_CHECK(&so->so_rcv);
4476         }
4477         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4478         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4479
4480         /*
4481          * We can continue to the next packet as long as:
4482          * - We haven't exhausted the uio array
4483          * - There was no error
4484          * - A packet was not truncated
4485          * - We can still receive more data
4486          */
4487         if (npkts < uiocnt && error == 0 &&
4488             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4489             (so->so_state & SS_CANTRCVMORE) == 0) {
4490                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4491                 sblocked = 0;
4492
4493                 goto next;
4494         }
4495         if (flagsp != NULL) {
4496                 *flagsp |= flags;
4497         }
4498
4499 release:
4500         /*
4501          * pru_rcvd may cause more data to be received if the socket lock
4502          * is dropped so we set MSG_HAVEMORE now based on what we know.
4503          * That way the caller won't be surprised if it receives less data
4504          * than requested.
4505          */
4506         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4507                 flags |= MSG_HAVEMORE;
4508         }
4509
4510         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4511                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4512         }
4513
4514         if (sblocked) {
4515                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4516         } else {
4517                 socket_unlock(so, 1);
4518         }
4519
4520         if (delayed_copy_len) {
4521                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4522                     &free_list, &delayed_copy_len);
4523         }
4524 out:
4525         /*
4526          * Amortize the cost of freeing the mbufs
4527          */
4528         if (free_list != NULL) {
4529                 m_freem_list(free_list);
4530         }
4531         if (free_others != NULL) {
4532                 m_freem_list(free_others);
4533         }
4534
4535         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4536             0, 0, 0, 0);
4537         return error;
4538 }
4539
4540 static int
4541 so_statistics_event_to_nstat_event(int64_t *input_options,
4542     uint64_t *nstat_event)
4543 {
4544         int error = 0;
4545         switch (*input_options) {
4546         case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4547                 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4548                 break;
4549         case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4550                 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4551                 break;
4552 #if (DEBUG || DEVELOPMENT)
4553         case SO_STATISTICS_EVENT_RESERVED_1:
4554                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4555                 break;
4556         case SO_STATISTICS_EVENT_RESERVED_2:
4557                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4558                 break;
4559 #endif /* (DEBUG || DEVELOPMENT) */
4560         default:
4561                 error = EINVAL;
4562                 break;
4563         }
4564         return error;
4565 }
4566
4567 /*
4568  * Returns:     0                       Success
4569  *              EINVAL
4570  *              ENOTCONN
4571  *      <pru_shutdown>:EINVAL
4572  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4573  *      <pru_shutdown>:ENOBUFS[TCP]
4574  *      <pru_shutdown>:EMSGSIZE[TCP]
4575  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4576  *      <pru_shutdown>:ENETUNREACH[TCP]
4577  *      <pru_shutdown>:ENETDOWN[TCP]
4578  *      <pru_shutdown>:ENOMEM[TCP]
4579  *      <pru_shutdown>:EACCES[TCP]
4580  *      <pru_shutdown>:EMSGSIZE[TCP]
4581  *      <pru_shutdown>:ENOBUFS[TCP]
4582  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4583  *      <pru_shutdown>:???              [other protocol families]
4584  */
4585 int
4586 soshutdown(struct socket *so, int how)
4587 {
4588         int error;
4589
4590         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4591
4592         switch (how) {
4593         case SHUT_RD:
4594         case SHUT_WR:
4595         case SHUT_RDWR:
4596                 socket_lock(so, 1);
4597                 if ((so->so_state &
4598                     (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4599                         error = ENOTCONN;
4600                 } else {
4601                         error = soshutdownlock(so, how);
4602                 }
4603                 socket_unlock(so, 1);
4604                 break;
4605         default:
4606                 error = EINVAL;
4607                 break;
4608         }
4609
4610         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4611
4612         return error;
4613 }
4614
4615 int
4616 soshutdownlock_final(struct socket *so, int how)
4617 {
4618         struct protosw *pr = so->so_proto;
4619         int error = 0;
4620
4621         sflt_notify(so, sock_evt_shutdown, &how);
4622
4623         if (how != SHUT_WR) {
4624                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4625                         /* read already shut down */
4626                         error = ENOTCONN;
4627                         goto done;
4628                 }
4629                 sorflush(so);
4630         }
4631         if (how != SHUT_RD) {
4632                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4633                         /* write already shut down */
4634                         error = ENOTCONN;
4635                         goto done;
4636                 }
4637                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4638         }
4639 done:
4640         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4641         return error;
4642 }
4643
/*
 * soshutdownlock
 *   Shut down the socket, first giving an attached content filter (when
 *   CONTENT_FILTER is built in) the chance to intercept the request.
 *
 * Returns:     0                       Success, or the content filter
 *                                      answered EJUSTRETURN
 *      cfil_sock_shutdown:???          Any other non-zero filter result
 *      soshutdownlock_final:???
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * A content filter may delay the actual shutdown until it
         * has processed the pending data
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_error = cfil_sock_shutdown(so, &how);

                if (cfil_error != 0) {
                        /* EJUSTRETURN is reported to the caller as success */
                        return (cfil_error == EJUSTRETURN) ? 0 : cfil_error;
                }
        }
#endif /* CONTENT_FILTER */

        return soshutdownlock_final(so, how);
}
4670
4671 void
4672 sowflush(struct socket *so)
4673 {
4674         struct sockbuf *sb = &so->so_snd;
4675
4676         /*
4677          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4678          * to prevent the socket buffer from being unexpectedly altered
4679          * while it is used by another thread in socket send/receive.
4680          *
4681          * sblock() must not fail here, hence the assertion.
4682          */
4683         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4684         VERIFY(sb->sb_flags & SB_LOCK);
4685
4686         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4687         sb->sb_flags            |= SB_DROP;
4688         sb->sb_upcall           = NULL;
4689         sb->sb_upcallarg        = NULL;
4690
4691         sbunlock(sb, TRUE);     /* keep socket locked */
4692
4693         selthreadclear(&sb->sb_sel);
4694         sbrelease(sb);
4695 }
4696
4697 void
4698 sorflush(struct socket *so)
4699 {
4700         struct sockbuf *sb = &so->so_rcv;
4701         struct protosw *pr = so->so_proto;
4702         struct sockbuf asb;
4703 #ifdef notyet
4704         lck_mtx_t *mutex_held;
4705         /*
4706          * XXX: This code is currently commented out, because we may get here
4707          * as part of sofreelastref(), and at that time, pr_getlock() may no
4708          * longer be able to return us the lock; this will be fixed in future.
4709          */
4710         if (so->so_proto->pr_getlock != NULL) {
4711                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4712         } else {
4713                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4714         }
4715
4716         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4717 #endif /* notyet */
4718
4719         sflt_notify(so, sock_evt_flush_read, NULL);
4720
4721         socantrcvmore(so);
4722
4723         /*
4724          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4725          * to prevent the socket buffer from being unexpectedly altered
4726          * while it is used by another thread in socket send/receive.
4727          *
4728          * sblock() must not fail here, hence the assertion.
4729          */
4730         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4731         VERIFY(sb->sb_flags & SB_LOCK);
4732
4733         /*
4734          * Copy only the relevant fields from "sb" to "asb" which we
4735          * need for sbrelease() to function.  In particular, skip
4736          * sb_sel as it contains the wait queue linkage, which would
4737          * wreak havoc if we were to issue selthreadclear() on "asb".
4738          * Make sure to not carry over SB_LOCK in "asb", as we need
4739          * to acquire it later as part of sbrelease().
4740          */
4741         bzero(&asb, sizeof(asb));
4742         asb.sb_cc               = sb->sb_cc;
4743         asb.sb_hiwat            = sb->sb_hiwat;
4744         asb.sb_mbcnt            = sb->sb_mbcnt;
4745         asb.sb_mbmax            = sb->sb_mbmax;
4746         asb.sb_ctl              = sb->sb_ctl;
4747         asb.sb_lowat            = sb->sb_lowat;
4748         asb.sb_mb               = sb->sb_mb;
4749         asb.sb_mbtail           = sb->sb_mbtail;
4750         asb.sb_lastrecord       = sb->sb_lastrecord;
4751         asb.sb_so               = sb->sb_so;
4752         asb.sb_flags            = sb->sb_flags;
4753         asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4754         asb.sb_flags            |= SB_DROP;
4755
4756         /*
4757          * Ideally we'd bzero() these and preserve the ones we need;
4758          * but to do that we'd need to shuffle things around in the
4759          * sockbuf, and we can't do it now because there are KEXTS
4760          * that are directly referring to the socket structure.
4761          *
4762          * Setting SB_DROP acts as a barrier to prevent further appends.
4763          * Clearing SB_SEL is done for selthreadclear() below.
4764          */
4765         sb->sb_cc               = 0;
4766         sb->sb_hiwat            = 0;
4767         sb->sb_mbcnt            = 0;
4768         sb->sb_mbmax            = 0;
4769         sb->sb_ctl              = 0;
4770         sb->sb_lowat            = 0;
4771         sb->sb_mb               = NULL;
4772         sb->sb_mbtail           = NULL;
4773         sb->sb_lastrecord       = NULL;
4774         sb->sb_timeo.tv_sec     = 0;
4775         sb->sb_timeo.tv_usec    = 0;
4776         sb->sb_upcall           = NULL;
4777         sb->sb_upcallarg        = NULL;
4778         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4779         sb->sb_flags            |= SB_DROP;
4780
4781         sbunlock(sb, TRUE);     /* keep socket locked */
4782
4783         /*
4784          * Note that selthreadclear() is called on the original "sb" and
4785          * not the local "asb" because of the way wait queue linkage is
4786          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4787          * should no longer be set (cleared above.)
4788          */
4789         selthreadclear(&sb->sb_sel);
4790
4791         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4792                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4793         }
4794
4795         sbrelease(&asb);
4796 }
4797
4798 /*
4799  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4800  * an additional variant to handle the case where the option value needs
4801  * to be some kind of integer, but not a specific size.
4802  * In addition to their use here, these functions are also called by the
4803  * protocol-level pr_ctloutput() routines.
4804  *
4805  * Returns:     0                       Success
4806  *              EINVAL
4807  *      copyin:EFAULT
4808  */
4809 int
4810 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4811 {
4812         size_t  valsize;
4813
4814         /*
4815          * If the user gives us more than we wanted, we ignore it,
4816          * but if we don't get the minimum length the caller
4817          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4818          * is set to however much we actually retrieved.
4819          */
4820         if ((valsize = sopt->sopt_valsize) < minlen) {
4821                 return EINVAL;
4822         }
4823         if (valsize > len) {
4824                 sopt->sopt_valsize = valsize = len;
4825         }
4826
4827         if (sopt->sopt_p != kernproc) {
4828                 return copyin(sopt->sopt_val, buf, valsize);
4829         }
4830
4831         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4832         return 0;
4833 }
4834
4835 /*
4836  * sooptcopyin_timeval
4837  *   Copy in a timeval value into tv_p, and take into account whether the
4838  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4839  *   code here so that we can verify the 64-bit tv_sec value before we lose
4840  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4841  */
4842 static int
4843 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4844 {
4845         int                     error;
4846
4847         if (proc_is64bit(sopt->sopt_p)) {
4848                 struct user64_timeval   tv64;
4849
4850                 if (sopt->sopt_valsize < sizeof(tv64)) {
4851                         return EINVAL;
4852                 }
4853
4854                 sopt->sopt_valsize = sizeof(tv64);
4855                 if (sopt->sopt_p != kernproc) {
4856                         error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4857                         if (error != 0) {
4858                                 return error;
4859                         }
4860                 } else {
4861                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4862                             sizeof(tv64));
4863                 }
4864                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4865                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4866                         return EDOM;
4867                 }
4868
4869                 tv_p->tv_sec = tv64.tv_sec;
4870                 tv_p->tv_usec = tv64.tv_usec;
4871         } else {
4872                 struct user32_timeval   tv32;
4873
4874                 if (sopt->sopt_valsize < sizeof(tv32)) {
4875                         return EINVAL;
4876                 }
4877
4878                 sopt->sopt_valsize = sizeof(tv32);
4879                 if (sopt->sopt_p != kernproc) {
4880                         error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4881                         if (error != 0) {
4882                                 return error;
4883                         }
4884                 } else {
4885                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4886                             sizeof(tv32));
4887                 }
4888 #ifndef __LP64__
4889                 /*
4890                  * K64todo "comparison is always false due to
4891                  * limited range of data type"
4892                  */
4893                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4894                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4895                         return EDOM;
4896                 }
4897 #endif
4898                 tv_p->tv_sec = tv32.tv_sec;
4899                 tv_p->tv_usec = tv32.tv_usec;
4900         }
4901         return 0;
4902 }
4903
4904 int
4905 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4906     boolean_t ignore_delegate)
4907 {
4908         kauth_cred_t cred =  NULL;
4909         proc_t ep = PROC_NULL;
4910         uid_t uid;
4911         int error = 0;
4912
4913         if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4914                 ep = proc_find(so->e_pid);
4915                 if (ep) {
4916                         cred = kauth_cred_proc_ref(ep);
4917                 }
4918         }
4919
4920         uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4921
4922         /* uid is 0 for root */
4923         if (uid != 0 || !allow_root) {
4924                 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4925         }
4926         if (cred) {
4927                 kauth_cred_unref(&cred);
4928         }
4929         if (ep != PROC_NULL) {
4930                 proc_rele(ep);
4931         }
4932
4933         return error;
4934 }
4935
4936 /*
4937  * Returns:     0                       Success
4938  *              EINVAL
4939  *              ENOPROTOOPT
4940  *              ENOBUFS
4941  *              EDOM
4942  *      sooptcopyin:EINVAL
4943  *      sooptcopyin:EFAULT
4944  *      sooptcopyin_timeval:EINVAL
4945  *      sooptcopyin_timeval:EFAULT
4946  *      sooptcopyin_timeval:EDOM
4947  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4948  *      <pr_ctloutput>:???
4949  *      sflt_attach_private:???         [whatever a filter author chooses]
4950  *      <sf_setoption>:???              [whatever a filter author chooses]
4951  *
4952  * Notes:       Other <pru_listen> returns depend on the protocol family; all
4953  *              <sf_listen> returns depend on what the filter author causes
4954  *              their filter to return.
4955  */
4956 int
4957 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4958 {
4959         int     error, optval;
4960         int64_t long_optval;
4961         struct  linger l;
4962         struct  timeval tv;
4963
4964         if (sopt->sopt_dir != SOPT_SET) {
4965                 sopt->sopt_dir = SOPT_SET;
4966         }
4967
4968         if (dolock) {
4969                 socket_lock(so, 1);
4970         }
4971
4972         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4973             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4974             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4975                 /* the socket has been shutdown, no more sockopt's */
4976                 error = EINVAL;
4977                 goto out;
4978         }
4979
4980         error = sflt_setsockopt(so, sopt);
4981         if (error != 0) {
4982                 if (error == EJUSTRETURN) {
4983                         error = 0;
4984                 }
4985                 goto out;
4986         }
4987
4988         if (sopt->sopt_level != SOL_SOCKET) {
4989                 if (so->so_proto != NULL &&
4990                     so->so_proto->pr_ctloutput != NULL) {
4991                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4992                         goto out;
4993                 }
4994                 error = ENOPROTOOPT;
4995         } else {
4996                 /*
4997                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4998                  * the protocol layer, if needed.  A zero value returned from
4999                  * the handler means use default socket-level processing as
5000                  * done by the rest of this routine.  Otherwise, any other
5001                  * return value indicates that the option is unsupported.
5002                  */
5003                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5004                     pru_socheckopt(so, sopt)) != 0) {
5005                         goto out;
5006                 }
5007
5008                 error = 0;
5009                 switch (sopt->sopt_name) {
5010                 case SO_LINGER:
5011                 case SO_LINGER_SEC:
5012                         error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5013                         if (error != 0) {
5014                                 goto out;
5015                         }
5016
5017                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5018                             l.l_linger : l.l_linger * hz;
5019                         if (l.l_onoff != 0) {
5020                                 so->so_options |= SO_LINGER;
5021                         } else {
5022                                 so->so_options &= ~SO_LINGER;
5023                         }
5024                         break;
5025
5026                 case SO_DEBUG:
5027                 case SO_KEEPALIVE:
5028                 case SO_DONTROUTE:
5029                 case SO_USELOOPBACK:
5030                 case SO_BROADCAST:
5031                 case SO_REUSEADDR:
5032                 case SO_REUSEPORT:
5033                 case SO_OOBINLINE:
5034                 case SO_TIMESTAMP:
5035                 case SO_TIMESTAMP_MONOTONIC:
5036                 case SO_TIMESTAMP_CONTINUOUS:
5037                 case SO_DONTTRUNC:
5038                 case SO_WANTMORE:
5039                 case SO_WANTOOBFLAG:
5040                 case SO_NOWAKEFROMSLEEP:
5041                 case SO_NOAPNFALLBK:
5042                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5043                             sizeof(optval));
5044                         if (error != 0) {
5045                                 goto out;
5046                         }
5047                         if (optval) {
5048                                 so->so_options |= sopt->sopt_name;
5049                         } else {
5050                                 so->so_options &= ~sopt->sopt_name;
5051                         }
5052                         break;
5053
5054                 case SO_SNDBUF:
5055                 case SO_RCVBUF:
5056                 case SO_SNDLOWAT:
5057                 case SO_RCVLOWAT:
5058                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5059                             sizeof(optval));
5060                         if (error != 0) {
5061                                 goto out;
5062                         }
5063
5064                         /*
5065                          * Values < 1 make no sense for any of these
5066                          * options, so disallow them.
5067                          */
5068                         if (optval < 1) {
5069                                 error = EINVAL;
5070                                 goto out;
5071                         }
5072
5073                         switch (sopt->sopt_name) {
5074                         case SO_SNDBUF:
5075                         case SO_RCVBUF: {
5076                                 struct sockbuf *sb =
5077                                     (sopt->sopt_name == SO_SNDBUF) ?
5078                                     &so->so_snd : &so->so_rcv;
5079                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5080                                         error = ENOBUFS;
5081                                         goto out;
5082                                 }
5083                                 sb->sb_flags |= SB_USRSIZE;
5084                                 sb->sb_flags &= ~SB_AUTOSIZE;
5085                                 sb->sb_idealsize = (u_int32_t)optval;
5086                                 break;
5087                         }
5088                         /*
5089                          * Make sure the low-water is never greater than
5090                          * the high-water.
5091                          */
5092                         case SO_SNDLOWAT: {
5093                                 int space = sbspace(&so->so_snd);
5094                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
5095
5096                                 if (so->so_snd.sb_flags & SB_UNIX) {
5097                                         struct unpcb *unp =
5098                                             (struct unpcb *)(so->so_pcb);
5099                                         if (unp != NULL &&
5100                                             unp->unp_conn != NULL) {
5101                                                 hiwat += unp->unp_conn->unp_cc;
5102                                         }
5103                                 }
5104
5105                                 so->so_snd.sb_lowat =
5106                                     (optval > hiwat) ?
5107                                     hiwat : optval;
5108
5109                                 if (space >= so->so_snd.sb_lowat) {
5110                                         sowwakeup(so);
5111                                 }
5112                                 break;
5113                         }
5114                         case SO_RCVLOWAT: {
5115                                 int64_t data_len;
5116                                 so->so_rcv.sb_lowat =
5117                                     (optval > so->so_rcv.sb_hiwat) ?
5118                                     so->so_rcv.sb_hiwat : optval;
5119                                 data_len = so->so_rcv.sb_cc
5120                                     - so->so_rcv.sb_ctl;
5121                                 if (data_len >= so->so_rcv.sb_lowat) {
5122                                         sorwakeup(so);
5123                                 }
5124                                 break;
5125                         }
5126                         }
5127                         break;
5128
5129                 case SO_SNDTIMEO:
5130                 case SO_RCVTIMEO:
5131                         error = sooptcopyin_timeval(sopt, &tv);
5132                         if (error != 0) {
5133                                 goto out;
5134                         }
5135
5136                         switch (sopt->sopt_name) {
5137                         case SO_SNDTIMEO:
5138                                 so->so_snd.sb_timeo = tv;
5139                                 break;
5140                         case SO_RCVTIMEO:
5141                                 so->so_rcv.sb_timeo = tv;
5142                                 break;
5143                         }
5144                         break;
5145
5146                 case SO_NKE: {
5147                         struct so_nke nke;
5148
5149                         error = sooptcopyin(sopt, &nke, sizeof(nke),
5150                             sizeof(nke));
5151                         if (error != 0) {
5152                                 goto out;
5153                         }
5154
5155                         error = sflt_attach_internal(so, nke.nke_handle);
5156                         break;
5157                 }
5158
5159                 case SO_NOSIGPIPE:
5160                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5161                             sizeof(optval));
5162                         if (error != 0) {
5163                                 goto out;
5164                         }
5165                         if (optval != 0) {
5166                                 so->so_flags |= SOF_NOSIGPIPE;
5167                         } else {
5168                                 so->so_flags &= ~SOF_NOSIGPIPE;
5169                         }
5170                         break;
5171
5172                 case SO_NOADDRERR:
5173                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5174                             sizeof(optval));
5175                         if (error != 0) {
5176                                 goto out;
5177                         }
5178                         if (optval != 0) {
5179                                 so->so_flags |= SOF_NOADDRAVAIL;
5180                         } else {
5181                                 so->so_flags &= ~SOF_NOADDRAVAIL;
5182                         }
5183                         break;
5184
5185                 case SO_REUSESHAREUID:
5186                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5187                             sizeof(optval));
5188                         if (error != 0) {
5189                                 goto out;
5190                         }
5191                         if (optval != 0) {
5192                                 so->so_flags |= SOF_REUSESHAREUID;
5193                         } else {
5194                                 so->so_flags &= ~SOF_REUSESHAREUID;
5195                         }
5196                         break;
5197
5198                 case SO_NOTIFYCONFLICT:
5199                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5200                                 error = EPERM;
5201                                 goto out;
5202                         }
5203                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5204                             sizeof(optval));
5205                         if (error != 0) {
5206                                 goto out;
5207                         }
5208                         if (optval != 0) {
5209                                 so->so_flags |= SOF_NOTIFYCONFLICT;
5210                         } else {
5211                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5212                         }
5213                         break;
5214
5215                 case SO_RESTRICTIONS:
5216                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5217                             sizeof(optval));
5218                         if (error != 0) {
5219                                 goto out;
5220                         }
5221
5222                         error = so_set_restrictions(so, optval);
5223                         break;
5224
5225                 case SO_AWDL_UNRESTRICTED:
5226                         if (SOCK_DOM(so) != PF_INET &&
5227                             SOCK_DOM(so) != PF_INET6) {
5228                                 error = EOPNOTSUPP;
5229                                 goto out;
5230                         }
5231                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5232                             sizeof(optval));
5233                         if (error != 0) {
5234                                 goto out;
5235                         }
5236                         if (optval != 0) {
5237                                 error = soopt_cred_check(so,
5238                                     PRIV_NET_RESTRICTED_AWDL, false, false);
5239                                 if (error == 0) {
5240                                         inp_set_awdl_unrestricted(
5241                                                 sotoinpcb(so));
5242                                 }
5243                         } else {
5244                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
5245                         }
5246                         break;
5247                 case SO_INTCOPROC_ALLOW:
5248                         if (SOCK_DOM(so) != PF_INET6) {
5249                                 error = EOPNOTSUPP;
5250                                 goto out;
5251                         }
5252                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5253                             sizeof(optval));
5254                         if (error != 0) {
5255                                 goto out;
5256                         }
5257                         if (optval != 0 &&
5258                             inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5259                                 error = soopt_cred_check(so,
5260                                     PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5261                                 if (error == 0) {
5262                                         inp_set_intcoproc_allowed(
5263                                                 sotoinpcb(so));
5264                                 }
5265                         } else if (optval == 0) {
5266                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
5267                         }
5268                         break;
5269
5270                 case SO_LABEL:
5271                         error = EOPNOTSUPP;
5272                         break;
5273
5274                 case SO_UPCALLCLOSEWAIT:
5275                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5276                             sizeof(optval));
5277                         if (error != 0) {
5278                                 goto out;
5279                         }
5280                         if (optval != 0) {
5281                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5282                         } else {
5283                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5284                         }
5285                         break;
5286
5287                 case SO_RANDOMPORT:
5288                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5289                             sizeof(optval));
5290                         if (error != 0) {
5291                                 goto out;
5292                         }
5293                         if (optval != 0) {
5294                                 so->so_flags |= SOF_BINDRANDOMPORT;
5295                         } else {
5296                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
5297                         }
5298                         break;
5299
5300                 case SO_NP_EXTENSIONS: {
5301                         struct so_np_extensions sonpx;
5302
5303                         error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5304                             sizeof(sonpx));
5305                         if (error != 0) {
5306                                 goto out;
5307                         }
5308                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5309                                 error = EINVAL;
5310                                 goto out;
5311                         }
5312                         /*
5313                          * Only one bit defined for now
5314                          */
5315                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5316                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5317                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
5318                                 } else {
5319                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5320                                 }
5321                         }
5322                         break;
5323                 }
5324
5325                 case SO_TRAFFIC_CLASS: {
5326                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5327                             sizeof(optval));
5328                         if (error != 0) {
5329                                 goto out;
5330                         }
5331                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5332                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5333                                 error = so_set_net_service_type(so, netsvc);
5334                                 goto out;
5335                         }
5336                         error = so_set_traffic_class(so, optval);
5337                         if (error != 0) {
5338                                 goto out;
5339                         }
5340                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5341                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5342                         break;
5343                 }
5344
5345                 case SO_RECV_TRAFFIC_CLASS: {
5346                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5347                             sizeof(optval));
5348                         if (error != 0) {
5349                                 goto out;
5350                         }
5351                         if (optval == 0) {
5352                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5353                         } else {
5354                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5355                         }
5356                         break;
5357                 }
5358
5359 #if (DEVELOPMENT || DEBUG)
5360                 case SO_TRAFFIC_CLASS_DBG: {
5361                         struct so_tcdbg so_tcdbg;
5362
5363                         error = sooptcopyin(sopt, &so_tcdbg,
5364                             sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5365                         if (error != 0) {
5366                                 goto out;
5367                         }
5368                         error = so_set_tcdbg(so, &so_tcdbg);
5369                         if (error != 0) {
5370                                 goto out;
5371                         }
5372                         break;
5373                 }
5374 #endif /* (DEVELOPMENT || DEBUG) */
5375
5376                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5377                         error = priv_check_cred(kauth_cred_get(),
5378                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5379                         if (error != 0) {
5380                                 goto out;
5381                         }
5382                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5383                             sizeof(optval));
5384                         if (error != 0) {
5385                                 goto out;
5386                         }
5387                         if (optval == 0) {
5388                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5389                         } else {
5390                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5391                         }
5392                         break;
5393
5394 #if (DEVELOPMENT || DEBUG)
5395                 case SO_DEFUNCTIT:
5396                         error = sosetdefunct(current_proc(), so, 0, FALSE);
5397                         if (error == 0) {
5398                                 error = sodefunct(current_proc(), so, 0);
5399                         }
5400
5401                         break;
5402 #endif /* (DEVELOPMENT || DEBUG) */
5403
5404                 case SO_DEFUNCTOK:
5405                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5406                             sizeof(optval));
5407                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5408                                 if (error == 0) {
5409                                         error = EBADF;
5410                                 }
5411                                 goto out;
5412                         }
5413                         /*
5414                          * Any process can set SO_DEFUNCTOK (clear
5415                          * SOF_NODEFUNCT), but only root can clear
5416                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5417                          */
5418                         if (optval == 0 &&
5419                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5420                                 error = EPERM;
5421                                 goto out;
5422                         }
5423                         if (optval) {
5424                                 so->so_flags &= ~SOF_NODEFUNCT;
5425                         } else {
5426                                 so->so_flags |= SOF_NODEFUNCT;
5427                         }
5428
5429                         if (SOCK_DOM(so) == PF_INET ||
5430                             SOCK_DOM(so) == PF_INET6) {
5431                                 char s[MAX_IPv6_STR_LEN];
5432                                 char d[MAX_IPv6_STR_LEN];
5433                                 struct inpcb *inp = sotoinpcb(so);
5434
5435                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5436                                     "[%s %s:%d -> %s:%d] is now marked "
5437                                     "as %seligible for "
5438                                     "defunct\n", __func__, proc_selfpid(),
5439                                     proc_best_name(current_proc()),
5440                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5441                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5442                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5443                                     ((SOCK_DOM(so) == PF_INET) ?
5444                                     (void *)&inp->inp_laddr.s_addr :
5445                                     (void *)&inp->in6p_laddr), s, sizeof(s)),
5446                                     ntohs(inp->in6p_lport),
5447                                     inet_ntop(SOCK_DOM(so),
5448                                     (SOCK_DOM(so) == PF_INET) ?
5449                                     (void *)&inp->inp_faddr.s_addr :
5450                                     (void *)&inp->in6p_faddr, d, sizeof(d)),
5451                                     ntohs(inp->in6p_fport),
5452                                     (so->so_flags & SOF_NODEFUNCT) ?
5453                                     "not " : "");
5454                         } else {
5455                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5456                                     "is now marked as %seligible for "
5457                                     "defunct\n",
5458                                     __func__, proc_selfpid(),
5459                                     proc_best_name(current_proc()),
5460                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5461                                     SOCK_DOM(so), SOCK_TYPE(so),
5462                                     (so->so_flags & SOF_NODEFUNCT) ?
5463                                     "not " : "");
5464                         }
5465                         break;
5466
5467                 case SO_ISDEFUNCT:
5468                         /* This option is not settable */
5469                         error = EINVAL;
5470                         break;
5471
5472                 case SO_OPPORTUNISTIC:
5473                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5474                             sizeof(optval));
5475                         if (error == 0) {
5476                                 error = so_set_opportunistic(so, optval);
5477                         }
5478                         break;
5479
5480                 case SO_FLUSH:
5481                         /* This option is handled by lower layer(s) */
5482                         error = 0;
5483                         break;
5484
5485                 case SO_RECV_ANYIF:
5486                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5487                             sizeof(optval));
5488                         if (error == 0) {
5489                                 error = so_set_recv_anyif(so, optval);
5490                         }
5491                         break;
5492
5493                 case SO_TRAFFIC_MGT_BACKGROUND: {
5494                         /* This option is handled by lower layer(s) */
5495                         error = 0;
5496                         break;
5497                 }
5498
5499 #if FLOW_DIVERT
5500                 case SO_FLOW_DIVERT_TOKEN:
5501                         error = flow_divert_token_set(so, sopt);
5502                         break;
5503 #endif  /* FLOW_DIVERT */
5504
5505
5506                 case SO_DELEGATED:
5507                         if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5508                             sizeof(optval))) != 0) {
5509                                 break;
5510                         }
5511
5512                         error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5513                         break;
5514
5515                 case SO_DELEGATED_UUID: {
5516                         uuid_t euuid;
5517
5518                         if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5519                             sizeof(euuid))) != 0) {
5520                                 break;
5521                         }
5522
5523                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5524                         break;
5525                 }
5526
5527 #if NECP
5528                 case SO_NECP_ATTRIBUTES:
5529                         error = necp_set_socket_attributes(so, sopt);
5530                         break;
5531
5532                 case SO_NECP_CLIENTUUID: {
5533                         if (SOCK_DOM(so) == PF_MULTIPATH) {
5534                                 /* Handled by MPTCP itself */
5535                                 break;
5536                         }
5537
5538                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5539                                 error = EINVAL;
5540                                 goto out;
5541                         }
5542
5543                         struct inpcb *inp = sotoinpcb(so);
5544                         if (!uuid_is_null(inp->necp_client_uuid)) {
5545                                 // Clear out the old client UUID if present
5546                                 necp_inpcb_remove_cb(inp);
5547                         }
5548
5549                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5550                             sizeof(uuid_t), sizeof(uuid_t));
5551                         if (error != 0) {
5552                                 goto out;
5553                         }
5554
5555                         if (uuid_is_null(inp->necp_client_uuid)) {
5556                                 error = EINVAL;
5557                                 goto out;
5558                         }
5559
5560                         pid_t current_pid = proc_pid(current_proc());
5561                         error = necp_client_register_socket_flow(current_pid,
5562                             inp->necp_client_uuid, inp);
5563                         if (error != 0) {
5564                                 uuid_clear(inp->necp_client_uuid);
5565                                 goto out;
5566                         }
5567
5568                         if (inp->inp_lport != 0) {
5569                                 // There is a bound local port, so this is not
5570                                 // a fresh socket. Assign to the client.
5571                                 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5572                         }
5573
5574                         break;
5575                 }
5576                 case SO_NECP_LISTENUUID: {
5577                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5578                                 error = EINVAL;
5579                                 goto out;
5580                         }
5581
5582                         struct inpcb *inp = sotoinpcb(so);
5583                         if (!uuid_is_null(inp->necp_client_uuid)) {
5584                                 error = EINVAL;
5585                                 goto out;
5586                         }
5587
5588                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5589                             sizeof(uuid_t), sizeof(uuid_t));
5590                         if (error != 0) {
5591                                 goto out;
5592                         }
5593
5594                         if (uuid_is_null(inp->necp_client_uuid)) {
5595                                 error = EINVAL;
5596                                 goto out;
5597                         }
5598
5599                         error = necp_client_register_socket_listener(proc_pid(current_proc()),
5600                             inp->necp_client_uuid, inp);
5601                         if (error != 0) {
5602                                 uuid_clear(inp->necp_client_uuid);
5603                                 goto out;
5604                         }
5605
5606                         // Mark that the port registration is held by NECP
5607                         inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5608
5609                         break;
5610                 }
5611 #endif /* NECP */
5612
5613                 case SO_EXTENDED_BK_IDLE:
5614                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5615                             sizeof(optval));
5616                         if (error == 0) {
5617                                 error = so_set_extended_bk_idle(so, optval);
5618                         }
5619                         break;
5620
5621                 case SO_MARK_CELLFALLBACK:
5622                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5623                             sizeof(optval));
5624                         if (error != 0) {
5625                                 goto out;
5626                         }
5627                         if (optval < 0) {
5628                                 error = EINVAL;
5629                                 goto out;
5630                         }
5631                         if (optval == 0) {
5632                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5633                         } else {
5634                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5635                         }
5636                         break;
5637
5638                 case SO_STATISTICS_EVENT:
5639                         error = sooptcopyin(sopt, &long_optval,
5640                             sizeof(long_optval), sizeof(long_optval));
5641                         if (error != 0) {
5642                                 goto out;
5643                         }
5644                         u_int64_t nstat_event = 0;
5645                         error = so_statistics_event_to_nstat_event(
5646                                 &long_optval, &nstat_event);
5647                         if (error != 0) {
5648                                 goto out;
5649                         }
5650                         nstat_pcb_event(sotoinpcb(so), nstat_event);
5651                         break;
5652
5653                 case SO_NET_SERVICE_TYPE: {
5654                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5655                             sizeof(optval));
5656                         if (error != 0) {
5657                                 goto out;
5658                         }
5659                         error = so_set_net_service_type(so, optval);
5660                         break;
5661                 }
5662
5663                 case SO_QOSMARKING_POLICY_OVERRIDE:
5664                         error = priv_check_cred(kauth_cred_get(),
5665                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5666                         if (error != 0) {
5667                                 goto out;
5668                         }
5669                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5670                             sizeof(optval));
5671                         if (error != 0) {
5672                                 goto out;
5673                         }
5674                         if (optval == 0) {
5675                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5676                         } else {
5677                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5678                         }
5679                         break;
5680
5681                 case SO_MPKL_SEND_INFO: {
5682                         struct so_mpkl_send_info so_mpkl_send_info;
5683
5684                         error = sooptcopyin(sopt, &so_mpkl_send_info,
5685                             sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5686                         if (error != 0) {
5687                                 goto out;
5688                         }
5689                         uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5690                         so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5691
5692                         if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5693                                 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5694                         } else {
5695                                 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5696                         }
5697                         break;
5698                 }
5699                 case SO_WANT_KEV_SOCKET_CLOSED: {
5700                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5701                             sizeof(optval));
5702                         if (error != 0) {
5703                                 goto out;
5704                         }
5705                         if (optval == 0) {
5706                                 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5707                         } else {
5708                                 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5709                         }
5710                         break;
5711                 }
5712                 default:
5713                         error = ENOPROTOOPT;
5714                         break;
5715                 }
5716                 if (error == 0 && so->so_proto != NULL &&
5717                     so->so_proto->pr_ctloutput != NULL) {
5718                         (void) so->so_proto->pr_ctloutput(so, sopt);
5719                 }
5720         }
5721 out:
5722         if (dolock) {
5723                 socket_unlock(so, 1);
5724         }
5725         return error;
5726 }
5727
5728 /* Helper routines for getsockopt */
5729 int
5730 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5731 {
5732         int     error;
5733         size_t  valsize;
5734
5735         error = 0;
5736
5737         /*
5738          * Documented get behavior is that we always return a value,
5739          * possibly truncated to fit in the user's buffer.
5740          * Traditional behavior is that we always tell the user
5741          * precisely how much we copied, rather than something useful
5742          * like the total amount we had available for her.
5743          * Note that this interface is not idempotent; the entire answer must
5744          * generated ahead of time.
5745          */
5746         valsize = min(len, sopt->sopt_valsize);
5747         sopt->sopt_valsize = valsize;
5748         if (sopt->sopt_val != USER_ADDR_NULL) {
5749                 if (sopt->sopt_p != kernproc) {
5750                         error = copyout(buf, sopt->sopt_val, valsize);
5751                 } else {
5752                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5753                 }
5754         }
5755         return error;
5756 }
5757
5758 static int
5759 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5760 {
5761         int                     error;
5762         size_t                  len;
5763         struct user64_timeval   tv64 = {};
5764         struct user32_timeval   tv32 = {};
5765         const void *            val;
5766         size_t                  valsize;
5767
5768         error = 0;
5769         if (proc_is64bit(sopt->sopt_p)) {
5770                 len = sizeof(tv64);
5771                 tv64.tv_sec = tv_p->tv_sec;
5772                 tv64.tv_usec = tv_p->tv_usec;
5773                 val = &tv64;
5774         } else {
5775                 len = sizeof(tv32);
5776                 tv32.tv_sec = tv_p->tv_sec;
5777                 tv32.tv_usec = tv_p->tv_usec;
5778                 val = &tv32;
5779         }
5780         valsize = min(len, sopt->sopt_valsize);
5781         sopt->sopt_valsize = valsize;
5782         if (sopt->sopt_val != USER_ADDR_NULL) {
5783                 if (sopt->sopt_p != kernproc) {
5784                         error = copyout(val, sopt->sopt_val, valsize);
5785                 } else {
5786                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5787                 }
5788         }
5789         return error;
5790 }
5791
5792 /*
5793  * Return:      0                       Success
5794  *              ENOPROTOOPT
5795  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5796  *      <pr_ctloutput>:???
5797  *      <sf_getoption>:???
5798  */
5799 int
5800 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5801 {
5802         int     error, optval;
5803         struct  linger l;
5804         struct  timeval tv;
5805
5806         if (sopt->sopt_dir != SOPT_GET) {
5807                 sopt->sopt_dir = SOPT_GET;
5808         }
5809
5810         if (dolock) {
5811                 socket_lock(so, 1);
5812         }
5813
5814         error = sflt_getsockopt(so, sopt);
5815         if (error != 0) {
5816                 if (error == EJUSTRETURN) {
5817                         error = 0;
5818                 }
5819                 goto out;
5820         }
5821
5822         if (sopt->sopt_level != SOL_SOCKET) {
5823                 if (so->so_proto != NULL &&
5824                     so->so_proto->pr_ctloutput != NULL) {
5825                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5826                         goto out;
5827                 }
5828                 error = ENOPROTOOPT;
5829         } else {
5830                 /*
5831                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5832                  * the protocol layer, if needed.  A zero value returned from
5833                  * the handler means use default socket-level processing as
5834                  * done by the rest of this routine.  Otherwise, any other
5835                  * return value indicates that the option is unsupported.
5836                  */
5837                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5838                     pru_socheckopt(so, sopt)) != 0) {
5839                         goto out;
5840                 }
5841
5842                 error = 0;
5843                 switch (sopt->sopt_name) {
5844                 case SO_LINGER:
5845                 case SO_LINGER_SEC:
5846                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5847                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5848                             so->so_linger : so->so_linger / hz;
5849                         error = sooptcopyout(sopt, &l, sizeof(l));
5850                         break;
5851
5852                 case SO_USELOOPBACK:
5853                 case SO_DONTROUTE:
5854                 case SO_DEBUG:
5855                 case SO_KEEPALIVE:
5856                 case SO_REUSEADDR:
5857                 case SO_REUSEPORT:
5858                 case SO_BROADCAST:
5859                 case SO_OOBINLINE:
5860                 case SO_TIMESTAMP:
5861                 case SO_TIMESTAMP_MONOTONIC:
5862                 case SO_TIMESTAMP_CONTINUOUS:
5863                 case SO_DONTTRUNC:
5864                 case SO_WANTMORE:
5865                 case SO_WANTOOBFLAG:
5866                 case SO_NOWAKEFROMSLEEP:
5867                 case SO_NOAPNFALLBK:
5868                         optval = so->so_options & sopt->sopt_name;
5869 integer:
5870                         error = sooptcopyout(sopt, &optval, sizeof(optval));
5871                         break;
5872
5873                 case SO_TYPE:
5874                         optval = so->so_type;
5875                         goto integer;
5876
5877                 case SO_NREAD:
5878                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5879                                 int pkt_total;
5880                                 struct mbuf *m1;
5881
5882                                 pkt_total = 0;
5883                                 m1 = so->so_rcv.sb_mb;
5884                                 while (m1 != NULL) {
5885                                         if (m1->m_type == MT_DATA ||
5886                                             m1->m_type == MT_HEADER ||
5887                                             m1->m_type == MT_OOBDATA) {
5888                                                 pkt_total += m1->m_len;
5889                                         }
5890                                         m1 = m1->m_next;
5891                                 }
5892                                 optval = pkt_total;
5893                         } else {
5894                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5895                         }
5896                         goto integer;
5897
5898                 case SO_NUMRCVPKT:
5899                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5900                                 int cnt = 0;
5901                                 struct mbuf *m1;
5902
5903                                 m1 = so->so_rcv.sb_mb;
5904                                 while (m1 != NULL) {
5905                                         cnt += 1;
5906                                         m1 = m1->m_nextpkt;
5907                                 }
5908                                 optval = cnt;
5909                                 goto integer;
5910                         } else {
5911                                 error = ENOPROTOOPT;
5912                                 break;
5913                         }
5914
5915                 case SO_NWRITE:
5916                         optval = so->so_snd.sb_cc;
5917                         goto integer;
5918
5919                 case SO_ERROR:
5920                         optval = so->so_error;
5921                         so->so_error = 0;
5922                         goto integer;
5923
5924                 case SO_SNDBUF: {
5925                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5926
5927                         if (so->so_snd.sb_flags & SB_UNIX) {
5928                                 struct unpcb *unp =
5929                                     (struct unpcb *)(so->so_pcb);
5930                                 if (unp != NULL && unp->unp_conn != NULL) {
5931                                         hiwat += unp->unp_conn->unp_cc;
5932                                 }
5933                         }
5934
5935                         optval = hiwat;
5936                         goto integer;
5937                 }
5938                 case SO_RCVBUF:
5939                         optval = so->so_rcv.sb_hiwat;
5940                         goto integer;
5941
5942                 case SO_SNDLOWAT:
5943                         optval = so->so_snd.sb_lowat;
5944                         goto integer;
5945
5946                 case SO_RCVLOWAT:
5947                         optval = so->so_rcv.sb_lowat;
5948                         goto integer;
5949
5950                 case SO_SNDTIMEO:
5951                 case SO_RCVTIMEO:
5952                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5953                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5954
5955                         error = sooptcopyout_timeval(sopt, &tv);
5956                         break;
5957
5958                 case SO_NOSIGPIPE:
5959                         optval = (so->so_flags & SOF_NOSIGPIPE);
5960                         goto integer;
5961
5962                 case SO_NOADDRERR:
5963                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5964                         goto integer;
5965
5966                 case SO_REUSESHAREUID:
5967                         optval = (so->so_flags & SOF_REUSESHAREUID);
5968                         goto integer;
5969
5970
5971                 case SO_NOTIFYCONFLICT:
5972                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5973                         goto integer;
5974
5975                 case SO_RESTRICTIONS:
5976                         optval = so_get_restrictions(so);
5977                         goto integer;
5978
5979                 case SO_AWDL_UNRESTRICTED:
5980                         if (SOCK_DOM(so) == PF_INET ||
5981                             SOCK_DOM(so) == PF_INET6) {
5982                                 optval = inp_get_awdl_unrestricted(
5983                                         sotoinpcb(so));
5984                                 goto integer;
5985                         } else {
5986                                 error = EOPNOTSUPP;
5987                         }
5988                         break;
5989
5990                 case SO_INTCOPROC_ALLOW:
5991                         if (SOCK_DOM(so) == PF_INET6) {
5992                                 optval = inp_get_intcoproc_allowed(
5993                                         sotoinpcb(so));
5994                                 goto integer;
5995                         } else {
5996                                 error = EOPNOTSUPP;
5997                         }
5998                         break;
5999
6000                 case SO_LABEL:
6001                         error = EOPNOTSUPP;
6002                         break;
6003
6004                 case SO_PEERLABEL:
6005                         error = EOPNOTSUPP;
6006                         break;
6007
6008 #ifdef __APPLE_API_PRIVATE
6009                 case SO_UPCALLCLOSEWAIT:
6010                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6011                         goto integer;
6012 #endif
6013                 case SO_RANDOMPORT:
6014                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
6015                         goto integer;
6016
6017                 case SO_NP_EXTENSIONS: {
6018                         struct so_np_extensions sonpx = {};
6019
6020                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6021                             SONPX_SETOPTSHUT : 0;
6022                         sonpx.npx_mask = SONPX_MASK_VALID;
6023
6024                         error = sooptcopyout(sopt, &sonpx,
6025                             sizeof(struct so_np_extensions));
6026                         break;
6027                 }
6028
6029                 case SO_TRAFFIC_CLASS:
6030                         optval = so->so_traffic_class;
6031                         goto integer;
6032
6033                 case SO_RECV_TRAFFIC_CLASS:
6034                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6035                         goto integer;
6036
6037 #if (DEVELOPMENT || DEBUG)
6038                 case SO_TRAFFIC_CLASS_DBG:
6039                         error = sogetopt_tcdbg(so, sopt);
6040                         break;
6041 #endif /* (DEVELOPMENT || DEBUG) */
6042
6043                 case SO_PRIVILEGED_TRAFFIC_CLASS:
6044                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6045                         goto integer;
6046
6047                 case SO_DEFUNCTOK:
6048                         optval = !(so->so_flags & SOF_NODEFUNCT);
6049                         goto integer;
6050
6051                 case SO_ISDEFUNCT:
6052                         optval = (so->so_flags & SOF_DEFUNCT);
6053                         goto integer;
6054
6055                 case SO_OPPORTUNISTIC:
6056                         optval = so_get_opportunistic(so);
6057                         goto integer;
6058
6059                 case SO_FLUSH:
6060                         /* This option is not gettable */
6061                         error = EINVAL;
6062                         break;
6063
6064                 case SO_RECV_ANYIF:
6065                         optval = so_get_recv_anyif(so);
6066                         goto integer;
6067
6068                 case SO_TRAFFIC_MGT_BACKGROUND:
6069                         /* This option is handled by lower layer(s) */
6070                         if (so->so_proto != NULL &&
6071                             so->so_proto->pr_ctloutput != NULL) {
6072                                 (void) so->so_proto->pr_ctloutput(so, sopt);
6073                         }
6074                         break;
6075
6076 #if FLOW_DIVERT
6077                 case SO_FLOW_DIVERT_TOKEN:
6078                         error = flow_divert_token_get(so, sopt);
6079                         break;
6080 #endif  /* FLOW_DIVERT */
6081
6082 #if NECP
6083                 case SO_NECP_ATTRIBUTES:
6084                         error = necp_get_socket_attributes(so, sopt);
6085                         break;
6086
6087                 case SO_NECP_CLIENTUUID: {
6088                         uuid_t *ncu;
6089
6090                         if (SOCK_DOM(so) == PF_MULTIPATH) {
6091                                 ncu = &mpsotomppcb(so)->necp_client_uuid;
6092                         } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6093                                 ncu = &sotoinpcb(so)->necp_client_uuid;
6094                         } else {
6095                                 error = EINVAL;
6096                                 goto out;
6097                         }
6098
6099                         error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6100                         break;
6101                 }
6102
6103                 case SO_NECP_LISTENUUID: {
6104                         uuid_t *nlu;
6105
6106                         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6107                                 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6108                                         nlu = &sotoinpcb(so)->necp_client_uuid;
6109                                 } else {
6110                                         error = ENOENT;
6111                                         goto out;
6112                                 }
6113                         } else {
6114                                 error = EINVAL;
6115                                 goto out;
6116                         }
6117
6118                         error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6119                         break;
6120                 }
6121 #endif /* NECP */
6122
6123 #if CONTENT_FILTER
6124                 case SO_CFIL_SOCK_ID: {
6125                         cfil_sock_id_t sock_id;
6126
6127                         sock_id = cfil_sock_id_from_socket(so);
6128
6129                         error = sooptcopyout(sopt, &sock_id,
6130                             sizeof(cfil_sock_id_t));
6131                         break;
6132                 }
6133 #endif  /* CONTENT_FILTER */
6134
6135                 case SO_EXTENDED_BK_IDLE:
6136                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6137                         goto integer;
6138                 case SO_MARK_CELLFALLBACK:
6139                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6140                             ? 1 : 0;
6141                         goto integer;
6142                 case SO_NET_SERVICE_TYPE: {
6143                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6144                                 optval = so->so_netsvctype;
6145                         } else {
6146                                 optval = NET_SERVICE_TYPE_BE;
6147                         }
6148                         goto integer;
6149                 }
6150                 case SO_NETSVC_MARKING_LEVEL:
6151                         optval = so_get_netsvc_marking_level(so);
6152                         goto integer;
6153
6154                 case SO_MPKL_SEND_INFO: {
6155                         struct so_mpkl_send_info so_mpkl_send_info;
6156
6157                         uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6158                         so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6159                         error = sooptcopyout(sopt, &so_mpkl_send_info,
6160                             sizeof(struct so_mpkl_send_info));
6161                         break;
6162                 }
6163                 default:
6164                         error = ENOPROTOOPT;
6165                         break;
6166                 }
6167         }
6168 out:
6169         if (dolock) {
6170                 socket_unlock(so, 1);
6171         }
6172         return error;
6173 }
6174
6175 /*
6176  * The size limits on our soopt_getm is different from that on FreeBSD.
6177  * We limit the size of options to MCLBYTES. This will have to change
6178  * if we need to define options that need more space than MCLBYTES.
6179  */
6180 int
6181 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6182 {
6183         struct mbuf *m, *m_prev;
6184         int sopt_size = sopt->sopt_valsize;
6185         int how;
6186
6187         if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6188                 return EMSGSIZE;
6189         }
6190
6191         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6192         MGET(m, how, MT_DATA);
6193         if (m == NULL) {
6194                 return ENOBUFS;
6195         }
6196         if (sopt_size > MLEN) {
6197                 MCLGET(m, how);
6198                 if ((m->m_flags & M_EXT) == 0) {
6199                         m_free(m);
6200                         return ENOBUFS;
6201                 }
6202                 m->m_len = min(MCLBYTES, sopt_size);
6203         } else {
6204                 m->m_len = min(MLEN, sopt_size);
6205         }
6206         sopt_size -= m->m_len;
6207         *mp = m;
6208         m_prev = m;
6209
6210         while (sopt_size > 0) {
6211                 MGET(m, how, MT_DATA);
6212                 if (m == NULL) {
6213                         m_freem(*mp);
6214                         return ENOBUFS;
6215                 }
6216                 if (sopt_size > MLEN) {
6217                         MCLGET(m, how);
6218                         if ((m->m_flags & M_EXT) == 0) {
6219                                 m_freem(*mp);
6220                                 m_freem(m);
6221                                 return ENOBUFS;
6222                         }
6223                         m->m_len = min(MCLBYTES, sopt_size);
6224                 } else {
6225                         m->m_len = min(MLEN, sopt_size);
6226                 }
6227                 sopt_size -= m->m_len;
6228                 m_prev->m_next = m;
6229                 m_prev = m;
6230         }
6231         return 0;
6232 }
6233
6234 /* copyin sopt data into mbuf chain */
6235 int
6236 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6237 {
6238         struct mbuf *m0 = m;
6239
6240         if (sopt->sopt_val == USER_ADDR_NULL) {
6241                 return 0;
6242         }
6243         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6244                 if (sopt->sopt_p != kernproc) {
6245                         int error;
6246
6247                         error = copyin(sopt->sopt_val, mtod(m, char *),
6248                             m->m_len);
6249                         if (error != 0) {
6250                                 m_freem(m0);
6251                                 return error;
6252                         }
6253                 } else {
6254                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6255                             mtod(m, char *), m->m_len);
6256                 }
6257                 sopt->sopt_valsize -= m->m_len;
6258                 sopt->sopt_val += m->m_len;
6259                 m = m->m_next;
6260         }
6261         /* should be allocated enoughly at ip6_sooptmcopyin() */
6262         if (m != NULL) {
6263                 panic("soopt_mcopyin");
6264                 /* NOTREACHED */
6265         }
6266         return 0;
6267 }
6268
6269 /* copyout mbuf chain data into soopt */
6270 int
6271 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6272 {
6273         struct mbuf *m0 = m;
6274         size_t valsize = 0;
6275
6276         if (sopt->sopt_val == USER_ADDR_NULL) {
6277                 return 0;
6278         }
6279         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6280                 if (sopt->sopt_p != kernproc) {
6281                         int error;
6282
6283                         error = copyout(mtod(m, char *), sopt->sopt_val,
6284                             m->m_len);
6285                         if (error != 0) {
6286                                 m_freem(m0);
6287                                 return error;
6288                         }
6289                 } else {
6290                         bcopy(mtod(m, char *),
6291                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6292                 }
6293                 sopt->sopt_valsize -= m->m_len;
6294                 sopt->sopt_val += m->m_len;
6295                 valsize += m->m_len;
6296                 m = m->m_next;
6297         }
6298         if (m != NULL) {
6299                 /* enough soopt buffer should be given from user-land */
6300                 m_freem(m0);
6301                 return EINVAL;
6302         }
6303         sopt->sopt_valsize = valsize;
6304         return 0;
6305 }
6306
6307 void
6308 sohasoutofband(struct socket *so)
6309 {
6310         if (so->so_pgid < 0) {
6311                 gsignal(-so->so_pgid, SIGURG);
6312         } else if (so->so_pgid > 0) {
6313                 proc_signal(so->so_pgid, SIGURG);
6314         }
6315         selwakeup(&so->so_rcv.sb_sel);
6316         if (so->so_rcv.sb_flags & SB_KNOTE) {
6317                 KNOTE(&so->so_rcv.sb_sel.si_note,
6318                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
6319         }
6320 }
6321
6322 int
6323 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6324 {
6325 #pragma unused(cred)
6326         struct proc *p = current_proc();
6327         int revents = 0;
6328
6329         socket_lock(so, 1);
6330         so_update_last_owner_locked(so, PROC_NULL);
6331         so_update_policy(so);
6332
6333         if (events & (POLLIN | POLLRDNORM)) {
6334                 if (soreadable(so)) {
6335                         revents |= events & (POLLIN | POLLRDNORM);
6336                 }
6337         }
6338
6339         if (events & (POLLOUT | POLLWRNORM)) {
6340                 if (sowriteable(so)) {
6341                         revents |= events & (POLLOUT | POLLWRNORM);
6342                 }
6343         }
6344
6345         if (events & (POLLPRI | POLLRDBAND)) {
6346                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6347                         revents |= events & (POLLPRI | POLLRDBAND);
6348                 }
6349         }
6350
6351         if (revents == 0) {
6352                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6353                         /*
6354                          * Darwin sets the flag first,
6355                          * BSD calls selrecord first
6356                          */
6357                         so->so_rcv.sb_flags |= SB_SEL;
6358                         selrecord(p, &so->so_rcv.sb_sel, wql);
6359                 }
6360
6361                 if (events & (POLLOUT | POLLWRNORM)) {
6362                         /*
6363                          * Darwin sets the flag first,
6364                          * BSD calls selrecord first
6365                          */
6366                         so->so_snd.sb_flags |= SB_SEL;
6367                         selrecord(p, &so->so_snd.sb_sel, wql);
6368                 }
6369         }
6370
6371         socket_unlock(so, 1);
6372         return revents;
6373 }
6374
6375 int
6376 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6377 {
6378         struct socket *so = (struct socket *)fp->fp_glob->fg_data;
6379         int result;
6380
6381         socket_lock(so, 1);
6382         so_update_last_owner_locked(so, PROC_NULL);
6383         so_update_policy(so);
6384
6385         switch (kn->kn_filter) {
6386         case EVFILT_READ:
6387                 kn->kn_filtid = EVFILTID_SOREAD;
6388                 break;
6389         case EVFILT_WRITE:
6390                 kn->kn_filtid = EVFILTID_SOWRITE;
6391                 break;
6392         case EVFILT_SOCK:
6393                 kn->kn_filtid = EVFILTID_SCK;
6394                 break;
6395         case EVFILT_EXCEPT:
6396                 kn->kn_filtid = EVFILTID_SOEXCEPT;
6397                 break;
6398         default:
6399                 socket_unlock(so, 1);
6400                 knote_set_error(kn, EINVAL);
6401                 return 0;
6402         }
6403
6404         /*
6405          * call the appropriate sub-filter attach
6406          * with the socket still locked
6407          */
6408         result = knote_fops(kn)->f_attach(kn, kev);
6409
6410         socket_unlock(so, 1);
6411
6412         return result;
6413 }
6414
6415 static int
6416 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6417 {
6418         int retval = 0;
6419         int64_t data = 0;
6420
6421         if (so->so_options & SO_ACCEPTCONN) {
6422                 /*
6423                  * Radar 6615193 handle the listen case dynamically
6424                  * for kqueue read filter. This allows to call listen()
6425                  * after registering the kqueue EVFILT_READ.
6426                  */
6427
6428                 retval = !TAILQ_EMPTY(&so->so_comp);
6429                 data = so->so_qlen;
6430                 goto out;
6431         }
6432
6433         /* socket isn't a listener */
6434         /*
6435          * NOTE_LOWAT specifies new low water mark in data, i.e.
6436          * the bytes of protocol data. We therefore exclude any
6437          * control bytes.
6438          */
6439         data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6440
6441         if (kn->kn_sfflags & NOTE_OOB) {
6442                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6443                         kn->kn_fflags |= NOTE_OOB;
6444                         data -= so->so_oobmark;
6445                         retval = 1;
6446                         goto out;
6447                 }
6448         }
6449
6450         if ((so->so_state & SS_CANTRCVMORE)
6451 #if CONTENT_FILTER
6452             && cfil_sock_data_pending(&so->so_rcv) == 0
6453 #endif /* CONTENT_FILTER */
6454             ) {
6455                 kn->kn_flags |= EV_EOF;
6456                 kn->kn_fflags = so->so_error;
6457                 retval = 1;
6458                 goto out;
6459         }
6460
6461         if (so->so_error) {     /* temporary udp error */
6462                 retval = 1;
6463                 goto out;
6464         }
6465
6466         int64_t lowwat = so->so_rcv.sb_lowat;
6467         /*
6468          * Ensure that when NOTE_LOWAT is used, the derived
6469          * low water mark is bounded by socket's rcv buf's
6470          * high and low water mark values.
6471          */
6472         if (kn->kn_sfflags & NOTE_LOWAT) {
6473                 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6474                         lowwat = so->so_rcv.sb_hiwat;
6475                 } else if (kn->kn_sdata > lowwat) {
6476                         lowwat = kn->kn_sdata;
6477                 }
6478         }
6479
6480         /*
6481          * While the `data` field is the amount of data to read,
6482          * 0-sized packets need to wake up the kqueue, see 58140856,
6483          * so we need to take control bytes into account too.
6484          */
6485         retval = (so->so_rcv.sb_cc >= lowwat);
6486
6487 out:
6488         if (retval && kev) {
6489                 knote_fill_kevent(kn, kev, data);
6490         }
6491         return retval;
6492 }
6493
6494 static int
6495 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6496 {
6497         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6498
6499         /* socket locked */
6500
6501         /*
6502          * If the caller explicitly asked for OOB results (e.g. poll())
6503          * from EVFILT_READ, then save that off in the hookid field
6504          * and reserve the kn_flags EV_OOBAND bit for output only.
6505          */
6506         if (kn->kn_filter == EVFILT_READ &&
6507             kn->kn_flags & EV_OOBAND) {
6508                 kn->kn_flags &= ~EV_OOBAND;
6509                 kn->kn_hook32 = EV_OOBAND;
6510         } else {
6511                 kn->kn_hook32 = 0;
6512         }
6513         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6514                 so->so_rcv.sb_flags |= SB_KNOTE;
6515         }
6516
6517         /* indicate if event is already fired */
6518         return filt_soread_common(kn, NULL, so);
6519 }
6520
6521 static void
6522 filt_sordetach(struct knote *kn)
6523 {
6524         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6525
6526         socket_lock(so, 1);
6527         if (so->so_rcv.sb_flags & SB_KNOTE) {
6528                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6529                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6530                 }
6531         }
6532         socket_unlock(so, 1);
6533 }
6534
6535 /*ARGSUSED*/
6536 static int
6537 filt_soread(struct knote *kn, long hint)
6538 {
6539         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6540         int retval;
6541
6542         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6543                 socket_lock(so, 1);
6544         }
6545
6546         retval = filt_soread_common(kn, NULL, so);
6547
6548         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6549                 socket_unlock(so, 1);
6550         }
6551
6552         return retval;
6553 }
6554
6555 static int
6556 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6557 {
6558         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6559         int retval;
6560
6561         socket_lock(so, 1);
6562
6563         /* save off the new input fflags and data */
6564         kn->kn_sfflags = kev->fflags;
6565         kn->kn_sdata = kev->data;
6566
6567         /* determine if changes result in fired events */
6568         retval = filt_soread_common(kn, NULL, so);
6569
6570         socket_unlock(so, 1);
6571
6572         return retval;
6573 }
6574
6575 static int
6576 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6577 {
6578         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6579         int retval;
6580
6581         socket_lock(so, 1);
6582         retval = filt_soread_common(kn, kev, so);
6583         socket_unlock(so, 1);
6584
6585         return retval;
6586 }
6587
6588 int
6589 so_wait_for_if_feedback(struct socket *so)
6590 {
6591         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6592             (so->so_state & SS_ISCONNECTED)) {
6593                 struct inpcb *inp = sotoinpcb(so);
6594                 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6595                         return 1;
6596                 }
6597         }
6598         return 0;
6599 }
6600
6601 static int
6602 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6603 {
6604         int ret = 0;
6605         int64_t data = sbspace(&so->so_snd);
6606
6607         if (so->so_state & SS_CANTSENDMORE) {
6608                 kn->kn_flags |= EV_EOF;
6609                 kn->kn_fflags = so->so_error;
6610                 ret = 1;
6611                 goto out;
6612         }
6613
6614         if (so->so_error) {     /* temporary udp error */
6615                 ret = 1;
6616                 goto out;
6617         }
6618
6619         if (!socanwrite(so)) {
6620                 ret = 0;
6621                 goto out;
6622         }
6623
6624         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6625                 ret = 1;
6626                 goto out;
6627         }
6628
6629         int64_t lowwat = so->so_snd.sb_lowat;
6630
6631         if (kn->kn_sfflags & NOTE_LOWAT) {
6632                 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6633                         lowwat = so->so_snd.sb_hiwat;
6634                 } else if (kn->kn_sdata > lowwat) {
6635                         lowwat = kn->kn_sdata;
6636                 }
6637         }
6638
6639         if (data >= lowwat) {
6640                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6641 #if (DEBUG || DEVELOPMENT)
6642                     && so_notsent_lowat_check == 1
6643 #endif /* DEBUG || DEVELOPMENT */
6644                     ) {
6645                         if ((SOCK_DOM(so) == PF_INET ||
6646                             SOCK_DOM(so) == PF_INET6) &&
6647                             so->so_type == SOCK_STREAM) {
6648                                 ret = tcp_notsent_lowat_check(so);
6649                         }
6650 #if MPTCP
6651                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6652                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6653                                 ret = mptcp_notsent_lowat_check(so);
6654                         }
6655 #endif
6656                         else {
6657                                 ret = 1;
6658                                 goto out;
6659                         }
6660                 } else {
6661                         ret = 1;
6662                 }
6663         }
6664         if (so_wait_for_if_feedback(so)) {
6665                 ret = 0;
6666         }
6667
6668 out:
6669         if (ret && kev) {
6670                 knote_fill_kevent(kn, kev, data);
6671         }
6672         return ret;
6673 }
6674
6675 static int
6676 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6677 {
6678         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6679
6680         /* socket locked */
6681         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6682                 so->so_snd.sb_flags |= SB_KNOTE;
6683         }
6684
6685         /* determine if its already fired */
6686         return filt_sowrite_common(kn, NULL, so);
6687 }
6688
6689 static void
6690 filt_sowdetach(struct knote *kn)
6691 {
6692         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6693         socket_lock(so, 1);
6694
6695         if (so->so_snd.sb_flags & SB_KNOTE) {
6696                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6697                         so->so_snd.sb_flags &= ~SB_KNOTE;
6698                 }
6699         }
6700         socket_unlock(so, 1);
6701 }
6702
6703 /*ARGSUSED*/
6704 static int
6705 filt_sowrite(struct knote *kn, long hint)
6706 {
6707         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6708         int ret;
6709
6710         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6711                 socket_lock(so, 1);
6712         }
6713
6714         ret = filt_sowrite_common(kn, NULL, so);
6715
6716         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6717                 socket_unlock(so, 1);
6718         }
6719
6720         return ret;
6721 }
6722
6723 static int
6724 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6725 {
6726         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6727         int ret;
6728
6729         socket_lock(so, 1);
6730
6731         /*save off the new input fflags and data */
6732         kn->kn_sfflags = kev->fflags;
6733         kn->kn_sdata = kev->data;
6734
6735         /* determine if these changes result in a triggered event */
6736         ret = filt_sowrite_common(kn, NULL, so);
6737
6738         socket_unlock(so, 1);
6739
6740         return ret;
6741 }
6742
6743 static int
6744 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6745 {
6746         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6747         int ret;
6748
6749         socket_lock(so, 1);
6750         ret = filt_sowrite_common(kn, kev, so);
6751         socket_unlock(so, 1);
6752
6753         return ret;
6754 }
6755
6756 static int
6757 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6758     struct socket *so, long ev_hint)
6759 {
6760         int ret = 0;
6761         int64_t data = 0;
6762         uint32_t level_trigger = 0;
6763
6764         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6765                 kn->kn_fflags |= NOTE_CONNRESET;
6766         }
6767         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6768                 kn->kn_fflags |= NOTE_TIMEOUT;
6769         }
6770         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6771                 kn->kn_fflags |= NOTE_NOSRCADDR;
6772         }
6773         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6774                 kn->kn_fflags |= NOTE_IFDENIED;
6775         }
6776         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6777                 kn->kn_fflags |= NOTE_KEEPALIVE;
6778         }
6779         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6780                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6781         }
6782         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6783                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6784         }
6785         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6786             (so->so_state & SS_ISCONNECTED)) {
6787                 kn->kn_fflags |= NOTE_CONNECTED;
6788                 level_trigger |= NOTE_CONNECTED;
6789         }
6790         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6791             (so->so_state & SS_ISDISCONNECTED)) {
6792                 kn->kn_fflags |= NOTE_DISCONNECTED;
6793                 level_trigger |= NOTE_DISCONNECTED;
6794         }
6795         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6796                 if (so->so_proto != NULL &&
6797                     (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6798                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6799                 }
6800         }
6801
6802         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6803             tcp_notify_ack_active(so)) {
6804                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6805         }
6806
6807         if ((so->so_state & SS_CANTRCVMORE)
6808 #if CONTENT_FILTER
6809             && cfil_sock_data_pending(&so->so_rcv) == 0
6810 #endif /* CONTENT_FILTER */
6811             ) {
6812                 kn->kn_fflags |= NOTE_READCLOSED;
6813                 level_trigger |= NOTE_READCLOSED;
6814         }
6815
6816         if (so->so_state & SS_CANTSENDMORE) {
6817                 kn->kn_fflags |= NOTE_WRITECLOSED;
6818                 level_trigger |= NOTE_WRITECLOSED;
6819         }
6820
6821         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6822             (so->so_flags & SOF_SUSPENDED)) {
6823                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6824
6825                 /* If resume event was delivered before, reset it */
6826                 kn->kn_hook32 &= ~NOTE_RESUME;
6827
6828                 kn->kn_fflags |= NOTE_SUSPEND;
6829                 level_trigger |= NOTE_SUSPEND;
6830         }
6831
6832         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6833             (so->so_flags & SOF_SUSPENDED) == 0) {
6834                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6835
6836                 /* If suspend event was delivered before, reset it */
6837                 kn->kn_hook32 &= ~NOTE_SUSPEND;
6838
6839                 kn->kn_fflags |= NOTE_RESUME;
6840                 level_trigger |= NOTE_RESUME;
6841         }
6842
6843         if (so->so_error != 0) {
6844                 ret = 1;
6845                 data = so->so_error;
6846                 kn->kn_flags |= EV_EOF;
6847         } else {
6848                 u_int32_t data32 = 0;
6849                 get_sockev_state(so, &data32);
6850                 data = data32;
6851         }
6852
6853         /* Reset any events that are not requested on this knote */
6854         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6855         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6856
6857         /* Find the level triggerred events that are already delivered */
6858         level_trigger &= kn->kn_hook32;
6859         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6860
6861         /* Do not deliver level triggerred events more than once */
6862         if ((kn->kn_fflags & ~level_trigger) != 0) {
6863                 ret = 1;
6864         }
6865
6866         if (ret && kev) {
6867                 /*
6868                  * Store the state of the events being delivered. This
6869                  * state can be used to deliver level triggered events
6870                  * ateast once and still avoid waking up the application
6871                  * multiple times as long as the event is active.
6872                  */
6873                 if (kn->kn_fflags != 0) {
6874                         kn->kn_hook32 |= (kn->kn_fflags &
6875                             EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6876                 }
6877
6878                 /*
6879                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6880                  * only one of them and remember the last one that was
6881                  * delivered last
6882                  */
6883                 if (kn->kn_fflags & NOTE_SUSPEND) {
6884                         kn->kn_hook32 &= ~NOTE_RESUME;
6885                 }
6886                 if (kn->kn_fflags & NOTE_RESUME) {
6887                         kn->kn_hook32 &= ~NOTE_SUSPEND;
6888                 }
6889
6890                 knote_fill_kevent(kn, kev, data);
6891         }
6892         return ret;
6893 }
6894
6895 static int
6896 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6897 {
6898         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6899
6900         /* socket locked */
6901         kn->kn_hook32 = 0;
6902         if (KNOTE_ATTACH(&so->so_klist, kn)) {
6903                 so->so_flags |= SOF_KNOTE;
6904         }
6905
6906         /* determine if event already fired */
6907         return filt_sockev_common(kn, NULL, so, 0);
6908 }
6909
6910 static void
6911 filt_sockdetach(struct knote *kn)
6912 {
6913         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6914         socket_lock(so, 1);
6915
6916         if ((so->so_flags & SOF_KNOTE) != 0) {
6917                 if (KNOTE_DETACH(&so->so_klist, kn)) {
6918                         so->so_flags &= ~SOF_KNOTE;
6919                 }
6920         }
6921         socket_unlock(so, 1);
6922 }
6923
6924 static int
6925 filt_sockev(struct knote *kn, long hint)
6926 {
6927         int ret = 0, locked = 0;
6928         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6929         long ev_hint = (hint & SO_FILT_HINT_EV);
6930
6931         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6932                 socket_lock(so, 1);
6933                 locked = 1;
6934         }
6935
6936         ret = filt_sockev_common(kn, NULL, so, ev_hint);
6937
6938         if (locked) {
6939                 socket_unlock(so, 1);
6940         }
6941
6942         return ret;
6943 }
6944
6945
6946
6947 /*
6948  *      filt_socktouch - update event state
6949  */
6950 static int
6951 filt_socktouch(
6952         struct knote *kn,
6953         struct kevent_qos_s *kev)
6954 {
6955         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6956         uint32_t changed_flags;
6957         int ret;
6958
6959         socket_lock(so, 1);
6960
6961         /* save off the [result] data and fflags */
6962         changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
6963
6964         /* save off the new input fflags and data */
6965         kn->kn_sfflags = kev->fflags;
6966         kn->kn_sdata = kev->data;
6967
6968         /* restrict the current results to the (smaller?) set of new interest */
6969         /*
6970          * For compatibility with previous implementations, we leave kn_fflags
6971          * as they were before.
6972          */
6973         //kn->kn_fflags &= kev->fflags;
6974
6975         /*
6976          * Since we keep track of events that are already
6977          * delivered, if any of those events are not requested
6978          * anymore the state related to them can be reset
6979          */
6980         kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6981
6982         /* determine if we have events to deliver */
6983         ret = filt_sockev_common(kn, NULL, so, 0);
6984
6985         socket_unlock(so, 1);
6986
6987         return ret;
6988 }
6989
6990 /*
6991  *      filt_sockprocess - query event fired state and return data
6992  */
6993 static int
6994 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
6995 {
6996         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6997         int ret = 0;
6998
6999         socket_lock(so, 1);
7000
7001         ret = filt_sockev_common(kn, kev, so, 0);
7002
7003         socket_unlock(so, 1);
7004
7005         return ret;
7006 }
7007
7008 void
7009 get_sockev_state(struct socket *so, u_int32_t *statep)
7010 {
7011         u_int32_t state = *(statep);
7012
7013         /*
7014          * If the state variable is already used by a previous event,
7015          * reset it.
7016          */
7017         if (state != 0) {
7018                 return;
7019         }
7020
7021         if (so->so_state & SS_ISCONNECTED) {
7022                 state |= SOCKEV_CONNECTED;
7023         } else {
7024                 state &= ~(SOCKEV_CONNECTED);
7025         }
7026         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7027         *(statep) = state;
7028 }
7029
7030 #define SO_LOCK_HISTORY_STR_LEN \
7031         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7032
7033 __private_extern__ const char *
7034 solockhistory_nr(struct socket *so)
7035 {
7036         size_t n = 0;
7037         int i;
7038         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7039
7040         bzero(lock_history_str, sizeof(lock_history_str));
7041         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7042                 n += scnprintf(lock_history_str + n,
7043                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7044                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7045                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7046         }
7047         return lock_history_str;
7048 }
7049
7050 lck_mtx_t *
7051 socket_getlock(struct socket *so, int flags)
7052 {
7053         if (so->so_proto->pr_getlock != NULL) {
7054                 return (*so->so_proto->pr_getlock)(so, flags);
7055         } else {
7056                 return so->so_proto->pr_domain->dom_mtx;
7057         }
7058 }
7059
7060 void
7061 socket_lock(struct socket *so, int refcount)
7062 {
7063         void *lr_saved;
7064
7065         lr_saved = __builtin_return_address(0);
7066
7067         if (so->so_proto->pr_lock) {
7068                 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7069         } else {
7070 #ifdef MORE_LOCKING_DEBUG
7071                 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7072                     LCK_MTX_ASSERT_NOTOWNED);
7073 #endif
7074                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7075                 if (refcount) {
7076                         so->so_usecount++;
7077                 }
7078                 so->lock_lr[so->next_lock_lr] = lr_saved;
7079                 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7080         }
7081 }
7082
7083 void
7084 socket_lock_assert_owned(struct socket *so)
7085 {
7086         lck_mtx_t *mutex_held;
7087
7088         if (so->so_proto->pr_getlock != NULL) {
7089                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7090         } else {
7091                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7092         }
7093
7094         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7095 }
7096
7097 int
7098 socket_try_lock(struct socket *so)
7099 {
7100         lck_mtx_t *mtx;
7101
7102         if (so->so_proto->pr_getlock != NULL) {
7103                 mtx = (*so->so_proto->pr_getlock)(so, 0);
7104         } else {
7105                 mtx = so->so_proto->pr_domain->dom_mtx;
7106         }
7107
7108         return lck_mtx_try_lock(mtx);
7109 }
7110
7111 void
7112 socket_unlock(struct socket *so, int refcount)
7113 {
7114         void *lr_saved;
7115         lck_mtx_t *mutex_held;
7116
7117         lr_saved = __builtin_return_address(0);
7118
7119         if (so == NULL || so->so_proto == NULL) {
7120                 panic("%s: null so_proto so=%p\n", __func__, so);
7121                 /* NOTREACHED */
7122         }
7123
7124         if (so->so_proto->pr_unlock) {
7125                 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7126         } else {
7127                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7128 #ifdef MORE_LOCKING_DEBUG
7129                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7130 #endif
7131                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7132                 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7133
7134                 if (refcount) {
7135                         if (so->so_usecount <= 0) {
7136                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7137                                     "lrh=%s", __func__, so->so_usecount, so,
7138                                     SOCK_DOM(so), so->so_type,
7139                                     SOCK_PROTO(so), solockhistory_nr(so));
7140                                 /* NOTREACHED */
7141                         }
7142
7143                         so->so_usecount--;
7144                         if (so->so_usecount == 0) {
7145                                 sofreelastref(so, 1);
7146                         }
7147                 }
7148                 lck_mtx_unlock(mutex_held);
7149         }
7150 }
7151
7152 /* Called with socket locked, will unlock socket */
7153 void
7154 sofree(struct socket *so)
7155 {
7156         lck_mtx_t *mutex_held;
7157
7158         if (so->so_proto->pr_getlock != NULL) {
7159                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7160         } else {
7161                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7162         }
7163         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7164
7165         sofreelastref(so, 0);
7166 }
7167
/*
 * soreference - take one use-count reference on a socket
 *
 * Implemented as a lock-with-reference followed by an unlock that keeps
 * the reference, so the socket must not already be locked by the caller.
 */
void
soreference(struct socket *so)
{
        socket_lock(so, 1);     /* locks & take one reference on socket */
        socket_unlock(so, 0);   /* unlock only */
}
7174
/*
 * sodereference - drop one use-count reference on a socket
 *
 * Mirror image of soreference(): lock without taking a reference, then
 * unlock while releasing one.  On the domain-mutex path of
 * socket_unlock() dropping the last reference calls sofreelastref().
 */
void
sodereference(struct socket *so)
{
        socket_lock(so, 0);     /* lock only, no new reference */
        socket_unlock(so, 1);   /* unlock & release one reference */
}
7181
7182 /*
7183  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7184  * possibility of using jumbo clusters.  Caller must ensure to hold
7185  * the socket lock.
7186  */
7187 void
7188 somultipages(struct socket *so, boolean_t set)
7189 {
7190         if (set) {
7191                 so->so_flags |= SOF_MULTIPAGES;
7192         } else {
7193                 so->so_flags &= ~SOF_MULTIPAGES;
7194         }
7195 }
7196
7197 void
7198 soif2kcl(struct socket *so, boolean_t set)
7199 {
7200         if (set) {
7201                 so->so_flags1 |= SOF1_IF_2KCL;
7202         } else {
7203                 so->so_flags1 &= ~SOF1_IF_2KCL;
7204         }
7205 }
7206
7207 int
7208 so_isdstlocal(struct socket *so)
7209 {
7210         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7211
7212         if (SOCK_DOM(so) == PF_INET) {
7213                 return inaddr_local(inp->inp_faddr);
7214         } else if (SOCK_DOM(so) == PF_INET6) {
7215                 return in6addr_local(&inp->in6p_faddr);
7216         }
7217
7218         return 0;
7219 }
7220
/*
 * sosetdefunct - mark a socket as defunct and stop its buffers from
 * accepting data
 *
 * p:        target process; used for logging and, on the extended
 *           background idle path, has P_LXBKIDLEINPROG set in its
 *           p_ladvflag.  PROC_NULL suppresses the logging.
 * so:       socket to mark
 * level:    only logged by this function
 * noforce:  when TRUE, a socket flagged SOF_NODEFUNCT is left alone and
 *           a socket that asked for extended background idle may be
 *           granted it instead of being marked defunct
 *
 * Returns 0 when the socket is (or already was) marked SOF_DEFUNCT, and
 * EOPNOTSUPP when the socket was left untouched because of noforce.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
        struct sockbuf *rcv, *snd;
        int err = 0, defunct;

        rcv = &so->so_rcv;
        snd = &so->so_snd;

        /* Already marked: both buffers must carry SB_DROP, nothing to do */
        defunct = (so->so_flags & SOF_DEFUNCT);
        if (defunct) {
                if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
                        panic("%s: SB_DROP not set", __func__);
                        /* NOTREACHED */
                }
                goto done;
        }

        if (so->so_flags & SOF_NODEFUNCT) {
                /* Socket opted out of defunct; honor it unless forced */
                if (noforce) {
                        err = EOPNOTSUPP;
                        if (p != PROC_NULL) {
                                SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                                    "name %s level %d) so 0x%llx [%d,%d] "
                                    "is not eligible for defunct "
                                    "(%d)\n", __func__, proc_selfpid(),
                                    proc_best_name(current_proc()), proc_pid(p),
                                    proc_best_name(p), level,
                                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                                    SOCK_DOM(so), SOCK_TYPE(so), err);
                        }
                        return err;
                }
                /* Forced: the opt-out is cleared and we carry on */
                so->so_flags &= ~SOF_NODEFUNCT;
                if (p != PROC_NULL) {
                        SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                            "name %s level %d) so 0x%llx [%d,%d] "
                            "defunct by force "
                            "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                            SOCK_DOM(so), SOCK_TYPE(so), err);
                }
        } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
                struct inpcb *inp = (struct inpcb *)so->so_pcb;
                struct ifnet *ifp = inp->inp_last_outifp;

                /*
                 * Extended background idle is only started when none of
                 * the disqualifying conditions below holds; each one
                 * bumps its own statistic and falls through to the
                 * regular defunct path.
                 */
                if (ifp && IFNET_IS_CELLULAR(ifp)) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
                } else if (so->so_flags & SOF_DELEGATED) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
                } else if (soextbkidlestat.so_xbkidle_time == 0) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
                } else if (noforce && p != PROC_NULL) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

                        /* Start the idle period and remember when it began */
                        so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
                        so->so_extended_bk_start = net_uptime();
                        OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

                        inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

                        /* The socket is not marked defunct on this path */
                        err = EOPNOTSUPP;
                        SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                            "name %s level %d) so 0x%llx [%d,%d] "
                            "extend bk idle "
                            "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                            SOCK_DOM(so), SOCK_TYPE(so), err);
                        return err;
                } else {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
                }
        }

        so->so_flags |= SOF_DEFUNCT;

        /* Prevent further data from being appended to the socket buffers */
        snd->sb_flags |= SB_DROP;
        rcv->sb_flags |= SB_DROP;

        /* Flush any existing data in the socket buffers */
        if (rcv->sb_cc != 0) {
                rcv->sb_flags &= ~SB_SEL;
                selthreadclear(&rcv->sb_sel);
                sbrelease(rcv);
        }
        if (snd->sb_cc != 0) {
                snd->sb_flags &= ~SB_SEL;
                selthreadclear(&snd->sb_sel);
                sbrelease(snd);
        }

done:
        /* "defunct" still holds the state seen on entry, for the log text */
        if (p != PROC_NULL) {
                SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
                    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
                    proc_selfpid(), proc_best_name(current_proc()),
                    proc_pid(p), proc_best_name(p), level,
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
                    SOCK_TYPE(so), defunct ? "is already" : "marked as",
                    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
                    " extbkidle" : "");
        }
        return err;
}
7330
/*
 * sodefunct - complete the defunct transition of a socket
 *
 * The socket must already be marked SOF_DEFUNCT (see sosetdefunct());
 * calling this on an unmarked socket panics.  A socket that already has
 * SS_DEFUNCT set is left untouched.
 *
 * p:      target process, used only for logging; PROC_NULL suppresses it
 * so:     socket to make defunct
 * level:  only logged by this function
 *
 * Always returns 0.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
        struct sockbuf *rcv, *snd;

        if (!(so->so_flags & SOF_DEFUNCT)) {
                panic("%s improperly called", __func__);
                /* NOTREACHED */
        }
        /* Nothing to do if the transition has already been completed */
        if (so->so_state & SS_DEFUNCT) {
                goto done;
        }

        rcv = &so->so_rcv;
        snd = &so->so_snd;

        /* Internet sockets get a log line with their addresses and ports */
        if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
                char s[MAX_IPv6_STR_LEN];
                char d[MAX_IPv6_STR_LEN];
                struct inpcb *inp = sotoinpcb(so);

                if (p != PROC_NULL) {
                        SODEFUNCTLOG(
                                "%s[%d, %s]: (target pid %d name %s level %d) "
                                "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
                                "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
                                " snd_fl 0x%x]\n", __func__,
                                proc_selfpid(), proc_best_name(current_proc()),
                                proc_pid(p), proc_best_name(p), level,
                                (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                                (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
                                inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
                                (void *)&inp->inp_laddr.s_addr :
                                (void *)&inp->in6p_laddr),
                                s, sizeof(s)), ntohs(inp->in6p_lport),
                                inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
                                (void *)&inp->inp_faddr.s_addr :
                                (void *)&inp->in6p_faddr,
                                d, sizeof(d)), ntohs(inp->in6p_fport),
                                (uint32_t)rcv->sb_sel.si_flags,
                                (uint32_t)snd->sb_sel.si_flags,
                                rcv->sb_flags, snd->sb_flags);
                }
        } else if (p != PROC_NULL) {
                SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
                    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
                    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
                    proc_selfpid(), proc_best_name(current_proc()),
                    proc_pid(p), proc_best_name(p), level,
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                    SOCK_DOM(so), SOCK_TYPE(so),
                    (uint32_t)rcv->sb_sel.si_flags,
                    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
                    snd->sb_flags);
        }

        /*
         * Unwedge threads blocked on sbwait() and sb_lock().
         */
        sbwakeup(rcv);
        sbwakeup(snd);

        /* Flag the transition as in progress, then drop held sockbuf locks */
        so->so_flags1 |= SOF1_DEFUNCTINPROG;
        if (rcv->sb_flags & SB_LOCK) {
                sbunlock(rcv, TRUE);    /* keep socket locked */
        }
        if (snd->sb_flags & SB_LOCK) {
                sbunlock(snd, TRUE);    /* keep socket locked */
        }
        /*
         * Flush the buffers and disconnect.  We explicitly call shutdown
         * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
         * states are set for the socket.  This would also flush out data
         * hanging off the receive list of this socket.
         */
        (void) soshutdownlock_final(so, SHUT_RD);
        (void) soshutdownlock_final(so, SHUT_WR);
        (void) sodisconnectlocked(so);

        /*
         * Explicitly handle connectionless-protocol disconnection
         * and release any remaining data in the socket buffers.
         */
        if (!(so->so_state & SS_ISDISCONNECTED)) {
                (void) soisdisconnected(so);
        }

        /* An existing error is kept; otherwise the socket reports EBADF */
        if (so->so_error == 0) {
                so->so_error = EBADF;
        }

        if (rcv->sb_cc != 0) {
                rcv->sb_flags &= ~SB_SEL;
                selthreadclear(&rcv->sb_sel);
                sbrelease(rcv);
        }
        if (snd->sb_cc != 0) {
                snd->sb_flags &= ~SB_SEL;
                selthreadclear(&snd->sb_sel);
                sbrelease(snd);
        }
        /* Transition complete: record it and count the call */
        so->so_state |= SS_DEFUNCT;
        OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
        return 0;
}
7438
7439 int
7440 soresume(struct proc *p, struct socket *so, int locked)
7441 {
7442         if (locked == 0) {
7443                 socket_lock(so, 1);
7444         }
7445
7446         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7447                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7448                     "[%d,%d] resumed from bk idle\n",
7449                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7450                     proc_pid(p), proc_best_name(p),
7451                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7452                     SOCK_DOM(so), SOCK_TYPE(so));
7453
7454                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7455                 so->so_extended_bk_start = 0;
7456                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7457
7458                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7459                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7460                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7461         }
7462         if (locked == 0) {
7463                 socket_unlock(so, 1);
7464         }
7465
7466         return 0;
7467 }
7468
7469 /*
7470  * Does not attempt to account for sockets that are delegated from
7471  * the current process
7472  */
7473 int
7474 so_set_extended_bk_idle(struct socket *so, int optval)
7475 {
7476         int error = 0;
7477
7478         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7479             SOCK_PROTO(so) != IPPROTO_TCP) {
7480                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7481                 error = EOPNOTSUPP;
7482         } else if (optval == 0) {
7483                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7484
7485                 soresume(current_proc(), so, 1);
7486         } else {
7487                 struct proc *p = current_proc();
7488                 struct fileproc *fp;
7489                 int count = 0;
7490
7491                 /*
7492                  * Unlock socket to avoid lock ordering issue with
7493                  * the proc fd table lock
7494                  */
7495                 socket_unlock(so, 0);
7496
7497                 proc_fdlock(p);
7498                 fdt_foreach(fp, p) {
7499                         struct socket *so2;
7500
7501                         if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7502                                 continue;
7503                         }
7504
7505                         so2 = (struct socket *)fp->fp_glob->fg_data;
7506                         if (so != so2 &&
7507                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7508                                 count++;
7509                         }
7510                         if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7511                                 break;
7512                         }
7513                 }
7514                 proc_fdunlock(p);
7515
7516                 socket_lock(so, 0);
7517
7518                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7519                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7520                         error = EBUSY;
7521                 } else if (so->so_flags & SOF_DELEGATED) {
7522                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7523                         error = EBUSY;
7524                 } else {
7525                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7526                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7527                 }
7528                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7529                     "%s marked for extended bk idle\n",
7530                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7531                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7532                     SOCK_DOM(so), SOCK_TYPE(so),
7533                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7534                     "is" : "not");
7535         }
7536
7537         return error;
7538 }
7539
7540 static void
7541 so_stop_extended_bk_idle(struct socket *so)
7542 {
7543         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7544         so->so_extended_bk_start = 0;
7545
7546         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7547         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7548         /*
7549          * Force defunct
7550          */
7551         sosetdefunct(current_proc(), so,
7552             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7553         if (so->so_flags & SOF_DEFUNCT) {
7554                 sodefunct(current_proc(), so,
7555                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7556         }
7557 }
7558
7559 void
7560 so_drain_extended_bk_idle(struct socket *so)
7561 {
7562         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7563                 /*
7564                  * Only penalize sockets that have outstanding data
7565                  */
7566                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7567                         so_stop_extended_bk_idle(so);
7568
7569                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7570                 }
7571         }
7572 }
7573
7574 /*
7575  * Return values tells if socket is still in extended background idle
7576  */
7577 int
7578 so_check_extended_bk_idle_time(struct socket *so)
7579 {
7580         int ret = 1;
7581
7582         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7583                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7584                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7585                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7586                     SOCK_DOM(so), SOCK_TYPE(so));
7587                 if (net_uptime() - so->so_extended_bk_start >
7588                     soextbkidlestat.so_xbkidle_time) {
7589                         so_stop_extended_bk_idle(so);
7590
7591                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7592
7593                         ret = 0;
7594                 } else {
7595                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7596
7597                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7598                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7599                 }
7600         }
7601
7602         return ret;
7603 }
7604
7605 void
7606 resume_proc_sockets(proc_t p)
7607 {
7608         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7609                 struct fileproc *fp;
7610                 struct socket *so;
7611
7612                 proc_fdlock(p);
7613                 fdt_foreach(fp, p) {
7614                         if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7615                                 continue;
7616                         }
7617
7618                         so = (struct socket *)fp->fp_glob->fg_data;
7619                         (void) soresume(p, so, 0);
7620                 }
7621                 proc_fdunlock(p);
7622
7623                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7624         }
7625 }
7626
7627 __private_extern__ int
7628 so_set_recv_anyif(struct socket *so, int optval)
7629 {
7630         int ret = 0;
7631
7632         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7633                 if (optval) {
7634                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7635                 } else {
7636                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7637                 }
7638         }
7639
7640
7641         return ret;
7642 }
7643
7644 __private_extern__ int
7645 so_get_recv_anyif(struct socket *so)
7646 {
7647         int ret = 0;
7648
7649         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7650                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7651         }
7652
7653         return ret;
7654 }
7655
7656 int
7657 so_set_restrictions(struct socket *so, uint32_t vals)
7658 {
7659         int nocell_old, nocell_new;
7660         int noexpensive_old, noexpensive_new;
7661         int noconstrained_old, noconstrained_new;
7662
7663         /*
7664          * Deny-type restrictions are trapdoors; once set they cannot be
7665          * unset for the lifetime of the socket.  This allows them to be
7666          * issued by a framework on behalf of the application without
7667          * having to worry that they can be undone.
7668          *
7669          * Note here that socket-level restrictions overrides any protocol
7670          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7671          * socket restriction issued on the socket has a higher precendence
7672          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7673          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7674          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7675          */
7676         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7677         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7678         noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7679         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7680             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7681             SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7682         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7683         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7684         noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7685
7686         /* we can only set, not clear restrictions */
7687         if ((nocell_new - nocell_old) == 0 &&
7688             (noexpensive_new - noexpensive_old) == 0 &&
7689             (noconstrained_new - noconstrained_old) == 0) {
7690                 return 0;
7691         }
7692         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7693                 if (nocell_new - nocell_old != 0) {
7694                         /*
7695                          * if deny cellular is now set, do what's needed
7696                          * for INPCB
7697                          */
7698                         inp_set_nocellular(sotoinpcb(so));
7699                 }
7700                 if (noexpensive_new - noexpensive_old != 0) {
7701                         inp_set_noexpensive(sotoinpcb(so));
7702                 }
7703                 if (noconstrained_new - noconstrained_old != 0) {
7704                         inp_set_noconstrained(sotoinpcb(so));
7705                 }
7706         }
7707
7708         if (SOCK_DOM(so) == PF_MULTIPATH) {
7709                 mptcp_set_restrictions(so);
7710         }
7711
7712         return 0;
7713 }
7714
7715 uint32_t
7716 so_get_restrictions(struct socket *so)
7717 {
7718         return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7719                SO_RESTRICT_DENY_OUT |
7720                SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7721 }
7722
7723 int
7724 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7725 {
7726         struct proc *ep = PROC_NULL;
7727         int error = 0;
7728
7729         /* pid 0 is reserved for kernel */
7730         if (epid == 0) {
7731                 error = EINVAL;
7732                 goto done;
7733         }
7734
7735         /*
7736          * If this is an in-kernel socket, prevent its delegate
7737          * association from changing unless the socket option is
7738          * coming from within the kernel itself.
7739          */
7740         if (so->last_pid == 0 && p != kernproc) {
7741                 error = EACCES;
7742                 goto done;
7743         }
7744
7745         /*
7746          * If this is issued by a process that's recorded as the
7747          * real owner of the socket, or if the pid is the same as
7748          * the process's own pid, then proceed.  Otherwise ensure
7749          * that the issuing process has the necessary privileges.
7750          */
7751         if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7752                 if ((error = priv_check_cred(kauth_cred_get(),
7753                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7754                         error = EACCES;
7755                         goto done;
7756                 }
7757         }
7758
7759         /* Find the process that corresponds to the effective pid */
7760         if ((ep = proc_find(epid)) == PROC_NULL) {
7761                 error = ESRCH;
7762                 goto done;
7763         }
7764
7765         /*
7766          * If a process tries to delegate the socket to itself, then
7767          * there's really nothing to do; treat it as a way for the
7768          * delegate association to be cleared.  Note that we check
7769          * the passed-in proc rather than calling proc_selfpid(),
7770          * as we need to check the process issuing the socket option
7771          * which could be kernproc.  Given that we don't allow 0 for
7772          * effective pid, it means that a delegated in-kernel socket
7773          * stays delegated during its lifetime (which is probably OK.)
7774          */
7775         if (epid == proc_pid(p)) {
7776                 so->so_flags &= ~SOF_DELEGATED;
7777                 so->e_upid = 0;
7778                 so->e_pid = 0;
7779                 uuid_clear(so->e_uuid);
7780         } else {
7781                 so->so_flags |= SOF_DELEGATED;
7782                 so->e_upid = proc_uniqueid(ep);
7783                 so->e_pid = proc_pid(ep);
7784                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7785
7786 #if defined(XNU_TARGET_OS_OSX)
7787                 if (ep->p_responsible_pid != so->e_pid) {
7788                         proc_t rp = proc_find(ep->p_responsible_pid);
7789                         if (rp != PROC_NULL) {
7790                                 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7791                                 so->so_rpid = ep->p_responsible_pid;
7792                                 proc_rele(rp);
7793                         } else {
7794                                 uuid_clear(so->so_ruuid);
7795                                 so->so_rpid = -1;
7796                         }
7797                 }
7798 #endif
7799         }
7800         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7801                 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7802         }
7803 done:
7804         if (error == 0 && net_io_policy_log) {
7805                 uuid_string_t buf;
7806
7807                 uuid_unparse(so->e_uuid, buf);
7808                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7809                     "euuid %s%s\n", __func__, proc_name_address(p),
7810                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7811                     SOCK_DOM(so), SOCK_TYPE(so),
7812                     so->e_pid, proc_name_address(ep), buf,
7813                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7814         } else if (error != 0 && net_io_policy_log) {
7815                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7816                     "ERROR (%d)\n", __func__, proc_name_address(p),
7817                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7818                     SOCK_DOM(so), SOCK_TYPE(so),
7819                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7820                     proc_name_address(ep), error);
7821         }
7822
7823         /* Update this socket's policy upon success */
7824         if (error == 0) {
7825                 so->so_policy_gencnt *= -1;
7826                 so_update_policy(so);
7827 #if NECP
7828                 so_update_necp_policy(so, NULL, NULL);
7829 #endif /* NECP */
7830         }
7831
7832         if (ep != PROC_NULL) {
7833                 proc_rele(ep);
7834         }
7835
7836         return error;
7837 }
7838
7839 int
7840 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7841 {
7842         uuid_string_t buf;
7843         uuid_t uuid;
7844         int error = 0;
7845
7846         /* UUID must not be all-zeroes (reserved for kernel) */
7847         if (uuid_is_null(euuid)) {
7848                 error = EINVAL;
7849                 goto done;
7850         }
7851
7852         /*
7853          * If this is an in-kernel socket, prevent its delegate
7854          * association from changing unless the socket option is
7855          * coming from within the kernel itself.
7856          */
7857         if (so->last_pid == 0 && p != kernproc) {
7858                 error = EACCES;
7859                 goto done;
7860         }
7861
7862         /* Get the UUID of the issuing process */
7863         proc_getexecutableuuid(p, uuid, sizeof(uuid));
7864
7865         /*
7866          * If this is issued by a process that's recorded as the
7867          * real owner of the socket, or if the uuid is the same as
7868          * the process's own uuid, then proceed.  Otherwise ensure
7869          * that the issuing process has the necessary privileges.
7870          */
7871         if (check_cred &&
7872             (uuid_compare(euuid, so->last_uuid) != 0 ||
7873             uuid_compare(euuid, uuid) != 0)) {
7874                 if ((error = priv_check_cred(kauth_cred_get(),
7875                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7876                         error = EACCES;
7877                         goto done;
7878                 }
7879         }
7880
7881         /*
7882          * If a process tries to delegate the socket to itself, then
7883          * there's really nothing to do; treat it as a way for the
7884          * delegate association to be cleared.  Note that we check
7885          * the uuid of the passed-in proc rather than that of the
7886          * current process, as we need to check the process issuing
7887          * the socket option which could be kernproc itself.  Given
7888          * that we don't allow 0 for effective uuid, it means that
7889          * a delegated in-kernel socket stays delegated during its
7890          * lifetime (which is okay.)
7891          */
7892         if (uuid_compare(euuid, uuid) == 0) {
7893                 so->so_flags &= ~SOF_DELEGATED;
7894                 so->e_upid = 0;
7895                 so->e_pid = 0;
7896                 uuid_clear(so->e_uuid);
7897         } else {
7898                 so->so_flags |= SOF_DELEGATED;
7899                 /*
7900                  * Unlike so_set_effective_pid(), we only have the UUID
7901                  * here and the process ID is not known.  Inherit the
7902                  * real {pid,upid} of the socket.
7903                  */
7904                 so->e_upid = so->last_upid;
7905                 so->e_pid = so->last_pid;
7906                 uuid_copy(so->e_uuid, euuid);
7907         }
7908         /*
7909          * The following will clear the effective process name as it's the same
7910          * as the real process
7911          */
7912         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7913                 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7914         }
7915 done:
7916         if (error == 0 && net_io_policy_log) {
7917                 uuid_unparse(so->e_uuid, buf);
7918                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7919                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7920                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7921                     SOCK_TYPE(so), so->e_pid, buf,
7922                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7923         } else if (error != 0 && net_io_policy_log) {
7924                 uuid_unparse(euuid, buf);
7925                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7926                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7927                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7928                     SOCK_TYPE(so), buf, error);
7929         }
7930
7931         /* Update this socket's policy upon success */
7932         if (error == 0) {
7933                 so->so_policy_gencnt *= -1;
7934                 so_update_policy(so);
7935 #if NECP
7936                 so_update_necp_policy(so, NULL, NULL);
7937 #endif /* NECP */
7938         }
7939
7940         return error;
7941 }
7942
7943 void
7944 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7945     uint32_t ev_datalen)
7946 {
7947         struct kev_msg ev_msg;
7948
7949         /*
7950          * A netpolicy event always starts with a netpolicy_event_data
7951          * structure, but the caller can provide for a longer event
7952          * structure to post, depending on the event code.
7953          */
7954         VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
7955
7956         bzero(&ev_msg, sizeof(ev_msg));
7957         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7958         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7959         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7960         ev_msg.event_code       = ev_code;
7961
7962         ev_msg.dv[0].data_ptr   = ev_data;
7963         ev_msg.dv[0].data_length = ev_datalen;
7964
7965         kev_post_msg(&ev_msg);
7966 }
7967
7968 void
7969 socket_post_kev_msg(uint32_t ev_code,
7970     struct kev_socket_event_data *ev_data,
7971     uint32_t ev_datalen)
7972 {
7973         struct kev_msg ev_msg;
7974
7975         bzero(&ev_msg, sizeof(ev_msg));
7976         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7977         ev_msg.kev_class = KEV_NETWORK_CLASS;
7978         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7979         ev_msg.event_code = ev_code;
7980
7981         ev_msg.dv[0].data_ptr = ev_data;
7982         ev_msg.dv[0].data_length = ev_datalen;
7983
7984         kev_post_msg(&ev_msg);
7985 }
7986
7987 void
7988 socket_post_kev_msg_closed(struct socket *so)
7989 {
7990         struct kev_socket_closed ev = {};
7991         struct sockaddr *socksa = NULL, *peersa = NULL;
7992         int err;
7993
7994         if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
7995                 return;
7996         }
7997         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7998         if (err == 0) {
7999                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8000                     &peersa);
8001                 if (err == 0) {
8002                         memcpy(&ev.ev_data.kev_sockname, socksa,
8003                             min(socksa->sa_len,
8004                             sizeof(ev.ev_data.kev_sockname)));
8005                         memcpy(&ev.ev_data.kev_peername, peersa,
8006                             min(peersa->sa_len,
8007                             sizeof(ev.ev_data.kev_peername)));
8008                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
8009                             &ev.ev_data, sizeof(ev));
8010                 }
8011         }
8012         if (socksa != NULL) {
8013                 FREE(socksa, M_SONAME);
8014         }
8015         if (peersa != NULL) {
8016                 FREE(peersa, M_SONAME);
8017         }
8018 }