bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/net_api_stats.h>
 102 #include <net/ntstat.h>
 103 #include <net/content_filter.h>
 104 #include <netinet/in.h>
 105 #include <netinet/in_pcb.h>
 106 #include <netinet/in_tclass.h>
 107 #include <netinet/in_var.h>
 108 #include <netinet/tcp_var.h>
 109 #include <netinet/ip6.h>
 110 #include <netinet6/ip6_var.h>
 111 #include <netinet/flow_divert.h>
 112 #include <kern/zalloc.h>
 113 #include <kern/locks.h>
 114 #include <machine/limits.h>
 115 #include <libkern/OSAtomic.h>
 116 #include <pexpert/pexpert.h>
 117 #include <kern/assert.h>
 118 #include <kern/task.h>
 119 #include <kern/policy_internal.h>
 120
 121 #include <sys/kpi_mbuf.h>
 122 #include <sys/mcache.h>
 123 #include <sys/unpcb.h>
 124 #include <libkern/section_keywords.h>
 125
 126 #if CONFIG_MACF
 127 #include <security/mac_framework.h>
 128 #endif /* MAC */
 129
 130 #if MULTIPATH
 131 #include <netinet/mp_pcb.h>
 132 #include <netinet/mptcp_var.h>
 133 #endif /* MULTIPATH */
 134
 135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 136
 137 #if DEBUG || DEVELOPMENT
 138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 139 #else
 140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 141 #endif
 142
 143 /* TODO: this should be in a header file somewhere */
 144 extern char *proc_name_address(void *p);
 145
 146 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 147 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 148 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 149 static u_int32_t        cached_sock_count = 0;
 150 STAILQ_HEAD(, socket)   so_cache_head;
 151 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 152 static u_int32_t        so_cache_time;
 153 static int              socketinit_done;
 154 static struct zone      *so_cache_zone;
 155
 156 static lck_grp_t        *so_cache_mtx_grp;
 157 static lck_attr_t       *so_cache_mtx_attr;
 158 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 159 static lck_mtx_t        *so_cache_mtx;
 160
 161 #include <machine/limits.h>
 162
 163 static int      filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
 164 static void     filt_sordetach(struct knote *kn);
 165 static int      filt_soread(struct knote *kn, long hint);
 166 static int      filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
 167 static int      filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
 168
 169 static int      filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
 170 static void     filt_sowdetach(struct knote *kn);
 171 static int      filt_sowrite(struct knote *kn, long hint);
 172 static int      filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
 173 static int      filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
 174
 175 static int      filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
 176 static void     filt_sockdetach(struct knote *kn);
 177 static int      filt_sockev(struct knote *kn, long hint);
 178 static int      filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
 179 static int      filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
 180
 181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 183
 184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
 185         .f_isfd = 1,
 186         .f_attach = filt_sorattach,
 187         .f_detach = filt_sordetach,
 188         .f_event = filt_soread,
 189         .f_touch = filt_sortouch,
 190         .f_process = filt_sorprocess,
 191 };
 192
 193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
 194         .f_isfd = 1,
 195         .f_attach = filt_sowattach,
 196         .f_detach = filt_sowdetach,
 197         .f_event = filt_sowrite,
 198         .f_touch = filt_sowtouch,
 199         .f_process = filt_sowprocess,
 200 };
 201
 202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
 203         .f_isfd = 1,
 204         .f_attach = filt_sockattach,
 205         .f_detach = filt_sockdetach,
 206         .f_event = filt_sockev,
 207         .f_touch = filt_socktouch,
 208         .f_process = filt_sockprocess,
 209 };
 210
 211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
 212         .f_isfd = 1,
 213         .f_attach = filt_sorattach,
 214         .f_detach = filt_sordetach,
 215         .f_event = filt_soread,
 216         .f_touch = filt_sortouch,
 217         .f_process = filt_sorprocess,
 218 };
 219
 220 SYSCTL_DECL(_kern_ipc);
 221
 222 #define EVEN_MORE_LOCKING_DEBUG 0
 223
 224 int socket_debug = 0;
 225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 226     CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 227
 228 static unsigned long sodefunct_calls = 0;
 229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 230     &sodefunct_calls, "");
 231
 232 ZONE_DECLARE(socket_zone, "socket", sizeof(struct socket), ZC_ZFREE_CLEARMEM);
 233 so_gen_t        so_gencnt;      /* generation count for sockets */
 234
 235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 237
 238 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 239 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 240 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 241 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 242 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 243 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 244 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 245 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 246 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 247
 248 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 249
 250 int somaxconn = SOMAXCONN;
 251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 252     CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 253
 254 /* Should we get a maximum also ??? */
 255 static int sosendmaxchain = 65536;
 256 static int sosendminchain = 16384;
 257 static int sorecvmincopy  = 16384;
 258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 259     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 261     CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 262
 263 /*
 264  * Set to enable jumbo clusters (if available) for large writes when
 265  * the socket is marked with SOF_MULTIPAGES; see below.
 266  */
 267 int sosendjcl = 1;
 268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 269     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 270
 271 /*
 272  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 273  * writes on the socket for all protocols on any network interfaces,
 274  * depending upon sosendjcl above.  Be extra careful when setting this
 275  * to 1, because sending down packets that cross physical pages down to
 276  * broken drivers (those that falsely assume that the physical pages
 277  * are contiguous) might lead to system panics or silent data corruption.
 278  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 279  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 280  * capable.  Set this to 1 only for testing/debugging purposes.
 281  */
 282 int sosendjcl_ignore_capab = 0;
 283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 284     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 285
 286 /*
 287  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 288  * writes on the socket for all protocols on any network interfaces.
 289  * Be extra careful when setting this to 1, because sending down packets with
  290  * clusters larger than 2 KB might lead to system panics or data corruption.
 291  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 292  * on the outgoing interface
 293  * Set this to 1  for testing/debugging purposes only.
 294  */
 295 int sosendbigcl_ignore_capab = 0;
 296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 297     CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 298
 299 int sodefunctlog = 0;
 300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 301     &sodefunctlog, 0, "");
 302
 303 int sothrottlelog = 0;
 304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 305     &sothrottlelog, 0, "");
 306
 307 int sorestrictrecv = 1;
 308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 309     &sorestrictrecv, 0, "Enable inbound interface restrictions");
 310
 311 int sorestrictsend = 1;
 312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 313     &sorestrictsend, 0, "Enable outbound interface restrictions");
 314
 315 int soreserveheadroom = 1;
 316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 317     &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 318
 319 #if (DEBUG || DEVELOPMENT)
 320 int so_notsent_lowat_check = 1;
 321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
 322     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 323 #endif /* DEBUG || DEVELOPMENT */
 324
 325 int so_accept_list_waits = 0;
 326 #if (DEBUG || DEVELOPMENT)
 327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
 328     &so_accept_list_waits, 0, "number of waits for listener incomp list");
 329 #endif /* DEBUG || DEVELOPMENT */
 330
 331 extern struct inpcbinfo tcbinfo;
 332
 333 /* TODO: these should be in header file */
 334 extern int get_inpcb_str_size(void);
 335 extern int get_tcp_str_size(void);
 336
 337 vm_size_t       so_cache_zone_element_size;
 338
 339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 340     user_ssize_t *);
 341 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
 342 static void cached_sock_free(struct socket *);
 343
 344 /*
 345  * Maximum of extended background idle sockets per process
 346  * Set to zero to disable further setting of the option
 347  */
 348
 349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 350 #define SO_IDLE_BK_IDLE_TIME            600
 351 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 352
 353 struct soextbkidlestat soextbkidlestat;
 354
 355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 356     CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 357     "Maximum of extended background idle sockets per process");
 358
 359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 360     &soextbkidlestat.so_xbkidle_time, 0,
 361     "Time in seconds to keep extended background idle sockets");
 362
 363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 364     &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 365     "High water mark for extended background idle sockets");
 366
 367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 368     &soextbkidlestat, soextbkidlestat, "");
 369
 370 int so_set_extended_bk_idle(struct socket *, int);
 371
 372
 373 /*
 374  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 375  * setting the DSCP code on the packet based on the service class; see
 376  * <rdar://problem/11277343> for details.
 377  */
 378 __private_extern__ u_int32_t sotcdb = 0;
 379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 380     &sotcdb, 0, "");
 381
 382 void
 383 socketinit(void)
 384 {
 385         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 386         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 387
 388 #ifdef __LP64__
 389         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 393         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 394         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 395 #else
 396         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 400         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 401         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 402 #endif
 403
 404         if (socketinit_done) {
 405                 printf("socketinit: already called...\n");
 406                 return;
 407         }
 408         socketinit_done = 1;
 409
 410         PE_parse_boot_argn("socket_debug", &socket_debug,
 411             sizeof(socket_debug));
 412
 413         /*
 414          * allocate lock group attribute and group for socket cache mutex
 415          */
 416         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 417         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 418             so_cache_mtx_grp_attr);
 419
 420         /*
 421          * allocate the lock attribute for socket cache mutex
 422          */
 423         so_cache_mtx_attr = lck_attr_alloc_init();
 424
 425         /* cached sockets mutex */
 426         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 427         if (so_cache_mtx == NULL) {
 428                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 429                 /* NOTREACHED */
 430         }
 431         STAILQ_INIT(&so_cache_head);
 432
 433         so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
 434             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 435
 436         so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
 437             ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);
 438
 439         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 440         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 441         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 442         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 443
 444         in_pcbinit();
 445         sflt_init();
 446         socket_tclass_init();
 447 #if MULTIPATH
 448         mp_pcbinit();
 449 #endif /* MULTIPATH */
 450 }
 451
 452 static void
 453 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
 454 {
 455         caddr_t temp;
 456         uintptr_t offset;
 457
 458         lck_mtx_lock(so_cache_mtx);
 459
 460         if (!STAILQ_EMPTY(&so_cache_head)) {
 461                 VERIFY(cached_sock_count > 0);
 462
 463                 *so = STAILQ_FIRST(&so_cache_head);
 464                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 465                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 466
 467                 cached_sock_count--;
 468                 lck_mtx_unlock(so_cache_mtx);
 469
 470                 temp = (*so)->so_saved_pcb;
 471                 bzero((caddr_t)*so, sizeof(struct socket));
 472
 473                 (*so)->so_saved_pcb = temp;
 474         } else {
 475                 lck_mtx_unlock(so_cache_mtx);
 476
 477                 *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
 478
 479                 /*
 480                  * Define offsets for extra structures into our
 481                  * single block of memory. Align extra structures
 482                  * on longword boundaries.
 483                  */
 484
 485                 offset = (uintptr_t)*so;
 486                 offset += sizeof(struct socket);
 487
 488                 offset = ALIGN(offset);
 489
 490                 (*so)->so_saved_pcb = (caddr_t)offset;
 491                 offset += get_inpcb_str_size();
 492
 493                 offset = ALIGN(offset);
 494
 495                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 496                     (caddr_t)offset;
 497         }
 498
 499         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 500 }
 501
 502 static void
 503 cached_sock_free(struct socket *so)
 504 {
 505         lck_mtx_lock(so_cache_mtx);
 506
 507         so_cache_time = net_uptime();
 508         if (++cached_sock_count > max_cached_sock_count) {
 509                 --cached_sock_count;
 510                 lck_mtx_unlock(so_cache_mtx);
 511                 zfree(so_cache_zone, so);
 512         } else {
 513                 if (so_cache_hw < cached_sock_count) {
 514                         so_cache_hw = cached_sock_count;
 515                 }
 516
 517                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 518
 519                 so->cache_timestamp = so_cache_time;
 520                 lck_mtx_unlock(so_cache_mtx);
 521         }
 522 }
 523
 524 void
 525 so_update_last_owner_locked(struct socket *so, proc_t self)
 526 {
 527         if (so->last_pid != 0) {
 528                 /*
 529                  * last_pid and last_upid should remain zero for sockets
 530                  * created using sock_socket. The check above achieves that
 531                  */
 532                 if (self == PROC_NULL) {
 533                         self = current_proc();
 534                 }
 535
 536                 if (so->last_upid != proc_uniqueid(self) ||
 537                     so->last_pid != proc_pid(self)) {
 538                         so->last_upid = proc_uniqueid(self);
 539                         so->last_pid = proc_pid(self);
 540                         proc_getexecutableuuid(self, so->last_uuid,
 541                             sizeof(so->last_uuid));
 542                         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
 543                                 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
 544                         }
 545                 }
 546                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 547         }
 548 }
 549
 550 void
 551 so_update_policy(struct socket *so)
 552 {
 553         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 554                 (void) inp_update_policy(sotoinpcb(so));
 555         }
 556 }
 557
 558 #if NECP
 559 static void
 560 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 561     struct sockaddr *override_remote_addr)
 562 {
 563         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
 564                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 565                     override_remote_addr, 0);
 566         }
 567 }
 568 #endif /* NECP */
 569
 570 boolean_t
 571 so_cache_timer(void)
 572 {
 573         struct socket   *p;
 574         int             n_freed = 0;
 575         boolean_t rc = FALSE;
 576
 577         lck_mtx_lock(so_cache_mtx);
 578         so_cache_timeouts++;
 579         so_cache_time = net_uptime();
 580
 581         while (!STAILQ_EMPTY(&so_cache_head)) {
 582                 VERIFY(cached_sock_count > 0);
 583                 p = STAILQ_FIRST(&so_cache_head);
 584                 if ((so_cache_time - p->cache_timestamp) <
 585                     SO_CACHE_TIME_LIMIT) {
 586                         break;
 587                 }
 588
 589                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 590                 --cached_sock_count;
 591
 592                 zfree(so_cache_zone, p);
 593
 594                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 595                         so_cache_max_freed++;
 596                         break;
 597                 }
 598         }
 599
 600         /* Schedule again if there is more to cleanup */
 601         if (!STAILQ_EMPTY(&so_cache_head)) {
 602                 rc = TRUE;
 603         }
 604
 605         lck_mtx_unlock(so_cache_mtx);
 606         return rc;
 607 }
 608
 609 /*
 610  * Get a socket structure from our zone, and initialize it.
 611  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 612  * Note that it would probably be better to allocate socket
 613  * and PCB at the same time, but I'm not convinced that all
 614  * the protocols can be easily modified to do this.
 615  */
 616 struct socket *
 617 soalloc(int waitok, int dom, int type)
 618 {
 619         zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
 620         struct socket *so;
 621
 622         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 623                 cached_sock_alloc(&so, how);
 624         } else {
 625                 so = zalloc_flags(socket_zone, how | Z_ZERO);
 626         }
 627         if (so != NULL) {
 628                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 629
 630                 /*
 631                  * Increment the socket allocation statistics
 632                  */
 633                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
 634         }
 635
 636         return so;
 637 }
 638
 639 int
 640 socreate_internal(int dom, struct socket **aso, int type, int proto,
 641     struct proc *p, uint32_t flags, struct proc *ep)
 642 {
 643         struct protosw *prp;
 644         struct socket *so;
 645         int error = 0;
 646 #if defined(XNU_TARGET_OS_OSX)
 647         pid_t rpid = -1;
 648 #endif
 649
 650 #if TCPDEBUG
 651         extern int tcpconsdebug;
 652 #endif
 653
 654         VERIFY(aso != NULL);
 655         *aso = NULL;
 656
 657         if (proto != 0) {
 658                 prp = pffindproto(dom, proto, type);
 659         } else {
 660                 prp = pffindtype(dom, type);
 661         }
 662
 663         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 664                 if (pffinddomain(dom) == NULL) {
 665                         return EAFNOSUPPORT;
 666                 }
 667                 if (proto != 0) {
 668                         if (pffindprotonotype(dom, proto) != NULL) {
 669                                 return EPROTOTYPE;
 670                         }
 671                 }
 672                 return EPROTONOSUPPORT;
 673         }
 674         if (prp->pr_type != type) {
 675                 return EPROTOTYPE;
 676         }
 677         so = soalloc(1, dom, type);
 678         if (so == NULL) {
 679                 return ENOBUFS;
 680         }
 681
 682         switch (dom) {
 683         case PF_LOCAL:
 684                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
 685                 break;
 686         case PF_INET:
 687                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
 688                 if (type == SOCK_STREAM) {
 689                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
 690                 } else {
 691                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
 692                 }
 693                 break;
 694         case PF_ROUTE:
 695                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
 696                 break;
 697         case PF_NDRV:
 698                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
 699                 break;
 700         case PF_KEY:
 701                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
 702                 break;
 703         case PF_INET6:
 704                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
 705                 if (type == SOCK_STREAM) {
 706                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
 707                 } else {
 708                         INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
 709                 }
 710                 break;
 711         case PF_SYSTEM:
 712                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
 713                 break;
 714         case PF_MULTIPATH:
 715                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
 716                 break;
 717         default:
 718                 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
 719                 break;
 720         }
 721
 722         if (flags & SOCF_MPTCP) {
 723                 so->so_state |= SS_NBIO;
 724         }
 725
 726         TAILQ_INIT(&so->so_incomp);
 727         TAILQ_INIT(&so->so_comp);
 728         so->so_type = type;
 729         so->last_upid = proc_uniqueid(p);
 730         so->last_pid = proc_pid(p);
 731         proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
 732         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 733
 734         if (ep != PROC_NULL && ep != p) {
 735                 so->e_upid = proc_uniqueid(ep);
 736                 so->e_pid = proc_pid(ep);
 737                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
 738                 so->so_flags |= SOF_DELEGATED;
 739 #if defined(XNU_TARGET_OS_OSX)
 740                 if (ep->p_responsible_pid != so->e_pid) {
 741                         rpid = ep->p_responsible_pid;
 742                 }
 743 #endif
 744         }
 745
 746 #if defined(XNU_TARGET_OS_OSX)
 747         if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
 748                 rpid = p->p_responsible_pid;
 749         }
 750
 751         so->so_rpid = -1;
 752         uuid_clear(so->so_ruuid);
 753         if (rpid >= 0) {
 754                 proc_t rp = proc_find(rpid);
 755                 if (rp != PROC_NULL) {
 756                         proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
 757                         so->so_rpid = rpid;
 758                         proc_rele(rp);
 759                 }
 760         }
 761 #endif
 762
 763         so->so_cred = kauth_cred_proc_ref(p);
 764         if (!suser(kauth_cred_get(), NULL)) {
 765                 so->so_state |= SS_PRIV;
 766         }
 767
 768         so->so_proto = prp;
 769         so->so_rcv.sb_flags |= SB_RECV;
 770         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 771         so->next_lock_lr = 0;
 772         so->next_unlock_lr = 0;
 773
 774         /*
 775          * Attachment will create the per pcb lock if necessary and
 776          * increase refcount for creation, make sure it's done before
 777          * socket is inserted in lists.
 778          */
 779         so->so_usecount++;
 780
 781         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 782         if (error != 0) {
 783                 /*
 784                  * Warning:
 785                  * If so_pcb is not zero, the socket will be leaked,
 786                  * so protocol attachment handler must be coded carefuly
 787                  */
 788                 so->so_state |= SS_NOFDREF;
 789                 VERIFY(so->so_usecount > 0);
 790                 so->so_usecount--;
 791                 sofreelastref(so, 1);   /* will deallocate the socket */
 792                 return error;
 793         }
 794
 795         /*
 796          * Note: needs so_pcb to be set after pru_attach
 797          */
 798         if (prp->pr_update_last_owner != NULL) {
 799                 (*prp->pr_update_last_owner)(so, p, ep);
 800         }
 801
 802         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 803
 804         /* Attach socket filters for this protocol */
 805         sflt_initsock(so);
 806 #if TCPDEBUG
 807         if (tcpconsdebug == 2) {
 808                 so->so_options |= SO_DEBUG;
 809         }
 810 #endif
 811         so_set_default_traffic_class(so);
 812
 813         /*
 814          * If this thread or task is marked to create backgrounded sockets,
 815          * mark the socket as background.
 816          */
 817         if (!(flags & SOCF_MPTCP) &&
 818             proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
 819                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 820                 so->so_background_thread = current_thread();
 821         }
 822
 823         switch (dom) {
 824         /*
 825          * Don't mark Unix domain or system
 826          * eligible for defunct by default.
 827          */
 828         case PF_LOCAL:
 829         case PF_SYSTEM:
 830                 so->so_flags |= SOF_NODEFUNCT;
 831                 break;
 832         default:
 833                 break;
 834         }
 835
 836         /*
 837          * Entitlements can't be checked at socket creation time except if the
 838          * application requested a feature guarded by a privilege (c.f., socket
 839          * delegation).
 840          * The priv(9) and the Sandboxing APIs are designed with the idea that
 841          * a privilege check should only be triggered by a userland request.
 842          * A privilege check at socket creation time is time consuming and
 843          * could trigger many authorisation error messages from the security
 844          * APIs.
 845          */
 846
 847         *aso = so;
 848
 849         return 0;
 850 }
 851
 852 /*
 853  * Returns:     0                       Success
 854  *              EAFNOSUPPORT
 855  *              EPROTOTYPE
 856  *              EPROTONOSUPPORT
 857  *              ENOBUFS
 858  *      <pru_attach>:ENOBUFS[AF_UNIX]
 859  *      <pru_attach>:ENOBUFS[TCP]
 860  *      <pru_attach>:ENOMEM[TCP]
 861  *      <pru_attach>:???                [other protocol families, IPSEC]
 862  */
 863 int
 864 socreate(int dom, struct socket **aso, int type, int proto)
 865 {
 866         return socreate_internal(dom, aso, type, proto, current_proc(), 0,
 867                    PROC_NULL);
 868 }
 869
 870 int
 871 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 872 {
 873         int error = 0;
 874         struct proc *ep = PROC_NULL;
 875
 876         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 877                 error = ESRCH;
 878                 goto done;
 879         }
 880
 881         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 882
 883         /*
 884          * It might not be wise to hold the proc reference when calling
 885          * socreate_internal since it calls soalloc with M_WAITOK
 886          */
 887 done:
 888         if (ep != PROC_NULL) {
 889                 proc_rele(ep);
 890         }
 891
 892         return error;
 893 }
 894
 895 /*
 896  * Returns:     0                       Success
 897  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 898  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 899  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 900  *      <pru_bind>:EINVAL               Invalid argument
 901  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 902  *      <pru_bind>:EACCES               Permission denied
 903  *      <pru_bind>:EADDRINUSE           Address in use
 904  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 905  *      <pru_bind>:EPERM                Operation not permitted
 906  *      <pru_bind>:???
 907  *      <sf_bind>:???
 908  *
 909  * Notes:       It's not possible to fully enumerate the return codes above,
 910  *              since socket filter authors and protocol family authors may
 911  *              not choose to limit their error returns to those listed, even
 912  *              though this may result in some software operating incorrectly.
 913  *
 914  *              The error codes which are enumerated above are those known to
 915  *              be returned by the tcp_usr_bind function supplied.
 916  */
 917 int
 918 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 919 {
 920         struct proc *p = current_proc();
 921         int error = 0;
 922
 923         if (dolock) {
 924                 socket_lock(so, 1);
 925         }
 926
 927         so_update_last_owner_locked(so, p);
 928         so_update_policy(so);
 929
 930 #if NECP
 931         so_update_necp_policy(so, nam, NULL);
 932 #endif /* NECP */
 933
 934         /*
 935          * If this is a bind request on a socket that has been marked
 936          * as inactive, reject it now before we go any further.
 937          */
 938         if (so->so_flags & SOF_DEFUNCT) {
 939                 error = EINVAL;
 940                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 941                     __func__, proc_pid(p), proc_best_name(p),
 942                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 943                     SOCK_DOM(so), SOCK_TYPE(so), error);
 944                 goto out;
 945         }
 946
 947         /* Socket filter */
 948         error = sflt_bind(so, nam);
 949
 950         if (error == 0) {
 951                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 952         }
 953 out:
 954         if (dolock) {
 955                 socket_unlock(so, 1);
 956         }
 957
 958         if (error == EJUSTRETURN) {
 959                 error = 0;
 960         }
 961
 962         return error;
 963 }
 964
 965 void
 966 sodealloc(struct socket *so)
 967 {
 968         kauth_cred_unref(&so->so_cred);
 969
 970         /* Remove any filters */
 971         sflt_termsock(so);
 972
 973 #if CONTENT_FILTER
 974         cfil_sock_detach(so);
 975 #endif /* CONTENT_FILTER */
 976
 977         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 978
 979         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 980                 cached_sock_free(so);
 981         } else {
 982                 zfree(socket_zone, so);
 983         }
 984 }
 985
 986 /*
 987  * Returns:     0                       Success
 988  *              EINVAL
 989  *              EOPNOTSUPP
 990  *      <pru_listen>:EINVAL[AF_UNIX]
 991  *      <pru_listen>:EINVAL[TCP]
 992  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 993  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 994  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 995  *      <pru_listen>:EACCES[TCP]        Permission denied
 996  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 997  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 998  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 999  *      <sf_listen>:???
1000  *
1001  * Notes:       Other <pru_listen> returns depend on the protocol family; all
1002  *              <sf_listen> returns depend on what the filter author causes
1003  *              their filter to return.
1004  */
1005 int
1006 solisten(struct socket *so, int backlog)
1007 {
1008         struct proc *p = current_proc();
1009         int error = 0;
1010
1011         socket_lock(so, 1);
1012
1013         so_update_last_owner_locked(so, p);
1014         so_update_policy(so);
1015
1016 #if NECP
1017         so_update_necp_policy(so, NULL, NULL);
1018 #endif /* NECP */
1019
1020         if (so->so_proto == NULL) {
1021                 error = EINVAL;
1022                 goto out;
1023         }
1024         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1025                 error = EOPNOTSUPP;
1026                 goto out;
1027         }
1028
1029         /*
1030          * If the listen request is made on a socket that is not fully
1031          * disconnected, or on a socket that has been marked as inactive,
1032          * reject the request now.
1033          */
1034         if ((so->so_state &
1035             (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1036             (so->so_flags & SOF_DEFUNCT)) {
1037                 error = EINVAL;
1038                 if (so->so_flags & SOF_DEFUNCT) {
1039                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1040                             "(%d)\n", __func__, proc_pid(p),
1041                             proc_best_name(p),
1042                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1043                             SOCK_DOM(so), SOCK_TYPE(so), error);
1044                 }
1045                 goto out;
1046         }
1047
1048         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1049                 error = EPERM;
1050                 goto out;
1051         }
1052
1053         error = sflt_listen(so);
1054         if (error == 0) {
1055                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1056         }
1057
1058         if (error) {
1059                 if (error == EJUSTRETURN) {
1060                         error = 0;
1061                 }
1062                 goto out;
1063         }
1064
1065         if (TAILQ_EMPTY(&so->so_comp)) {
1066                 so->so_options |= SO_ACCEPTCONN;
1067         }
1068         /*
1069          * POSIX: The implementation may have an upper limit on the length of
1070          * the listen queue-either global or per accepting socket. If backlog
1071          * exceeds this limit, the length of the listen queue is set to the
1072          * limit.
1073          *
1074          * If listen() is called with a backlog argument value that is less
1075          * than 0, the function behaves as if it had been called with a backlog
1076          * argument value of 0.
1077          *
1078          * A backlog argument of 0 may allow the socket to accept connections,
1079          * in which case the length of the listen queue may be set to an
1080          * implementation-defined minimum value.
1081          */
1082         if (backlog <= 0 || backlog > somaxconn) {
1083                 backlog = somaxconn;
1084         }
1085
1086         so->so_qlimit = backlog;
1087 out:
1088         socket_unlock(so, 1);
1089         return error;
1090 }
1091
1092 /*
1093  * The "accept list lock" protects the fields related to the listener queues
1094  * because we can unlock a socket to respect the lock ordering between
1095  * the listener socket and its clients sockets. The lock ordering is first to
1096  * acquire the client socket before the listener socket.
1097  *
1098  * The accept list lock serializes access to the following fields:
1099  * - of the listener socket:
1100  *   - so_comp
1101  *   - so_incomp
1102  *   - so_qlen
1103  *   - so_inqlen
1104  * - of client sockets that are in so_comp or so_incomp:
1105  *   - so_head
1106  *   - so_list
1107  *
1108  * As one can see the accept list lock protects the consistent of the
1109  * linkage of the client sockets.
1110  *
1111  * Note that those fields may be read without holding the accept list lock
1112  * for a preflight provided the accept list lock is taken when committing
1113  * to take an action based on the result of the preflight. The preflight
1114  * saves the cost of doing the unlock/lock dance.
1115  */
1116 void
1117 so_acquire_accept_list(struct socket *head, struct socket *so)
1118 {
1119         lck_mtx_t *mutex_held;
1120
1121         if (head->so_proto->pr_getlock == NULL) {
1122                 return;
1123         }
1124         mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1125         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1126
1127         if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1128                 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1129                 return;
1130         }
1131         if (so != NULL) {
1132                 socket_unlock(so, 0);
1133         }
1134         while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1135                 so_accept_list_waits += 1;
1136                 msleep((caddr_t)&head->so_incomp, mutex_held,
1137                     PSOCK | PCATCH, __func__, NULL);
1138         }
1139         head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1140         if (so != NULL) {
1141                 socket_unlock(head, 0);
1142                 socket_lock(so, 0);
1143                 socket_lock(head, 0);
1144         }
1145 }
1146
1147 void
1148 so_release_accept_list(struct socket *head)
1149 {
1150         if (head->so_proto->pr_getlock != NULL) {
1151                 lck_mtx_t *mutex_held;
1152
1153                 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1154                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1155
1156                 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1157                 wakeup((caddr_t)&head->so_incomp);
1158         }
1159 }
1160
1161 void
1162 sofreelastref(struct socket *so, int dealloc)
1163 {
1164         struct socket *head = so->so_head;
1165
1166         /* Assume socket is locked */
1167
1168         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1169                 selthreadclear(&so->so_snd.sb_sel);
1170                 selthreadclear(&so->so_rcv.sb_sel);
1171                 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1172                 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1173                 so->so_event = sonullevent;
1174                 return;
1175         }
1176         if (head != NULL) {
1177                 /*
1178                  * Need to lock the listener when the protocol has
1179                  * per socket locks
1180                  */
1181                 if (head->so_proto->pr_getlock != NULL) {
1182                         socket_lock(head, 1);
1183                         so_acquire_accept_list(head, so);
1184                 }
1185                 if (so->so_state & SS_INCOMP) {
1186                         so->so_state &= ~SS_INCOMP;
1187                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1188                         head->so_incqlen--;
1189                         head->so_qlen--;
1190                         so->so_head = NULL;
1191
1192                         if (head->so_proto->pr_getlock != NULL) {
1193                                 so_release_accept_list(head);
1194                                 socket_unlock(head, 1);
1195                         }
1196                 } else if (so->so_state & SS_COMP) {
1197                         if (head->so_proto->pr_getlock != NULL) {
1198                                 so_release_accept_list(head);
1199                                 socket_unlock(head, 1);
1200                         }
1201                         /*
1202                          * We must not decommission a socket that's
1203                          * on the accept(2) queue.  If we do, then
1204                          * accept(2) may hang after select(2) indicated
1205                          * that the listening socket was ready.
1206                          */
1207                         selthreadclear(&so->so_snd.sb_sel);
1208                         selthreadclear(&so->so_rcv.sb_sel);
1209                         so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1210                         so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1211                         so->so_event = sonullevent;
1212                         return;
1213                 } else {
1214                         if (head->so_proto->pr_getlock != NULL) {
1215                                 so_release_accept_list(head);
1216                                 socket_unlock(head, 1);
1217                         }
1218                         printf("sofree: not queued\n");
1219                 }
1220         }
1221         sowflush(so);
1222         sorflush(so);
1223
1224 #if FLOW_DIVERT
1225         if (so->so_flags & SOF_FLOW_DIVERT) {
1226                 flow_divert_detach(so);
1227         }
1228 #endif  /* FLOW_DIVERT */
1229
1230         /* 3932268: disable upcall */
1231         so->so_rcv.sb_flags &= ~SB_UPCALL;
1232         so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1233         so->so_event = sonullevent;
1234
1235         if (dealloc) {
1236                 sodealloc(so);
1237         }
1238 }
1239
1240 void
1241 soclose_wait_locked(struct socket *so)
1242 {
1243         lck_mtx_t *mutex_held;
1244
1245         if (so->so_proto->pr_getlock != NULL) {
1246                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1247         } else {
1248                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1249         }
1250         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1251
1252         /*
1253          * Double check here and return if there's no outstanding upcall;
1254          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1255          */
1256         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1257                 return;
1258         }
1259         so->so_rcv.sb_flags &= ~SB_UPCALL;
1260         so->so_snd.sb_flags &= ~SB_UPCALL;
1261         so->so_flags |= SOF_CLOSEWAIT;
1262
1263         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1264             "soclose_wait_locked", NULL);
1265         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1266         so->so_flags &= ~SOF_CLOSEWAIT;
1267 }
1268
1269 /*
1270  * Close a socket on last file table reference removal.
1271  * Initiate disconnect if connected.
1272  * Free socket when disconnect complete.
1273  */
1274 int
1275 soclose_locked(struct socket *so)
1276 {
1277         int error = 0;
1278         struct timespec ts;
1279
1280         if (so->so_usecount == 0) {
1281                 panic("soclose: so=%p refcount=0\n", so);
1282                 /* NOTREACHED */
1283         }
1284
1285         sflt_notify(so, sock_evt_closing, NULL);
1286
1287         if (so->so_upcallusecount) {
1288                 soclose_wait_locked(so);
1289         }
1290
1291 #if CONTENT_FILTER
1292         /*
1293          * We have to wait until the content filters are done
1294          */
1295         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1296                 cfil_sock_close_wait(so);
1297                 cfil_sock_is_closed(so);
1298                 cfil_sock_detach(so);
1299         }
1300 #endif /* CONTENT_FILTER */
1301
1302         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1303                 soresume(current_proc(), so, 1);
1304                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1305         }
1306
1307         if ((so->so_options & SO_ACCEPTCONN)) {
1308                 struct socket *sp, *sonext;
1309                 int persocklock = 0;
1310                 int incomp_overflow_only;
1311
1312                 /*
1313                  * We do not want new connection to be added
1314                  * to the connection queues
1315                  */
1316                 so->so_options &= ~SO_ACCEPTCONN;
1317
1318                 /*
1319                  * We can drop the lock on the listener once
1320                  * we've acquired the incoming list
1321                  */
1322                 if (so->so_proto->pr_getlock != NULL) {
1323                         persocklock = 1;
1324                         so_acquire_accept_list(so, NULL);
1325                         socket_unlock(so, 0);
1326                 }
1327 again:
1328                 incomp_overflow_only = 1;
1329
1330                 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1331                         /*
1332                          * Radar 5350314
1333                          * skip sockets thrown away by tcpdropdropblreq
1334                          * they will get cleanup by the garbage collection.
1335                          * otherwise, remove the incomp socket from the queue
1336                          * and let soabort trigger the appropriate cleanup.
1337                          */
1338                         if (sp->so_flags & SOF_OVERFLOW) {
1339                                 continue;
1340                         }
1341
1342                         if (persocklock != 0) {
1343                                 socket_lock(sp, 1);
1344                         }
1345
1346                         /*
1347                          * Radar 27945981
1348                          * The extra reference for the list insure the
1349                          * validity of the socket pointer when we perform the
1350                          * unlock of the head above
1351                          */
1352                         if (sp->so_state & SS_INCOMP) {
1353                                 sp->so_state &= ~SS_INCOMP;
1354                                 sp->so_head = NULL;
1355                                 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1356                                 so->so_incqlen--;
1357                                 so->so_qlen--;
1358
1359                                 (void) soabort(sp);
1360                         } else {
1361                                 panic("%s sp %p in so_incomp but !SS_INCOMP",
1362                                     __func__, sp);
1363                         }
1364
1365                         if (persocklock != 0) {
1366                                 socket_unlock(sp, 1);
1367                         }
1368                 }
1369
1370                 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1371                         /* Dequeue from so_comp since sofree() won't do it */
1372                         if (persocklock != 0) {
1373                                 socket_lock(sp, 1);
1374                         }
1375
1376                         if (sp->so_state & SS_COMP) {
1377                                 sp->so_state &= ~SS_COMP;
1378                                 sp->so_head = NULL;
1379                                 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1380                                 so->so_qlen--;
1381
1382                                 (void) soabort(sp);
1383                         } else {
1384                                 panic("%s sp %p in so_comp but !SS_COMP",
1385                                     __func__, sp);
1386                         }
1387
1388                         if (persocklock) {
1389                                 socket_unlock(sp, 1);
1390                         }
1391                 }
1392
1393                 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1394 #if (DEBUG | DEVELOPMENT)
1395                         panic("%s head %p so_comp not empty\n", __func__, so);
1396 #endif /* (DEVELOPMENT || DEBUG) */
1397
1398                         goto again;
1399                 }
1400
1401                 if (!TAILQ_EMPTY(&so->so_comp)) {
1402 #if (DEBUG | DEVELOPMENT)
1403                         panic("%s head %p so_comp not empty\n", __func__, so);
1404 #endif /* (DEVELOPMENT || DEBUG) */
1405
1406                         goto again;
1407                 }
1408
1409                 if (persocklock) {
1410                         socket_lock(so, 0);
1411                         so_release_accept_list(so);
1412                 }
1413         }
1414         if (so->so_pcb == NULL) {
1415                 /* 3915887: mark the socket as ready for dealloc */
1416                 so->so_flags |= SOF_PCBCLEARING;
1417                 goto discard;
1418         }
1419         if (so->so_state & SS_ISCONNECTED) {
1420                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1421                         error = sodisconnectlocked(so);
1422                         if (error) {
1423                                 goto drop;
1424                         }
1425                 }
1426                 if (so->so_options & SO_LINGER) {
1427                         lck_mtx_t *mutex_held;
1428
1429                         if ((so->so_state & SS_ISDISCONNECTING) &&
1430                             (so->so_state & SS_NBIO)) {
1431                                 goto drop;
1432                         }
1433                         if (so->so_proto->pr_getlock != NULL) {
1434                                 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1435                         } else {
1436                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1437                         }
1438                         while (so->so_state & SS_ISCONNECTED) {
1439                                 ts.tv_sec = (so->so_linger / 100);
1440                                 ts.tv_nsec = (so->so_linger % 100) *
1441                                     NSEC_PER_USEC * 1000 * 10;
1442                                 error = msleep((caddr_t)&so->so_timeo,
1443                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1444                                 if (error) {
1445                                         /*
1446                                          * It's OK when the time fires,
1447                                          * don't report an error
1448                                          */
1449                                         if (error == EWOULDBLOCK) {
1450                                                 error = 0;
1451                                         }
1452                                         break;
1453                                 }
1454                         }
1455                 }
1456         }
1457 drop:
1458         if (so->so_usecount == 0) {
1459                 panic("soclose: usecount is zero so=%p\n", so);
1460                 /* NOTREACHED */
1461         }
1462         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1463                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1464                 if (error == 0) {
1465                         error = error2;
1466                 }
1467         }
1468         if (so->so_usecount <= 0) {
1469                 panic("soclose: usecount is zero so=%p\n", so);
1470                 /* NOTREACHED */
1471         }
1472 discard:
1473         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1474             (so->so_state & SS_NOFDREF)) {
1475                 panic("soclose: NOFDREF");
1476                 /* NOTREACHED */
1477         }
1478         so->so_state |= SS_NOFDREF;
1479
1480         if ((so->so_flags & SOF_KNOTE) != 0) {
1481                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1482         }
1483
1484         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1485
1486         VERIFY(so->so_usecount > 0);
1487         so->so_usecount--;
1488         sofree(so);
1489         return error;
1490 }
1491
1492 int
1493 soclose(struct socket *so)
1494 {
1495         int error = 0;
1496         socket_lock(so, 1);
1497
1498         if (so->so_retaincnt == 0) {
1499                 error = soclose_locked(so);
1500         } else {
1501                 /*
1502                  * if the FD is going away, but socket is
1503                  * retained in kernel remove its reference
1504                  */
1505                 so->so_usecount--;
1506                 if (so->so_usecount < 2) {
1507                         panic("soclose: retaincnt non null and so=%p "
1508                             "usecount=%d\n", so, so->so_usecount);
1509                 }
1510         }
1511         socket_unlock(so, 1);
1512         return error;
1513 }
1514
1515 /*
1516  * Must be called at splnet...
1517  */
1518 /* Should already be locked */
1519 int
1520 soabort(struct socket *so)
1521 {
1522         int error;
1523
1524 #ifdef MORE_LOCKING_DEBUG
1525         lck_mtx_t *mutex_held;
1526
1527         if (so->so_proto->pr_getlock != NULL) {
1528                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1529         } else {
1530                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1531         }
1532         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1533 #endif
1534
1535         if ((so->so_flags & SOF_ABORTED) == 0) {
1536                 so->so_flags |= SOF_ABORTED;
1537                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1538                 if (error) {
1539                         sofree(so);
1540                         return error;
1541                 }
1542         }
1543         return 0;
1544 }
1545
1546 int
1547 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1548 {
1549         int error;
1550
1551         if (dolock) {
1552                 socket_lock(so, 1);
1553         }
1554
1555         so_update_last_owner_locked(so, PROC_NULL);
1556         so_update_policy(so);
1557 #if NECP
1558         so_update_necp_policy(so, NULL, NULL);
1559 #endif /* NECP */
1560
1561         if ((so->so_state & SS_NOFDREF) == 0) {
1562                 panic("soaccept: !NOFDREF");
1563         }
1564         so->so_state &= ~SS_NOFDREF;
1565         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1566
1567         if (dolock) {
1568                 socket_unlock(so, 1);
1569         }
1570         return error;
1571 }
1572
/*
 * soaccept
 *
 * Locking variant of soacceptlock(): the socket lock is taken and
 * released by the callee.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
        const int dolock = 1;

        return soacceptlock(so, nam, dolock);
}
1578
1579 int
1580 soacceptfilter(struct socket *so, struct socket *head)
1581 {
1582         struct sockaddr *local = NULL, *remote = NULL;
1583         int error = 0;
1584
1585         /*
1586          * Hold the lock even if this socket has not been made visible
1587          * to the filter(s).  For sockets with global locks, this protects
1588          * against the head or peer going away
1589          */
1590         socket_lock(so, 1);
1591         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1592             sogetaddr_locked(so, &local, 0) != 0) {
1593                 so->so_state &= ~SS_NOFDREF;
1594                 socket_unlock(so, 1);
1595                 soclose(so);
1596                 /* Out of resources; try it again next time */
1597                 error = ECONNABORTED;
1598                 goto done;
1599         }
1600
1601         error = sflt_accept(head, so, local, remote);
1602
1603         /*
1604          * If we get EJUSTRETURN from one of the filters, mark this socket
1605          * as inactive and return it anyway.  This newly accepted socket
1606          * will be disconnected later before we hand it off to the caller.
1607          */
1608         if (error == EJUSTRETURN) {
1609                 error = 0;
1610                 (void) sosetdefunct(current_proc(), so,
1611                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1612         }
1613
1614         if (error != 0) {
1615                 /*
1616                  * This may seem like a duplication to the above error
1617                  * handling part when we return ECONNABORTED, except
1618                  * the following is done while holding the lock since
1619                  * the socket has been exposed to the filter(s) earlier.
1620                  */
1621                 so->so_state &= ~SS_NOFDREF;
1622                 socket_unlock(so, 1);
1623                 soclose(so);
1624                 /* Propagate socket filter's error code to the caller */
1625         } else {
1626                 socket_unlock(so, 1);
1627         }
1628 done:
1629         /* Callee checks for NULL pointer */
1630         sock_freeaddr(remote);
1631         sock_freeaddr(local);
1632         return error;
1633 }
1634
1635 /*
1636  * Returns:     0                       Success
1637  *              EOPNOTSUPP              Operation not supported on socket
1638  *              EISCONN                 Socket is connected
1639  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1640  *      <pru_connect>:EINVAL            Invalid argument
1641  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1642  *      <pru_connect>:EACCES            Permission denied
1643  *      <pru_connect>:EADDRINUSE        Address in use
1644  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1645  *      <pru_connect>:EPERM             Operation not permitted
1646  *      <sf_connect_out>:???            [anything a filter writer might set]
1647  */
1648 int
1649 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1650 {
1651         int error;
1652         struct proc *p = current_proc();
1653
1654         if (dolock) {
1655                 socket_lock(so, 1);
1656         }
1657
1658         so_update_last_owner_locked(so, p);
1659         so_update_policy(so);
1660
1661 #if NECP
1662         so_update_necp_policy(so, NULL, nam);
1663 #endif /* NECP */
1664
1665         /*
1666          * If this is a listening socket or if this is a previously-accepted
1667          * socket that has been marked as inactive, reject the connect request.
1668          */
1669         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1670                 error = EOPNOTSUPP;
1671                 if (so->so_flags & SOF_DEFUNCT) {
1672                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1673                             "(%d)\n", __func__, proc_pid(p),
1674                             proc_best_name(p),
1675                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1676                             SOCK_DOM(so), SOCK_TYPE(so), error);
1677                 }
1678                 if (dolock) {
1679                         socket_unlock(so, 1);
1680                 }
1681                 return error;
1682         }
1683
1684         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1685                 if (dolock) {
1686                         socket_unlock(so, 1);
1687                 }
1688                 return EPERM;
1689         }
1690
1691         /*
1692          * If protocol is connection-based, can only connect once.
1693          * Otherwise, if connected, try to disconnect first.
1694          * This allows user to disconnect by connecting to, e.g.,
1695          * a null address.
1696          */
1697         if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1698             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1699             (error = sodisconnectlocked(so)))) {
1700                 error = EISCONN;
1701         } else {
1702                 /*
1703                  * Run connect filter before calling protocol:
1704                  *  - non-blocking connect returns before completion;
1705                  */
1706                 error = sflt_connectout(so, nam);
1707                 if (error != 0) {
1708                         if (error == EJUSTRETURN) {
1709                                 error = 0;
1710                         }
1711                 } else {
1712                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1713                             (so, nam, p);
1714                         if (error != 0) {
1715                                 so->so_state &= ~SS_ISCONNECTING;
1716                         }
1717                 }
1718         }
1719         if (dolock) {
1720                 socket_unlock(so, 1);
1721         }
1722         return error;
1723 }
1724
/*
 * Convenience wrapper around soconnectlock(): always requests locking
 * (dolock argument 1) and returns soconnectlock()'s result unchanged.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
        const int dolock = 1;

        return soconnectlock(so, nam, dolock);
}
1730
1731 /*
1732  * Returns:     0                       Success
1733  *      <pru_connect2>:EINVAL[AF_UNIX]
1734  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1735  *      <pru_connect2>:???              [other protocol families]
1736  *
1737  * Notes:       <pru_connect2> is not supported by [TCP].
1738  */
1739 int
1740 soconnect2(struct socket *so1, struct socket *so2)
1741 {
1742         int error;
1743
1744         socket_lock(so1, 1);
1745         if (so2->so_proto->pr_lock) {
1746                 socket_lock(so2, 1);
1747         }
1748
1749         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1750
1751         socket_unlock(so1, 1);
1752         if (so2->so_proto->pr_lock) {
1753                 socket_unlock(so2, 1);
1754         }
1755         return error;
1756 }
1757
1758 int
1759 soconnectxlocked(struct socket *so, struct sockaddr *src,
1760     struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1761     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1762     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1763 {
1764         int error;
1765
1766         so_update_last_owner_locked(so, p);
1767         so_update_policy(so);
1768
1769         /*
1770          * If this is a listening socket or if this is a previously-accepted
1771          * socket that has been marked as inactive, reject the connect request.
1772          */
1773         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1774                 error = EOPNOTSUPP;
1775                 if (so->so_flags & SOF_DEFUNCT) {
1776                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1777                             "(%d)\n", __func__, proc_pid(p),
1778                             proc_best_name(p),
1779                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1780                             SOCK_DOM(so), SOCK_TYPE(so), error);
1781                 }
1782                 return error;
1783         }
1784
1785         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1786                 return EPERM;
1787         }
1788
1789         /*
1790          * If protocol is connection-based, can only connect once
1791          * unless PR_MULTICONN is set.  Otherwise, if connected,
1792          * try to disconnect first.  This allows user to disconnect
1793          * by connecting to, e.g., a null address.
1794          */
1795         if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1796             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1797             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1798             (error = sodisconnectlocked(so)) != 0)) {
1799                 error = EISCONN;
1800         } else {
1801                 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1802                     (flags & CONNECT_DATA_IDEMPOTENT)) {
1803                         so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1804
1805                         if (flags & CONNECT_DATA_AUTHENTICATED) {
1806                                 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1807                         }
1808                 }
1809
1810                 /*
1811                  * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1812                  * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1813                  * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1814                  * Case 3 allows user to combine write with connect even if they have
1815                  * no use for TFO (such as regular TCP, and UDP).
1816                  * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1817                  */
1818                 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1819                     ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1820                         so->so_flags1 |= SOF1_PRECONNECT_DATA;
1821                 }
1822
1823                 /*
1824                  * If a user sets data idempotent and does not pass an uio, or
1825                  * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
1826                  * SOF1_DATA_IDEMPOTENT.
1827                  */
1828                 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1829                     (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1830                         /* We should return EINVAL instead perhaps. */
1831                         so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1832                 }
1833
1834                 /*
1835                  * Run connect filter before calling protocol:
1836                  *  - non-blocking connect returns before completion;
1837                  */
1838                 error = sflt_connectout(so, dst);
1839                 if (error != 0) {
1840                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1841                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1842                         if (error == EJUSTRETURN) {
1843                                 error = 0;
1844                         }
1845                 } else {
1846                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1847                             (so, src, dst, p, ifscope, aid, pcid,
1848                             flags, arg, arglen, auio, bytes_written);
1849                         if (error != 0) {
1850                                 so->so_state &= ~SS_ISCONNECTING;
1851                                 if (error != EINPROGRESS) {
1852                                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1853                                 }
1854                         }
1855                 }
1856         }
1857
1858         return error;
1859 }
1860
1861 int
1862 sodisconnectlocked(struct socket *so)
1863 {
1864         int error;
1865
1866         if ((so->so_state & SS_ISCONNECTED) == 0) {
1867                 error = ENOTCONN;
1868                 goto bad;
1869         }
1870         if (so->so_state & SS_ISDISCONNECTING) {
1871                 error = EALREADY;
1872                 goto bad;
1873         }
1874
1875         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1876         if (error == 0) {
1877                 sflt_notify(so, sock_evt_disconnected, NULL);
1878         }
1879
1880 bad:
1881         return error;
1882 }
1883
/*
 * Locking version of sodisconnectlocked(): takes the socket lock, calls
 * sodisconnectlocked(), drops the lock and returns its result.
 */
int
sodisconnect(struct socket *so)
{
        int result;

        socket_lock(so, 1);
        result = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return result;
}
1895
1896 int
1897 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1898 {
1899         int error;
1900
1901         /*
1902          * Call the protocol disconnectx handler; let it handle all
1903          * matters related to the connection state of this session.
1904          */
1905         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1906         if (error == 0) {
1907                 /*
1908                  * The event applies only for the session, not for
1909                  * the disconnection of individual subflows.
1910                  */
1911                 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1912                         sflt_notify(so, sock_evt_disconnected, NULL);
1913                 }
1914         }
1915         return error;
1916 }
1917
1918 int
1919 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1920 {
1921         int error;
1922
1923         socket_lock(so, 1);
1924         error = sodisconnectxlocked(so, aid, cid);
1925         socket_unlock(so, 1);
1926         return error;
1927 }
1928
/*
 * Map send/receive flags to the wait argument passed to sblock():
 * SBL_WAIT unless MSG_DONTWAIT is set in (f), in which case 0.
 */
#define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) == 0) ? SBL_WAIT : 0)
1930
1931 /*
1932  * sosendcheck will lock the socket buffer if it isn't locked and
1933  * verify that there is space for the data being inserted.
1934  *
1935  * Returns:     0                       Success
1936  *              EPIPE
1937  *      sblock:EWOULDBLOCK
1938  *      sblock:EINTR
1939  *      sbwait:EBADF
1940  *      sbwait:EINTR
1941  *      [so_error]:???
1942  */
1943 int
1944 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1945     int32_t clen, int32_t atomic, int flags, int *sblocked)
1946 {
1947         int     error = 0;
1948         int32_t space;
1949         int     assumelock = 0;
1950
1951 restart:
1952         if (*sblocked == 0) {
1953                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1954                     so->so_send_filt_thread != 0 &&
1955                     so->so_send_filt_thread == current_thread()) {
1956                         /*
1957                          * We're being called recursively from a filter,
1958                          * allow this to continue. Radar 4150520.
1959                          * Don't set sblocked because we don't want
1960                          * to perform an unlock later.
1961                          */
1962                         assumelock = 1;
1963                 } else {
1964                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1965                         if (error) {
1966                                 if (so->so_flags & SOF_DEFUNCT) {
1967                                         goto defunct;
1968                                 }
1969                                 return error;
1970                         }
1971                         *sblocked = 1;
1972                 }
1973         }
1974
1975         /*
1976          * If a send attempt is made on a socket that has been marked
1977          * as inactive (disconnected), reject the request.
1978          */
1979         if (so->so_flags & SOF_DEFUNCT) {
1980 defunct:
1981                 error = EPIPE;
1982                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1983                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1984                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1985                     SOCK_DOM(so), SOCK_TYPE(so), error);
1986                 return error;
1987         }
1988
1989         if (so->so_state & SS_CANTSENDMORE) {
1990 #if CONTENT_FILTER
1991                 /*
1992                  * Can re-inject data of half closed connections
1993                  */
1994                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1995                     so->so_snd.sb_cfil_thread == current_thread() &&
1996                     cfil_sock_data_pending(&so->so_snd) != 0) {
1997                         CFIL_LOG(LOG_INFO,
1998                             "so %llx ignore SS_CANTSENDMORE",
1999                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2000                 } else
2001 #endif /* CONTENT_FILTER */
2002                 return EPIPE;
2003         }
2004         if (so->so_error) {
2005                 error = so->so_error;
2006                 so->so_error = 0;
2007                 return error;
2008         }
2009
2010         if ((so->so_state & SS_ISCONNECTED) == 0) {
2011                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2012                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2013                             (resid != 0 || clen == 0) &&
2014                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2015                                 return ENOTCONN;
2016                         }
2017                 } else if (addr == 0) {
2018                         return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2019                                ENOTCONN : EDESTADDRREQ;
2020                 }
2021         }
2022
2023         space = sbspace(&so->so_snd);
2024
2025         if (flags & MSG_OOB) {
2026                 space += 1024;
2027         }
2028         if ((atomic && resid > so->so_snd.sb_hiwat) ||
2029             clen > so->so_snd.sb_hiwat) {
2030                 return EMSGSIZE;
2031         }
2032
2033         if ((space < resid + clen &&
2034             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2035             space < clen)) ||
2036             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2037                 /*
2038                  * don't block the connectx call when there's more data
2039                  * than can be copied.
2040                  */
2041                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2042                         if (space == 0) {
2043                                 return EWOULDBLOCK;
2044                         }
2045                         if (space < (int32_t)so->so_snd.sb_lowat) {
2046                                 return 0;
2047                         }
2048                 }
2049                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2050                     assumelock) {
2051                         return EWOULDBLOCK;
2052                 }
2053                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
2054                 *sblocked = 0;
2055                 error = sbwait(&so->so_snd);
2056                 if (error) {
2057                         if (so->so_flags & SOF_DEFUNCT) {
2058                                 goto defunct;
2059                         }
2060                         return error;
2061                 }
2062                 goto restart;
2063         }
2064         return 0;
2065 }
2066
2067 /*
2068  * Send on a socket.
2069  * If send must go all at once and message is larger than
2070  * send buffering, then hard error.
2071  * Lock against other senders.
2072  * If must go all at once and not enough room now, then
2073  * inform user that this would block and do nothing.
2074  * Otherwise, if nonblocking, send as much as possible.
2075  * The data to be sent is described by "uio" if nonzero,
2076  * otherwise by the mbuf chain "top" (which must be null
2077  * if uio is not).  Data provided in mbuf chain must be small
2078  * enough to send all at once.
2079  *
2080  * Returns nonzero on error, timeout or signal; callers
2081  * must check for short counts if EINTR/ERESTART are returned.
2082  * Data and control buffers are freed on return.
2083  *
2084  * Returns:     0                       Success
2085  *              EOPNOTSUPP
2086  *              EINVAL
2087  *              ENOBUFS
2088  *      uiomove:EFAULT
2089  *      sosendcheck:EPIPE
2090  *      sosendcheck:EWOULDBLOCK
2091  *      sosendcheck:EINTR
2092  *      sosendcheck:EBADF
2093  *      sosendcheck:EINTR
2094  *      sosendcheck:???                 [value from so_error]
2095  *      <pru_send>:ECONNRESET[TCP]
2096  *      <pru_send>:EINVAL[TCP]
2097  *      <pru_send>:ENOBUFS[TCP]
2098  *      <pru_send>:EADDRINUSE[TCP]
2099  *      <pru_send>:EADDRNOTAVAIL[TCP]
2100  *      <pru_send>:EAFNOSUPPORT[TCP]
2101  *      <pru_send>:EACCES[TCP]
2102  *      <pru_send>:EAGAIN[TCP]
2103  *      <pru_send>:EPERM[TCP]
2104  *      <pru_send>:EMSGSIZE[TCP]
2105  *      <pru_send>:EHOSTUNREACH[TCP]
2106  *      <pru_send>:ENETUNREACH[TCP]
2107  *      <pru_send>:ENETDOWN[TCP]
2108  *      <pru_send>:ENOMEM[TCP]
2109  *      <pru_send>:ENOBUFS[TCP]
2110  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
2111  *      <pru_send>:EINVAL[AF_UNIX]
2112  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
2113  *      <pru_send>:EPIPE[AF_UNIX]
2114  *      <pru_send>:ENOTCONN[AF_UNIX]
2115  *      <pru_send>:EISCONN[AF_UNIX]
2116  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
2117  *      <sf_data_out>:???               [whatever a filter author chooses]
2118  *
2119  * Notes:       Other <pru_send> returns depend on the protocol family; all
2120  *              <sf_data_out> returns depend on what the filter author causes
2121  *              their filter to return.
2122  */
2123 int
2124 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2125     struct mbuf *top, struct mbuf *control, int flags)
2126 {
2127         struct mbuf **mp;
2128         struct mbuf *m, *freelist = NULL;
2129         user_ssize_t space, len, resid, orig_resid;
2130         int clen = 0, error, dontroute, mlen, sendflags;
2131         int atomic = sosendallatonce(so) || top;
2132         int sblocked = 0;
2133         struct proc *p = current_proc();
2134         uint16_t headroom = 0;
2135         boolean_t en_tracing = FALSE;
2136
2137         if (uio != NULL) {
2138                 resid = uio_resid(uio);
2139         } else {
2140                 resid = top->m_pkthdr.len;
2141         }
2142
2143         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2144             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2145
2146         socket_lock(so, 1);
2147
2148         /*
2149          * Trace only if tracing is enabled, the socket is a network
2150          * (vs. unix) socket, and the last output interface is non-loopback.
2151          */
2152         if (ENTR_SHOULDTRACE &&
2153             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2154                 struct inpcb *inp = sotoinpcb(so);
2155                 if (inp->inp_last_outifp != NULL &&
2156                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2157                         en_tracing = TRUE;
2158                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2159                             VM_KERNEL_ADDRPERM(so),
2160                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2161                             (int64_t)resid);
2162                         orig_resid = resid;
2163                 }
2164         }
2165
2166         /*
2167          * Re-injection should not affect process accounting
2168          */
2169         if ((flags & MSG_SKIPCFIL) == 0) {
2170                 so_update_last_owner_locked(so, p);
2171                 so_update_policy(so);
2172
2173 #if NECP
2174                 so_update_necp_policy(so, NULL, addr);
2175 #endif /* NECP */
2176         }
2177
2178         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2179                 error = EOPNOTSUPP;
2180                 goto out_locked;
2181         }
2182
2183         /*
2184          * In theory resid should be unsigned.
2185          * However, space must be signed, as it might be less than 0
2186          * if we over-committed, and we must use a signed comparison
2187          * of space and resid.  On the other hand, a negative resid
2188          * causes us to loop sending 0-length segments to the protocol.
2189          *
2190          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2191          *
2192          * Note: We limit resid to be a positive int value as we use
2193          * imin() to set bytes_to_copy -- radr://14558484
2194          */
2195         if (resid < 0 || resid > INT_MAX ||
2196             (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2197                 error = EINVAL;
2198                 goto out_locked;
2199         }
2200
2201         dontroute = (flags & MSG_DONTROUTE) &&
2202             (so->so_options & SO_DONTROUTE) == 0 &&
2203             (so->so_proto->pr_flags & PR_ATOMIC);
2204         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2205
2206         if (control != NULL) {
2207                 clen = control->m_len;
2208         }
2209
2210         if (soreserveheadroom != 0) {
2211                 headroom = so->so_pktheadroom;
2212         }
2213
2214         do {
2215                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2216                     &sblocked);
2217                 if (error) {
2218                         goto out_locked;
2219                 }
2220
2221                 mp = &top;
2222                 space = sbspace(&so->so_snd) - clen;
2223                 space += ((flags & MSG_OOB) ? 1024 : 0);
2224
2225                 do {
2226                         if (uio == NULL) {
2227                                 /*
2228                                  * Data is prepackaged in "top".
2229                                  */
2230                                 resid = 0;
2231                                 if (flags & MSG_EOR) {
2232                                         top->m_flags |= M_EOR;
2233                                 }
2234                         } else {
2235                                 int chainlength;
2236                                 int bytes_to_copy;
2237                                 boolean_t jumbocl;
2238                                 boolean_t bigcl;
2239                                 int bytes_to_alloc;
2240
2241                                 bytes_to_copy = imin(resid, space);
2242
2243                                 bytes_to_alloc = bytes_to_copy;
2244                                 if (top == NULL) {
2245                                         bytes_to_alloc += headroom;
2246                                 }
2247
2248                                 if (sosendminchain > 0) {
2249                                         chainlength = 0;
2250                                 } else {
2251                                         chainlength = sosendmaxchain;
2252                                 }
2253
2254                                 /*
2255                                  * Use big 4 KB cluster when the outgoing interface
2256                                  * does not prefer 2 KB clusters
2257                                  */
2258                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2259                                     sosendbigcl_ignore_capab;
2260
2261                                 /*
2262                                  * Attempt to use larger than system page-size
2263                                  * clusters for large writes only if there is
2264                                  * a jumbo cluster pool and if the socket is
2265                                  * marked accordingly.
2266                                  */
2267                                 jumbocl = sosendjcl && njcl > 0 &&
2268                                     ((so->so_flags & SOF_MULTIPAGES) ||
2269                                     sosendjcl_ignore_capab) &&
2270                                     bigcl;
2271
2272                                 socket_unlock(so, 0);
2273
2274                                 do {
2275                                         int num_needed;
2276                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2277
2278                                         /*
2279                                          * Try to maintain a local cache of mbuf
2280                                          * clusters needed to complete this
2281                                          * write.  The list is further limited to
2282                                          * the number that are currently needed
2283                                          * to fill the socket.  This mechanism
2284                                          * allows a large number of mbufs/
2285                                          * clusters to be grabbed under a single
2286                                          * mbuf lock.  If we can't get any
2287                                          * clusters, then fall back to trying
2288                                          * for mbufs.  If we fail early (or
2289                                          * miscalculate the number needed), make
2290                                          * sure to release any clusters we
2291                                          * haven't yet consumed.
2292                                          */
2293                                         if (freelist == NULL &&
2294                                             bytes_to_alloc > MBIGCLBYTES &&
2295                                             jumbocl) {
2296                                                 num_needed =
2297                                                     bytes_to_alloc / M16KCLBYTES;
2298
2299                                                 if ((bytes_to_alloc -
2300                                                     (num_needed * M16KCLBYTES))
2301                                                     >= MINCLSIZE) {
2302                                                         num_needed++;
2303                                                 }
2304
2305                                                 freelist =
2306                                                     m_getpackets_internal(
2307                                                         (unsigned int *)&num_needed,
2308                                                         hdrs_needed, M_WAIT, 0,
2309                                                         M16KCLBYTES);
2310                                                 /*
2311                                                  * Fall back to 4K cluster size
2312                                                  * if allocation failed
2313                                                  */
2314                                         }
2315
2316                                         if (freelist == NULL &&
2317                                             bytes_to_alloc > MCLBYTES &&
2318                                             bigcl) {
2319                                                 num_needed =
2320                                                     bytes_to_alloc / MBIGCLBYTES;
2321
2322                                                 if ((bytes_to_alloc -
2323                                                     (num_needed * MBIGCLBYTES)) >=
2324                                                     MINCLSIZE) {
2325                                                         num_needed++;
2326                                                 }
2327
2328                                                 freelist =
2329                                                     m_getpackets_internal(
2330                                                         (unsigned int *)&num_needed,
2331                                                         hdrs_needed, M_WAIT, 0,
2332                                                         MBIGCLBYTES);
2333                                                 /*
2334                                                  * Fall back to cluster size
2335                                                  * if allocation failed
2336                                                  */
2337                                         }
2338
2339                                         /*
2340                                          * Allocate a cluster, as we want to
2341                                          * avoid splitting the data into more
2342                                          * than one segment, and using MINCLSIZE
2343                                          * would lead us to allocate two mbufs.
2344                                          */
2345                                         if (soreserveheadroom != 0 &&
2346                                             freelist == NULL &&
2347                                             ((top == NULL &&
2348                                             bytes_to_alloc > _MHLEN) ||
2349                                             bytes_to_alloc > _MLEN)) {
2350                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2351                                                     MCLBYTES;
2352                                                 freelist =
2353                                                     m_getpackets_internal(
2354                                                         (unsigned int *)&num_needed,
2355                                                         hdrs_needed, M_WAIT, 0,
2356                                                         MCLBYTES);
2357                                                 /*
2358                                                  * Fall back to a single mbuf
2359                                                  * if allocation failed
2360                                                  */
2361                                         } else if (freelist == NULL &&
2362                                             bytes_to_alloc > MINCLSIZE) {
2363                                                 num_needed =
2364                                                     bytes_to_alloc / MCLBYTES;
2365
2366                                                 if ((bytes_to_alloc -
2367                                                     (num_needed * MCLBYTES)) >=
2368                                                     MINCLSIZE) {
2369                                                         num_needed++;
2370                                                 }
2371
2372                                                 freelist =
2373                                                     m_getpackets_internal(
2374                                                         (unsigned int *)&num_needed,
2375                                                         hdrs_needed, M_WAIT, 0,
2376                                                         MCLBYTES);
2377                                                 /*
2378                                                  * Fall back to a single mbuf
2379                                                  * if allocation failed
2380                                                  */
2381                                         }
2382                                         /*
2383                                          * For datagram protocols, leave
2384                                          * headroom for protocol headers
2385                                          * in the first cluster of the chain
2386                                          */
2387                                         if (freelist != NULL && atomic &&
2388                                             top == NULL && headroom > 0) {
2389                                                 freelist->m_data += headroom;
2390                                         }
2391
2392                                         /*
2393                                          * Fall back to regular mbufs without
2394                                          * reserving the socket headroom
2395                                          */
2396                                         if (freelist == NULL) {
2397                                                 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2398                                                         if (top == NULL) {
2399                                                                 MGETHDR(freelist,
2400                                                                     M_WAIT, MT_DATA);
2401                                                         } else {
2402                                                                 MGET(freelist,
2403                                                                     M_WAIT, MT_DATA);
2404                                                         }
2405                                                 }
2406
2407                                                 if (freelist == NULL) {
2408                                                         error = ENOBUFS;
2409                                                         socket_lock(so, 0);
2410                                                         goto out_locked;
2411                                                 }
2412                                                 /*
2413                                                  * For datagram protocols,
2414                                                  * leave room for protocol
2415                                                  * headers in first mbuf.
2416                                                  */
2417                                                 if (atomic && top == NULL &&
2418                                                     bytes_to_copy < MHLEN) {
2419                                                         MH_ALIGN(freelist,
2420                                                             bytes_to_copy);
2421                                                 }
2422                                         }
2423                                         m = freelist;
2424                                         freelist = m->m_next;
2425                                         m->m_next = NULL;
2426
2427                                         if ((m->m_flags & M_EXT)) {
2428                                                 mlen = m->m_ext.ext_size -
2429                                                     M_LEADINGSPACE(m);
2430                                         } else if ((m->m_flags & M_PKTHDR)) {
2431                                                 mlen =
2432                                                     MHLEN - M_LEADINGSPACE(m);
2433                                         } else {
2434                                                 mlen = MLEN - M_LEADINGSPACE(m);
2435                                         }
2436                                         len = imin(mlen, bytes_to_copy);
2437
2438                                         chainlength += len;
2439
2440                                         space -= len;
2441
2442                                         error = uiomove(mtod(m, caddr_t),
2443                                             len, uio);
2444
2445                                         resid = uio_resid(uio);
2446
2447                                         m->m_len = len;
2448                                         *mp = m;
2449                                         top->m_pkthdr.len += len;
2450                                         if (error) {
2451                                                 break;
2452                                         }
2453                                         mp = &m->m_next;
2454                                         if (resid <= 0) {
2455                                                 if (flags & MSG_EOR) {
2456                                                         top->m_flags |= M_EOR;
2457                                                 }
2458                                                 break;
2459                                         }
2460                                         bytes_to_copy = min(resid, space);
2461                                 } while (space > 0 &&
2462                                     (chainlength < sosendmaxchain || atomic ||
2463                                     resid < MINCLSIZE));
2464
2465                                 socket_lock(so, 0);
2466
2467                                 if (error) {
2468                                         goto out_locked;
2469                                 }
2470                         }
2471
2472                         if (dontroute) {
2473                                 so->so_options |= SO_DONTROUTE;
2474                         }
2475
2476                         /*
2477                          * Compute flags here, for pru_send and NKEs
2478                          *
2479                          * If the user set MSG_EOF, the protocol
2480                          * understands this flag and nothing left to
2481                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2482                          */
2483                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2484                             ((flags & MSG_EOF) &&
2485                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2486                             (resid <= 0)) ? PRUS_EOF :
2487                             /* If there is more to send set PRUS_MORETOCOME */
2488                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2489
2490                         if ((flags & MSG_SKIPCFIL) == 0) {
2491                                 /*
2492                                  * Socket filter processing
2493                                  */
2494                                 error = sflt_data_out(so, addr, &top,
2495                                     &control, (sendflags & MSG_OOB) ?
2496                                     sock_data_filt_flag_oob : 0);
2497                                 if (error) {
2498                                         if (error == EJUSTRETURN) {
2499                                                 error = 0;
2500                                                 clen = 0;
2501                                                 control = NULL;
2502                                                 top = NULL;
2503                                         }
2504                                         goto out_locked;
2505                                 }
2506 #if CONTENT_FILTER
2507                                 /*
2508                                  * Content filter processing
2509                                  */
2510                                 error = cfil_sock_data_out(so, addr, top,
2511                                     control, sendflags);
2512                                 if (error) {
2513                                         if (error == EJUSTRETURN) {
2514                                                 error = 0;
2515                                                 clen = 0;
2516                                                 control = NULL;
2517                                                 top = NULL;
2518                                         }
2519                                         goto out_locked;
2520                                 }
2521 #endif /* CONTENT_FILTER */
2522                         }
2523                         error = (*so->so_proto->pr_usrreqs->pru_send)
2524                             (so, sendflags, top, addr, control, p);
2525
2526                         if (dontroute) {
2527                                 so->so_options &= ~SO_DONTROUTE;
2528                         }
2529
2530                         clen = 0;
2531                         control = NULL;
2532                         top = NULL;
2533                         mp = &top;
2534                         if (error) {
2535                                 goto out_locked;
2536                         }
2537                 } while (resid && space > 0);
2538         } while (resid);
2539
2540 out_locked:
2541         if (sblocked) {
2542                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2543         } else {
2544                 socket_unlock(so, 1);
2545         }
2546         if (top != NULL) {
2547                 m_freem(top);
2548         }
2549         if (control != NULL) {
2550                 m_freem(control);
2551         }
2552         if (freelist != NULL) {
2553                 m_freem_list(freelist);
2554         }
2555
2556         soclearfastopen(so);
2557
2558         if (en_tracing) {
2559                 /* resid passed here is the bytes left in uio */
2560                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2561                     VM_KERNEL_ADDRPERM(so),
2562                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2563                     (int64_t)(orig_resid - resid));
2564         }
2565         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2566             so->so_snd.sb_cc, space, error);
2567
2568         return error;
2569 }
2570
2571 int
2572 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2573 {
2574         struct mbuf *m0 = NULL, *control_end = NULL;
2575
2576         socket_lock_assert_owned(so);
2577
2578         /*
2579          * top must points to mbuf chain to be sent.
2580          * If control is not NULL, top must be packet header
2581          */
2582         VERIFY(top != NULL &&
2583             (control == NULL || top->m_flags & M_PKTHDR));
2584
2585         /*
2586          * If control is not passed in, see if we can get it
2587          * from top.
2588          */
2589         if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2590                 // Locate start of control if present and start of data
2591                 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2592                         if (m0->m_flags & M_PKTHDR) {
2593                                 top = m0;
2594                                 break;
2595                         } else if (m0->m_type == MT_CONTROL) {
2596                                 if (control == NULL) {
2597                                         // Found start of control
2598                                         control = m0;
2599                                 }
2600                                 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2601                                         // Found end of control
2602                                         control_end = m0;
2603                                 }
2604                         }
2605                 }
2606                 if (control_end != NULL) {
2607                         control_end->m_next = NULL;
2608                 }
2609         }
2610
2611         int error = (*so->so_proto->pr_usrreqs->pru_send)
2612             (so, sendflags, top, addr, control, current_proc());
2613
2614         return error;
2615 }
2616
2617 /*
2618  * Supported only connected sockets (no address) without ancillary data
2619  * (control mbuf) for atomic protocols
2620  */
2621 int
2622 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2623 {
2624         struct mbuf *m, *freelist = NULL;
2625         user_ssize_t len, resid;
2626         int error, dontroute, mlen;
2627         int atomic = sosendallatonce(so);
2628         int sblocked = 0;
2629         struct proc *p = current_proc();
2630         u_int uiofirst = 0;
2631         u_int uiolast = 0;
2632         struct mbuf *top = NULL;
2633         uint16_t headroom = 0;
2634         boolean_t bigcl;
2635
2636         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2637             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2638
2639         if (so->so_type != SOCK_DGRAM) {
2640                 error = EINVAL;
2641                 goto out;
2642         }
2643         if (atomic == 0) {
2644                 error = EINVAL;
2645                 goto out;
2646         }
2647         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2648                 error = EPROTONOSUPPORT;
2649                 goto out;
2650         }
2651         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2652                 error = EINVAL;
2653                 goto out;
2654         }
2655         resid = uio_array_resid(uioarray, uiocnt);
2656
2657         /*
2658          * In theory resid should be unsigned.
2659          * However, space must be signed, as it might be less than 0
2660          * if we over-committed, and we must use a signed comparison
2661          * of space and resid.  On the other hand, a negative resid
2662          * causes us to loop sending 0-length segments to the protocol.
2663          *
2664          * Note: We limit resid to be a positive int value as we use
2665          * imin() to set bytes_to_copy -- radr://14558484
2666          */
2667         if (resid < 0 || resid > INT_MAX) {
2668                 error = EINVAL;
2669                 goto out;
2670         }
2671
2672         socket_lock(so, 1);
2673         so_update_last_owner_locked(so, p);
2674         so_update_policy(so);
2675
2676 #if NECP
2677         so_update_necp_policy(so, NULL, NULL);
2678 #endif /* NECP */
2679
2680         dontroute = (flags & MSG_DONTROUTE) &&
2681             (so->so_options & SO_DONTROUTE) == 0 &&
2682             (so->so_proto->pr_flags & PR_ATOMIC);
2683         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2684
2685         error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2686         if (error) {
2687                 goto release;
2688         }
2689
2690         /*
2691          * Use big 4 KB clusters when the outgoing interface does not prefer
2692          * 2 KB clusters
2693          */
2694         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2695
2696         if (soreserveheadroom != 0) {
2697                 headroom = so->so_pktheadroom;
2698         }
2699
2700         do {
2701                 int i;
2702                 int num_needed = 0;
2703                 int chainlength;
2704                 size_t maxpktlen = 0;
2705                 int bytes_to_alloc;
2706
2707                 if (sosendminchain > 0) {
2708                         chainlength = 0;
2709                 } else {
2710                         chainlength = sosendmaxchain;
2711                 }
2712
2713                 socket_unlock(so, 0);
2714
2715                 /*
2716                  * Find a set of uio that fit in a reasonable number
2717                  * of mbuf packets
2718                  */
2719                 for (i = uiofirst; i < uiocnt; i++) {
2720                         struct uio *auio = uioarray[i];
2721
2722                         len = uio_resid(auio);
2723
2724                         /* Do nothing for empty messages */
2725                         if (len == 0) {
2726                                 continue;
2727                         }
2728
2729                         num_needed += 1;
2730                         uiolast += 1;
2731
2732                         if (len > maxpktlen) {
2733                                 maxpktlen = len;
2734                         }
2735
2736                         chainlength += len;
2737                         if (chainlength > sosendmaxchain) {
2738                                 break;
2739                         }
2740                 }
2741                 /*
2742                  * Nothing left to send
2743                  */
2744                 if (num_needed == 0) {
2745                         socket_lock(so, 0);
2746                         break;
2747                 }
2748                 /*
2749                  * Allocate buffer large enough to include headroom space for
2750                  * network and link header
2751                  *
2752                  */
2753                 bytes_to_alloc = maxpktlen + headroom;
2754
2755                 /*
2756                  * Allocate a single contiguous buffer of the smallest available
2757                  * size when possible
2758                  */
2759                 if (bytes_to_alloc > MCLBYTES &&
2760                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2761                         freelist = m_getpackets_internal(
2762                                 (unsigned int *)&num_needed,
2763                                 num_needed, M_WAIT, 1,
2764                                 MBIGCLBYTES);
2765                 } else if (bytes_to_alloc > _MHLEN &&
2766                     bytes_to_alloc <= MCLBYTES) {
2767                         freelist = m_getpackets_internal(
2768                                 (unsigned int *)&num_needed,
2769                                 num_needed, M_WAIT, 1,
2770                                 MCLBYTES);
2771                 } else {
2772                         freelist = m_allocpacket_internal(
2773                                 (unsigned int *)&num_needed,
2774                                 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2775                 }
2776
2777                 if (freelist == NULL) {
2778                         socket_lock(so, 0);
2779                         error = ENOMEM;
2780                         goto release;
2781                 }
2782                 /*
2783                  * Copy each uio of the set into its own mbuf packet
2784                  */
2785                 for (i = uiofirst, m = freelist;
2786                     i < uiolast && m != NULL;
2787                     i++) {
2788                         int bytes_to_copy;
2789                         struct mbuf *n;
2790                         struct uio *auio = uioarray[i];
2791
2792                         bytes_to_copy = uio_resid(auio);
2793
2794                         /* Do nothing for empty messages */
2795                         if (bytes_to_copy == 0) {
2796                                 continue;
2797                         }
2798                         /*
2799                          * Leave headroom for protocol headers
2800                          * in the first mbuf of the chain
2801                          */
2802                         m->m_data += headroom;
2803
2804                         for (n = m; n != NULL; n = n->m_next) {
2805                                 if ((m->m_flags & M_EXT)) {
2806                                         mlen = m->m_ext.ext_size -
2807                                             M_LEADINGSPACE(m);
2808                                 } else if ((m->m_flags & M_PKTHDR)) {
2809                                         mlen =
2810                                             MHLEN - M_LEADINGSPACE(m);
2811                                 } else {
2812                                         mlen = MLEN - M_LEADINGSPACE(m);
2813                                 }
2814                                 len = imin(mlen, bytes_to_copy);
2815
2816                                 /*
2817                                  * Note: uiomove() decrements the iovec
2818                                  * length
2819                                  */
2820                                 error = uiomove(mtod(n, caddr_t),
2821                                     len, auio);
2822                                 if (error != 0) {
2823                                         break;
2824                                 }
2825                                 n->m_len = len;
2826                                 m->m_pkthdr.len += len;
2827
2828                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2829
2830                                 bytes_to_copy -= len;
2831                                 resid -= len;
2832                         }
2833                         if (m->m_pkthdr.len == 0) {
2834                                 printf(
2835                                         "%s:%d so %llx pkt %llx type %u len null\n",
2836                                         __func__, __LINE__,
2837                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2838                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2839                                         m->m_type);
2840                         }
2841                         if (error != 0) {
2842                                 break;
2843                         }
2844                         m = m->m_nextpkt;
2845                 }
2846
2847                 socket_lock(so, 0);
2848
2849                 if (error) {
2850                         goto release;
2851                 }
2852                 top = freelist;
2853                 freelist = NULL;
2854
2855                 if (dontroute) {
2856                         so->so_options |= SO_DONTROUTE;
2857                 }
2858
2859                 if ((flags & MSG_SKIPCFIL) == 0) {
2860                         struct mbuf **prevnextp = NULL;
2861
2862                         for (i = uiofirst, m = top;
2863                             i < uiolast && m != NULL;
2864                             i++) {
2865                                 struct mbuf *nextpkt = m->m_nextpkt;
2866
2867                                 /*
2868                                  * Socket filter processing
2869                                  */
2870                                 error = sflt_data_out(so, NULL, &m,
2871                                     NULL, 0);
2872                                 if (error != 0 && error != EJUSTRETURN) {
2873                                         goto release;
2874                                 }
2875
2876 #if CONTENT_FILTER
2877                                 if (error == 0) {
2878                                         /*
2879                                          * Content filter processing
2880                                          */
2881                                         error = cfil_sock_data_out(so, NULL, m,
2882                                             NULL, 0);
2883                                         if (error != 0 && error != EJUSTRETURN) {
2884                                                 goto release;
2885                                         }
2886                                 }
2887 #endif /* CONTENT_FILTER */
2888                                 /*
2889                                  * Remove packet from the list when
2890                                  * swallowed by a filter
2891                                  */
2892                                 if (error == EJUSTRETURN) {
2893                                         error = 0;
2894                                         if (prevnextp != NULL) {
2895                                                 *prevnextp = nextpkt;
2896                                         } else {
2897                                                 top = nextpkt;
2898                                         }
2899                                 }
2900
2901                                 m = nextpkt;
2902                                 if (m != NULL) {
2903                                         prevnextp = &m->m_nextpkt;
2904                                 }
2905                         }
2906                 }
2907                 if (top != NULL) {
2908                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2909                             (so, 0, top, NULL, NULL, p);
2910                 }
2911
2912                 if (dontroute) {
2913                         so->so_options &= ~SO_DONTROUTE;
2914                 }
2915
2916                 top = NULL;
2917                 uiofirst = uiolast;
2918         } while (resid > 0 && error == 0);
2919 release:
2920         if (sblocked) {
2921                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2922         } else {
2923                 socket_unlock(so, 1);
2924         }
2925 out:
2926         if (top != NULL) {
2927                 m_freem(top);
2928         }
2929         if (freelist != NULL) {
2930                 m_freem_list(freelist);
2931         }
2932
2933         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2934             so->so_snd.sb_cc, 0, error);
2935
2936         return error;
2937 }
2938
2939 /*
2940  * May return ERESTART when packet is dropped by MAC policy check
2941  */
2942 static int
2943 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2944     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2945 {
2946         int error = 0;
2947         struct mbuf *m = *mp;
2948         struct mbuf *nextrecord = *nextrecordp;
2949
2950         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2951 #if CONFIG_MACF_SOCKET_SUBSET
2952         /*
2953          * Call the MAC framework for policy checking if we're in
2954          * the user process context and the socket isn't connected.
2955          */
2956         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2957                 struct mbuf *m0 = m;
2958                 /*
2959                  * Dequeue this record (temporarily) from the receive
2960                  * list since we're about to drop the socket's lock
2961                  * where a new record may arrive and be appended to
2962                  * the list.  Upon MAC policy failure, the record
2963                  * will be freed.  Otherwise, we'll add it back to
2964                  * the head of the list.  We cannot rely on SB_LOCK
2965                  * because append operation uses the socket's lock.
2966                  */
2967                 do {
2968                         m->m_nextpkt = NULL;
2969                         sbfree(&so->so_rcv, m);
2970                         m = m->m_next;
2971                 } while (m != NULL);
2972                 m = m0;
2973                 so->so_rcv.sb_mb = nextrecord;
2974                 SB_EMPTY_FIXUP(&so->so_rcv);
2975                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2976                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2977                 socket_unlock(so, 0);
2978
2979                 if (mac_socket_check_received(proc_ucred(p), so,
2980                     mtod(m, struct sockaddr *)) != 0) {
2981                         /*
2982                          * MAC policy failure; free this record and
2983                          * process the next record (or block until
2984                          * one is available).  We have adjusted sb_cc
2985                          * and sb_mbcnt above so there is no need to
2986                          * call sbfree() again.
2987                          */
2988                         m_freem(m);
2989                         /*
2990                          * Clear SB_LOCK but don't unlock the socket.
2991                          * Process the next record or wait for one.
2992                          */
2993                         socket_lock(so, 0);
2994                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2995                         error = ERESTART;
2996                         goto done;
2997                 }
2998                 socket_lock(so, 0);
2999                 /*
3000                  * If the socket has been defunct'd, drop it.
3001                  */
3002                 if (so->so_flags & SOF_DEFUNCT) {
3003                         m_freem(m);
3004                         error = ENOTCONN;
3005                         goto done;
3006                 }
3007                 /*
3008                  * Re-adjust the socket receive list and re-enqueue
3009                  * the record in front of any packets which may have
3010                  * been appended while we dropped the lock.
3011                  */
3012                 for (m = m0; m->m_next != NULL; m = m->m_next) {
3013                         sballoc(&so->so_rcv, m);
3014                 }
3015                 sballoc(&so->so_rcv, m);
3016                 if (so->so_rcv.sb_mb == NULL) {
3017                         so->so_rcv.sb_lastrecord = m0;
3018                         so->so_rcv.sb_mbtail = m;
3019                 }
3020                 m = m0;
3021                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3022                 so->so_rcv.sb_mb = m;
3023                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3024                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3025         }
3026 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3027         if (psa != NULL) {
3028                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3029                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3030                         error = EWOULDBLOCK;
3031                         goto done;
3032                 }
3033         }
3034         if (flags & MSG_PEEK) {
3035                 m = m->m_next;
3036         } else {
3037                 sbfree(&so->so_rcv, m);
3038                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3039                         panic("%s: about to create invalid socketbuf",
3040                             __func__);
3041                         /* NOTREACHED */
3042                 }
3043                 MFREE(m, so->so_rcv.sb_mb);
3044                 m = so->so_rcv.sb_mb;
3045                 if (m != NULL) {
3046                         m->m_nextpkt = nextrecord;
3047                 } else {
3048                         so->so_rcv.sb_mb = nextrecord;
3049                         SB_EMPTY_FIXUP(&so->so_rcv);
3050                 }
3051         }
3052 done:
3053         *mp = m;
3054         *nextrecordp = nextrecord;
3055
3056         return error;
3057 }
3058
3059 /*
3060  * Process one or more MT_CONTROL mbufs present before any data mbufs
3061  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3062  * just copy the data; if !MSG_PEEK, we call into the protocol to
3063  * perform externalization.
3064  */
3065 static int
3066 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3067     struct mbuf **mp, struct mbuf **nextrecordp)
3068 {
3069         int error = 0;
3070         struct mbuf *cm = NULL, *cmn;
3071         struct mbuf **cme = &cm;
3072         struct sockbuf *sb_rcv = &so->so_rcv;
3073         struct mbuf **msgpcm = NULL;
3074         struct mbuf *m = *mp;
3075         struct mbuf *nextrecord = *nextrecordp;
3076         struct protosw *pr = so->so_proto;
3077
3078         /*
3079          * Externalizing the control messages would require us to
3080          * drop the socket's lock below.  Once we re-acquire the
3081          * lock, the mbuf chain might change.  In order to preserve
3082          * consistency, we unlink all control messages from the
3083          * first mbuf chain in one shot and link them separately
3084          * onto a different chain.
3085          */
3086         do {
3087                 if (flags & MSG_PEEK) {
3088                         if (controlp != NULL) {
3089                                 if (*controlp == NULL) {
3090                                         msgpcm = controlp;
3091                                 }
3092                                 *controlp = m_copy(m, 0, m->m_len);
3093
3094                                 /*
3095                                  * If we failed to allocate an mbuf,
3096                                  * release any previously allocated
3097                                  * mbufs for control data. Return
3098                                  * an error. Keep the mbufs in the
3099                                  * socket as this is using
3100                                  * MSG_PEEK flag.
3101                                  */
3102                                 if (*controlp == NULL) {
3103                                         m_freem(*msgpcm);
3104                                         error = ENOBUFS;
3105                                         goto done;
3106                                 }
3107                                 controlp = &(*controlp)->m_next;
3108                         }
3109                         m = m->m_next;
3110                 } else {
3111                         m->m_nextpkt = NULL;
3112                         sbfree(sb_rcv, m);
3113                         sb_rcv->sb_mb = m->m_next;
3114                         m->m_next = NULL;
3115                         *cme = m;
3116                         cme = &(*cme)->m_next;
3117                         m = sb_rcv->sb_mb;
3118                 }
3119         } while (m != NULL && m->m_type == MT_CONTROL);
3120
3121         if (!(flags & MSG_PEEK)) {
3122                 if (sb_rcv->sb_mb != NULL) {
3123                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
3124                 } else {
3125                         sb_rcv->sb_mb = nextrecord;
3126                         SB_EMPTY_FIXUP(sb_rcv);
3127                 }
3128                 if (nextrecord == NULL) {
3129                         sb_rcv->sb_lastrecord = m;
3130                 }
3131         }
3132
3133         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3134         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3135
3136         while (cm != NULL) {
3137                 int cmsg_type;
3138
3139                 cmn = cm->m_next;
3140                 cm->m_next = NULL;
3141                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3142
3143                 /*
3144                  * Call the protocol to externalize SCM_RIGHTS message
3145                  * and return the modified message to the caller upon
3146                  * success.  Otherwise, all other control messages are
3147                  * returned unmodified to the caller.  Note that we
3148                  * only get into this loop if MSG_PEEK is not set.
3149                  */
3150                 if (pr->pr_domain->dom_externalize != NULL &&
3151                     cmsg_type == SCM_RIGHTS) {
3152                         /*
3153                          * Release socket lock: see 3903171.  This
3154                          * would also allow more records to be appended
3155                          * to the socket buffer.  We still have SB_LOCK
3156                          * set on it, so we can be sure that the head
3157                          * of the mbuf chain won't change.
3158                          */
3159                         socket_unlock(so, 0);
3160                         error = (*pr->pr_domain->dom_externalize)(cm);
3161                         socket_lock(so, 0);
3162                 } else {
3163                         error = 0;
3164                 }
3165
3166                 if (controlp != NULL && error == 0) {
3167                         *controlp = cm;
3168                         controlp = &(*controlp)->m_next;
3169                 } else {
3170                         (void) m_free(cm);
3171                 }
3172                 cm = cmn;
3173         }
3174         /*
3175          * Update the value of nextrecord in case we received new
3176          * records when the socket was unlocked above for
3177          * externalizing SCM_RIGHTS.
3178          */
3179         if (m != NULL) {
3180                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3181         } else {
3182                 nextrecord = sb_rcv->sb_mb;
3183         }
3184
3185 done:
3186         *mp = m;
3187         *nextrecordp = nextrecord;
3188
3189         return error;
3190 }
3191
3192 /*
3193  * If we have less data than requested, block awaiting more
3194  * (subject to any timeout) if:
3195  *   1. the current count is less than the low water mark, or
3196  *   2. MSG_WAITALL is set, and it is possible to do the entire
3197  *      receive operation at once if we block (resid <= hiwat).
3198  *   3. MSG_DONTWAIT is not set
3199  * If MSG_WAITALL is set but resid is larger than the receive buffer,
3200  * we have to do the receive in sections, and thus risk returning
3201  * a short count if a timeout or signal occurs after we start.
3202  */
3203 static boolean_t
3204 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3205 {
3206         struct protosw *pr = so->so_proto;
3207
3208         /* No mbufs in the receive-queue? Wait! */
3209         if (m == NULL) {
3210                 return true;
3211         }
3212
3213         /* Not enough data in the receive socket-buffer - we may have to wait */
3214         if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3215             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3216                 /*
3217                  * Application did set the lowater-mark, so we should wait for
3218                  * this data to be present.
3219                  */
3220                 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3221                         return true;
3222                 }
3223
3224                 /*
3225                  * Application wants all the data - so let's try to do the
3226                  * receive-operation at once by waiting for everything to
3227                  * be there.
3228                  */
3229                 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3230                         return true;
3231                 }
3232         }
3233
3234         return false;
3235 }
3236
3237 /*
3238  * Implement receive operations on a socket.
3239  * We depend on the way that records are added to the sockbuf
3240  * by sbappend*.  In particular, each record (mbufs linked through m_next)
3241  * must begin with an address if the protocol so specifies,
3242  * followed by an optional mbuf or mbufs containing ancillary data,
3243  * and then zero or more mbufs of data.
3244  * In order to avoid blocking network interrupts for the entire time here,
3245  * we splx() while doing the actual copy to user space.
3246  * Although the sockbuf is locked, new data may still be appended,
3247  * and thus we must maintain consistency of the sockbuf during that time.
3248  *
3249  * The caller may receive the data as a single mbuf chain by supplying
3250  * an mbuf **mp0 for use in returning the chain.  The uio is then used
3251  * only for the count in uio_resid.
3252  *
3253  * Returns:     0                       Success
3254  *              ENOBUFS
3255  *              ENOTCONN
3256  *              EWOULDBLOCK
3257  *      uiomove:EFAULT
3258  *      sblock:EWOULDBLOCK
3259  *      sblock:EINTR
3260  *      sbwait:EBADF
3261  *      sbwait:EINTR
3262  *      sodelayed_copy:EFAULT
3263  *      <pru_rcvoob>:EINVAL[TCP]
3264  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
3265  *      <pru_rcvoob>:???
3266  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3267  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3268  *      <pr_domain->dom_externalize>:???
3269  *
3270  * Notes:       Additional return values from calls through <pru_rcvoob> and
3271  *              <pr_domain->dom_externalize> depend on protocols other than
3272  *              TCP or AF_UNIX, which are documented above.
3273  */
3274 int
3275 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3276     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3277 {
3278         struct mbuf *m, **mp, *ml = NULL;
3279         struct mbuf *nextrecord, *free_list;
3280         int flags, error, offset;
3281         user_ssize_t len;
3282         struct protosw *pr = so->so_proto;
3283         int moff, type = 0;
3284         user_ssize_t orig_resid = uio_resid(uio);
3285         user_ssize_t delayed_copy_len;
3286         int can_delay;
3287         struct proc *p = current_proc();
3288         boolean_t en_tracing = FALSE;
3289
3290         /*
3291          * Sanity check on the length passed by caller as we are making 'int'
3292          * comparisons
3293          */
3294         if (orig_resid < 0 || orig_resid > INT_MAX) {
3295                 return EINVAL;
3296         }
3297
3298         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3299             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3300             so->so_rcv.sb_hiwat);
3301
3302         socket_lock(so, 1);
3303         so_update_last_owner_locked(so, p);
3304         so_update_policy(so);
3305
3306 #ifdef MORE_LOCKING_DEBUG
3307         if (so->so_usecount == 1) {
3308                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3309                 /* NOTREACHED */
3310         }
3311 #endif
3312         mp = mp0;
3313         if (psa != NULL) {
3314                 *psa = NULL;
3315         }
3316         if (controlp != NULL) {
3317                 *controlp = NULL;
3318         }
3319         if (flagsp != NULL) {
3320                 flags = *flagsp & ~MSG_EOR;
3321         } else {
3322                 flags = 0;
3323         }
3324
3325         /*
3326          * If a recv attempt is made on a previously-accepted socket
3327          * that has been marked as inactive (disconnected), reject
3328          * the request.
3329          */
3330         if (so->so_flags & SOF_DEFUNCT) {
3331                 struct sockbuf *sb = &so->so_rcv;
3332
3333                 error = ENOTCONN;
3334                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3335                     __func__, proc_pid(p), proc_best_name(p),
3336                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3337                     SOCK_DOM(so), SOCK_TYPE(so), error);
3338                 /*
3339                  * This socket should have been disconnected and flushed
3340                  * prior to being returned from sodefunct(); there should
3341                  * be no data on its receive list, so panic otherwise.
3342                  */
3343                 if (so->so_state & SS_DEFUNCT) {
3344                         sb_empty_assert(sb, __func__);
3345                 }
3346                 socket_unlock(so, 1);
3347                 return error;
3348         }
3349
3350         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3351             pr->pr_usrreqs->pru_preconnect) {
3352                 /*
3353                  * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3354                  * calling write() right after this. *If* the app calls a read
3355                  * we do not want to block this read indefinetely. Thus,
3356                  * we trigger a connect so that the session gets initiated.
3357                  */
3358                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3359
3360                 if (error) {
3361                         socket_unlock(so, 1);
3362                         return error;
3363                 }
3364         }
3365
3366         if (ENTR_SHOULDTRACE &&
3367             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3368                 /*
3369                  * enable energy tracing for inet sockets that go over
3370                  * non-loopback interfaces only.
3371                  */
3372                 struct inpcb *inp = sotoinpcb(so);
3373                 if (inp->inp_last_outifp != NULL &&
3374                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3375                         en_tracing = TRUE;
3376                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3377                             VM_KERNEL_ADDRPERM(so),
3378                             ((so->so_state & SS_NBIO) ?
3379                             kEnTrFlagNonBlocking : 0),
3380                             (int64_t)orig_resid);
3381                 }
3382         }
3383
3384         /*
3385          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3386          * regardless of the flags argument. Here is the case were
3387          * out-of-band data is not inline.
3388          */
3389         if ((flags & MSG_OOB) ||
3390             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3391             (so->so_options & SO_OOBINLINE) == 0 &&
3392             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3393                 m = m_get(M_WAIT, MT_DATA);
3394                 if (m == NULL) {
3395                         socket_unlock(so, 1);
3396                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3397                             ENOBUFS, 0, 0, 0, 0);
3398                         return ENOBUFS;
3399                 }
3400                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3401                 if (error) {
3402                         goto bad;
3403                 }
3404                 socket_unlock(so, 0);
3405                 do {
3406                         error = uiomove(mtod(m, caddr_t),
3407                             imin(uio_resid(uio), m->m_len), uio);
3408                         m = m_free(m);
3409                 } while (uio_resid(uio) && error == 0 && m != NULL);
3410                 socket_lock(so, 0);
3411 bad:
3412                 if (m != NULL) {
3413                         m_freem(m);
3414                 }
3415
3416                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3417                         if (error == EWOULDBLOCK || error == EINVAL) {
3418                                 /*
3419                                  * Let's try to get normal data:
3420                                  * EWOULDBLOCK: out-of-band data not
3421                                  * receive yet. EINVAL: out-of-band data
3422                                  * already read.
3423                                  */
3424                                 error = 0;
3425                                 goto nooob;
3426                         } else if (error == 0 && flagsp != NULL) {
3427                                 *flagsp |= MSG_OOB;
3428                         }
3429                 }
3430                 socket_unlock(so, 1);
3431                 if (en_tracing) {
3432                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3433                             VM_KERNEL_ADDRPERM(so), 0,
3434                             (int64_t)(orig_resid - uio_resid(uio)));
3435                 }
3436                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3437                     0, 0, 0, 0);
3438
3439                 return error;
3440         }
3441 nooob:
3442         if (mp != NULL) {
3443                 *mp = NULL;
3444         }
3445
3446         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3447                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3448         }
3449
3450         free_list = NULL;
3451         delayed_copy_len = 0;
3452 restart:
3453 #ifdef MORE_LOCKING_DEBUG
3454         if (so->so_usecount <= 1) {
3455                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3456                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3457         }
3458 #endif
3459         /*
3460          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3461          * and if so just return to the caller.  This could happen when
3462          * soreceive() is called by a socket upcall function during the
3463          * time the socket is freed.  The socket buffer would have been
3464          * locked across the upcall, therefore we cannot put this thread
3465          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3466          * we may livelock), because the lock on the socket buffer will
3467          * only be released when the upcall routine returns to its caller.
3468          * Because the socket has been officially closed, there can be
3469          * no further read on it.
3470          *
3471          * A multipath subflow socket would have its SS_NOFDREF set by
3472          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3473          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3474          */
3475         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3476             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3477                 socket_unlock(so, 1);
3478                 return 0;
3479         }
3480
3481         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3482         if (error) {
3483                 socket_unlock(so, 1);
3484                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3485                     0, 0, 0, 0);
3486                 if (en_tracing) {
3487                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3488                             VM_KERNEL_ADDRPERM(so), 0,
3489                             (int64_t)(orig_resid - uio_resid(uio)));
3490                 }
3491                 return error;
3492         }
3493
3494         m = so->so_rcv.sb_mb;
3495         if (so_should_wait(so, uio, m, flags)) {
3496                 /*
3497                  * Panic if we notice inconsistencies in the socket's
3498                  * receive list; both sb_mb and sb_cc should correctly
3499                  * reflect the contents of the list, otherwise we may
3500                  * end up with false positives during select() or poll()
3501                  * which could put the application in a bad state.
3502                  */
3503                 SB_MB_CHECK(&so->so_rcv);
3504
3505                 if (so->so_error) {
3506                         if (m != NULL) {
3507                                 goto dontblock;
3508                         }
3509                         error = so->so_error;
3510                         if ((flags & MSG_PEEK) == 0) {
3511                                 so->so_error = 0;
3512                         }
3513                         goto release;
3514                 }
3515                 if (so->so_state & SS_CANTRCVMORE) {
3516 #if CONTENT_FILTER
3517                         /*
3518                          * Deal with half closed connections
3519                          */
3520                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3521                             cfil_sock_data_pending(&so->so_rcv) != 0) {
3522                                 CFIL_LOG(LOG_INFO,
3523                                     "so %llx ignore SS_CANTRCVMORE",
3524                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3525                         } else
3526 #endif /* CONTENT_FILTER */
3527                         if (m != NULL) {
3528                                 goto dontblock;
3529                         } else {
3530                                 goto release;
3531                         }
3532                 }
3533                 for (; m != NULL; m = m->m_next) {
3534                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3535                                 m = so->so_rcv.sb_mb;
3536                                 goto dontblock;
3537                         }
3538                 }
3539                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3540                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3541                         error = ENOTCONN;
3542                         goto release;
3543                 }
3544                 if (uio_resid(uio) == 0) {
3545                         goto release;
3546                 }
3547
3548                 if ((so->so_state & SS_NBIO) ||
3549                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3550                         error = EWOULDBLOCK;
3551                         goto release;
3552                 }
3553                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3554                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3555                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3556 #if EVEN_MORE_LOCKING_DEBUG
3557                 if (socket_debug) {
3558                         printf("Waiting for socket data\n");
3559                 }
3560 #endif
3561
3562                 /*
3563                  * Depending on the protocol (e.g. TCP), the following
3564                  * might cause the socket lock to be dropped and later
3565                  * be reacquired, and more data could have arrived and
3566                  * have been appended to the receive socket buffer by
3567                  * the time it returns.  Therefore, we only sleep in
3568                  * sbwait() below if and only if the wait-condition is still
3569                  * true.
3570                  */
3571                 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3572                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3573                 }
3574
3575                 error = 0;
3576                 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3577                         error = sbwait(&so->so_rcv);
3578                 }
3579
3580 #if EVEN_MORE_LOCKING_DEBUG
3581                 if (socket_debug) {
3582                         printf("SORECEIVE - sbwait returned %d\n", error);
3583                 }
3584 #endif
3585                 if (so->so_usecount < 1) {
3586                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3587                             __func__, so, so->so_usecount);
3588                         /* NOTREACHED */
3589                 }
3590                 if (error) {
3591                         socket_unlock(so, 1);
3592                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3593                             0, 0, 0, 0);
3594                         if (en_tracing) {
3595                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3596                                     VM_KERNEL_ADDRPERM(so), 0,
3597                                     (int64_t)(orig_resid - uio_resid(uio)));
3598                         }
3599                         return error;
3600                 }
3601                 goto restart;
3602         }
3603 dontblock:
3604         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3605         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3606         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3607         nextrecord = m->m_nextpkt;
3608
3609         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3610                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3611                     mp0 == NULL);
3612                 if (error == ERESTART) {
3613                         goto restart;
3614                 } else if (error != 0) {
3615                         goto release;
3616                 }
3617                 orig_resid = 0;
3618         }
3619
3620         /*
3621          * Process one or more MT_CONTROL mbufs present before any data mbufs
3622          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3623          * just copy the data; if !MSG_PEEK, we call into the protocol to
3624          * perform externalization.
3625          */
3626         if (m != NULL && m->m_type == MT_CONTROL) {
3627                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3628                 if (error != 0) {
3629                         goto release;
3630                 }
3631                 orig_resid = 0;
3632         }
3633
3634         if (m != NULL) {
3635                 if (!(flags & MSG_PEEK)) {
3636                         /*
3637                          * We get here because m points to an mbuf following
3638                          * any MT_SONAME or MT_CONTROL mbufs which have been
3639                          * processed above.  In any case, m should be pointing
3640                          * to the head of the mbuf chain, and the nextrecord
3641                          * should be either NULL or equal to m->m_nextpkt.
3642                          * See comments above about SB_LOCK.
3643                          */
3644                         if (m != so->so_rcv.sb_mb ||
3645                             m->m_nextpkt != nextrecord) {
3646                                 panic("%s: post-control !sync so=%p m=%p "
3647                                     "nextrecord=%p\n", __func__, so, m,
3648                                     nextrecord);
3649                                 /* NOTREACHED */
3650                         }
3651                         if (nextrecord == NULL) {
3652                                 so->so_rcv.sb_lastrecord = m;
3653                         }
3654                 }
3655                 type = m->m_type;
3656                 if (type == MT_OOBDATA) {
3657                         flags |= MSG_OOB;
3658                 }
3659         } else {
3660                 if (!(flags & MSG_PEEK)) {
3661                         SB_EMPTY_FIXUP(&so->so_rcv);
3662                 }
3663         }
3664         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3665         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3666
3667         moff = 0;
3668         offset = 0;
3669
3670         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3671                 can_delay = 1;
3672         } else {
3673                 can_delay = 0;
3674         }
3675
3676         while (m != NULL &&
3677             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3678                 if (m->m_type == MT_OOBDATA) {
3679                         if (type != MT_OOBDATA) {
3680                                 break;
3681                         }
3682                 } else if (type == MT_OOBDATA) {
3683                         break;
3684                 }
3685                 /*
3686                  * Make sure to allways set MSG_OOB event when getting
3687                  * out of band data inline.
3688                  */
3689                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3690                     (so->so_options & SO_OOBINLINE) != 0 &&
3691                     (so->so_state & SS_RCVATMARK) != 0) {
3692                         flags |= MSG_OOB;
3693                 }
3694                 so->so_state &= ~SS_RCVATMARK;
3695                 len = uio_resid(uio) - delayed_copy_len;
3696                 if (so->so_oobmark && len > so->so_oobmark - offset) {
3697                         len = so->so_oobmark - offset;
3698                 }
3699                 if (len > m->m_len - moff) {
3700                         len = m->m_len - moff;
3701                 }
3702                 /*
3703                  * If mp is set, just pass back the mbufs.
3704                  * Otherwise copy them out via the uio, then free.
3705                  * Sockbuf must be consistent here (points to current mbuf,
3706                  * it points to next record) when we drop priority;
3707                  * we must note any additions to the sockbuf when we
3708                  * block interrupts again.
3709                  */
3710                 if (mp == NULL) {
3711                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3712                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3713                         if (can_delay && len == m->m_len) {
3714                                 /*
3715                                  * only delay the copy if we're consuming the
3716                                  * mbuf and we're NOT in MSG_PEEK mode
3717                                  * and we have enough data to make it worthwile
3718                                  * to drop and retake the lock... can_delay
3719                                  * reflects the state of the 2 latter
3720                                  * constraints moff should always be zero
3721                                  * in these cases
3722                                  */
3723                                 delayed_copy_len += len;
3724                         } else {
3725                                 if (delayed_copy_len) {
3726                                         error = sodelayed_copy(so, uio,
3727                                             &free_list, &delayed_copy_len);
3728
3729                                         if (error) {
3730                                                 goto release;
3731                                         }
3732                                         /*
3733                                          * can only get here if MSG_PEEK is not
3734                                          * set therefore, m should point at the
3735                                          * head of the rcv queue; if it doesn't,
3736                                          * it means something drastically
3737                                          * changed while we were out from behind
3738                                          * the lock in sodelayed_copy. perhaps
3739                                          * a RST on the stream. in any event,
3740                                          * the stream has been interrupted. it's
3741                                          * probably best just to return whatever
3742                                          * data we've moved and let the caller
3743                                          * sort it out...
3744                                          */
3745                                         if (m != so->so_rcv.sb_mb) {
3746                                                 break;
3747                                         }
3748                                 }
3749                                 socket_unlock(so, 0);
3750                                 error = uiomove(mtod(m, caddr_t) + moff,
3751                                     (int)len, uio);
3752                                 socket_lock(so, 0);
3753
3754                                 if (error) {
3755                                         goto release;
3756                                 }
3757                         }
3758                 } else {
3759                         uio_setresid(uio, (uio_resid(uio) - len));
3760                 }
3761                 if (len == m->m_len - moff) {
3762                         if (m->m_flags & M_EOR) {
3763                                 flags |= MSG_EOR;
3764                         }
3765                         if (flags & MSG_PEEK) {
3766                                 m = m->m_next;
3767                                 moff = 0;
3768                         } else {
3769                                 nextrecord = m->m_nextpkt;
3770                                 sbfree(&so->so_rcv, m);
3771                                 m->m_nextpkt = NULL;
3772
3773                                 if (mp != NULL) {
3774                                         *mp = m;
3775                                         mp = &m->m_next;
3776                                         so->so_rcv.sb_mb = m = m->m_next;
3777                                         *mp = NULL;
3778                                 } else {
3779                                         if (free_list == NULL) {
3780                                                 free_list = m;
3781                                         } else {
3782                                                 ml->m_next = m;
3783                                         }
3784                                         ml = m;
3785                                         so->so_rcv.sb_mb = m = m->m_next;
3786                                         ml->m_next = NULL;
3787                                 }
3788                                 if (m != NULL) {
3789                                         m->m_nextpkt = nextrecord;
3790                                         if (nextrecord == NULL) {
3791                                                 so->so_rcv.sb_lastrecord = m;
3792                                         }
3793                                 } else {
3794                                         so->so_rcv.sb_mb = nextrecord;
3795                                         SB_EMPTY_FIXUP(&so->so_rcv);
3796                                 }
3797                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3798                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3799                         }
3800                 } else {
3801                         if (flags & MSG_PEEK) {
3802                                 moff += len;
3803                         } else {
3804                                 if (mp != NULL) {
3805                                         int copy_flag;
3806
3807                                         if (flags & MSG_DONTWAIT) {
3808                                                 copy_flag = M_DONTWAIT;
3809                                         } else {
3810                                                 copy_flag = M_WAIT;
3811                                         }
3812                                         *mp = m_copym(m, 0, len, copy_flag);
3813                                         /*
3814                                          * Failed to allocate an mbuf?
3815                                          * Adjust uio_resid back, it was
3816                                          * adjusted down by len bytes which
3817                                          * we didn't copy over.
3818                                          */
3819                                         if (*mp == NULL) {
3820                                                 uio_setresid(uio,
3821                                                     (uio_resid(uio) + len));
3822                                                 break;
3823                                         }
3824                                 }
3825                                 m->m_data += len;
3826                                 m->m_len -= len;
3827                                 so->so_rcv.sb_cc -= len;
3828                         }
3829                 }
3830                 if (so->so_oobmark) {
3831                         if ((flags & MSG_PEEK) == 0) {
3832                                 so->so_oobmark -= len;
3833                                 if (so->so_oobmark == 0) {
3834                                         so->so_state |= SS_RCVATMARK;
3835                                         break;
3836                                 }
3837                         } else {
3838                                 offset += len;
3839                                 if (offset == so->so_oobmark) {
3840                                         break;
3841                                 }
3842                         }
3843                 }
3844                 if (flags & MSG_EOR) {
3845                         break;
3846                 }
3847                 /*
3848                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3849                  * (for non-atomic socket), we must not quit until
3850                  * "uio->uio_resid == 0" or an error termination.
3851                  * If a signal/timeout occurs, return with a short
3852                  * count but without error.  Keep sockbuf locked
3853                  * against other readers.
3854                  */
3855                 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3856                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3857                     !sosendallatonce(so) && !nextrecord) {
3858                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3859 #if CONTENT_FILTER
3860                             && cfil_sock_data_pending(&so->so_rcv) == 0
3861 #endif /* CONTENT_FILTER */
3862                             )) {
3863                                 goto release;
3864                         }
3865
3866                         /*
3867                          * Depending on the protocol (e.g. TCP), the following
3868                          * might cause the socket lock to be dropped and later
3869                          * be reacquired, and more data could have arrived and
3870                          * have been appended to the receive socket buffer by
3871                          * the time it returns.  Therefore, we only sleep in
3872                          * sbwait() below if and only if the socket buffer is
3873                          * empty, in order to avoid a false sleep.
3874                          */
3875                         if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3876                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3877                         }
3878
3879                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3880                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3881
3882                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3883                                 error = 0;
3884                                 goto release;
3885                         }
3886                         /*
3887                          * have to wait until after we get back from the sbwait
3888                          * to do the copy because we will drop the lock if we
3889                          * have enough data that has been delayed... by dropping
3890                          * the lock we open up a window allowing the netisr
3891                          * thread to process the incoming packets and to change
3892                          * the state of this socket... we're issuing the sbwait
3893                          * because the socket is empty and we're expecting the
3894                          * netisr thread to wake us up when more packets arrive;
3895                          * if we allow that processing to happen and then sbwait
3896                          * we could stall forever with packets sitting in the
3897                          * socket if no further packets arrive from the remote
3898                          * side.
3899                          *
3900                          * we want to copy before we've collected all the data
3901                          * to satisfy this request to allow the copy to overlap
3902                          * the incoming packet processing on an MP system
3903                          */
3904                         if (delayed_copy_len > sorecvmincopy &&
3905                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3906                                 error = sodelayed_copy(so, uio,
3907                                     &free_list, &delayed_copy_len);
3908
3909                                 if (error) {
3910                                         goto release;
3911                                 }
3912                         }
3913                         m = so->so_rcv.sb_mb;
3914                         if (m != NULL) {
3915                                 nextrecord = m->m_nextpkt;
3916                         }
3917                         SB_MB_CHECK(&so->so_rcv);
3918                 }
3919         }
3920 #ifdef MORE_LOCKING_DEBUG
3921         if (so->so_usecount <= 1) {
3922                 panic("%s: after big while so=%p ref=%d on socket\n",
3923                     __func__, so, so->so_usecount);
3924                 /* NOTREACHED */
3925         }
3926 #endif
3927
3928         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3929                 if (so->so_options & SO_DONTTRUNC) {
3930                         flags |= MSG_RCVMORE;
3931                 } else {
3932                         flags |= MSG_TRUNC;
3933                         if ((flags & MSG_PEEK) == 0) {
3934                                 (void) sbdroprecord(&so->so_rcv);
3935                         }
3936                 }
3937         }
3938
3939         /*
3940          * pru_rcvd below (for TCP) may cause more data to be received
3941          * if the socket lock is dropped prior to sending the ACK; some
3942          * legacy OpenTransport applications don't handle this well
3943          * (if it receives less data than requested while MSG_HAVEMORE
3944          * is set), and so we set the flag now based on what we know
3945          * prior to calling pru_rcvd.
3946          */
3947         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3948                 flags |= MSG_HAVEMORE;
3949         }
3950
3951         if ((flags & MSG_PEEK) == 0) {
3952                 if (m == NULL) {
3953                         so->so_rcv.sb_mb = nextrecord;
3954                         /*
3955                          * First part is an inline SB_EMPTY_FIXUP().  Second
3956                          * part makes sure sb_lastrecord is up-to-date if
3957                          * there is still data in the socket buffer.
3958                          */
3959                         if (so->so_rcv.sb_mb == NULL) {
3960                                 so->so_rcv.sb_mbtail = NULL;
3961                                 so->so_rcv.sb_lastrecord = NULL;
3962                         } else if (nextrecord->m_nextpkt == NULL) {
3963                                 so->so_rcv.sb_lastrecord = nextrecord;
3964                         }
3965                         SB_MB_CHECK(&so->so_rcv);
3966                 }
3967                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3968                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3969                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3970                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3971                 }
3972         }
3973
3974         if (delayed_copy_len) {
3975                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3976                 if (error) {
3977                         goto release;
3978                 }
3979         }
3980         if (free_list != NULL) {
3981                 m_freem_list(free_list);
3982                 free_list = NULL;
3983         }
3984
3985         if (orig_resid == uio_resid(uio) && orig_resid &&
3986             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3987                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3988                 goto restart;
3989         }
3990
3991         if (flagsp != NULL) {
3992                 *flagsp |= flags;
3993         }
3994 release:
3995 #ifdef MORE_LOCKING_DEBUG
3996         if (so->so_usecount <= 1) {
3997                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3998                     so, so->so_usecount);
3999                 /* NOTREACHED */
4000         }
4001 #endif
4002         if (delayed_copy_len) {
4003                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4004         }
4005
4006         if (free_list != NULL) {
4007                 m_freem_list(free_list);
4008         }
4009
4010         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4011
4012         if (en_tracing) {
4013                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4014                     VM_KERNEL_ADDRPERM(so),
4015                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4016                     (int64_t)(orig_resid - uio_resid(uio)));
4017         }
4018         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4019             so->so_rcv.sb_cc, 0, error);
4020
4021         return error;
4022 }
4023
4024 /*
4025  * Returns:     0                       Success
4026  *      uiomove:EFAULT
4027  */
4028 static int
4029 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4030     user_ssize_t *resid)
4031 {
4032         int error = 0;
4033         struct mbuf *m;
4034
4035         m = *free_list;
4036
4037         socket_unlock(so, 0);
4038
4039         while (m != NULL && error == 0) {
4040                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4041                 m = m->m_next;
4042         }
4043         m_freem_list(*free_list);
4044
4045         *free_list = NULL;
4046         *resid = 0;
4047
4048         socket_lock(so, 0);
4049
4050         return error;
4051 }
4052
4053 static int
4054 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4055     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4056 {
4057 #pragma unused(so)
4058         int error = 0;
4059         struct mbuf *ml, *m;
4060         int i = 0;
4061         struct uio *auio;
4062
4063         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4064             ml = ml->m_nextpkt, i++) {
4065                 auio = msgarray[i].uio;
4066                 for (m = ml; m != NULL; m = m->m_next) {
4067                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4068                         if (error != 0) {
4069                                 goto out;
4070                         }
4071                 }
4072         }
4073 out:
4074         m_freem_list(*free_list);
4075
4076         *free_list = NULL;
4077         *resid = 0;
4078
4079         return error;
4080 }
4081
4082 int
4083 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4084     int *flagsp)
4085 {
4086         struct mbuf *m;
4087         struct mbuf *nextrecord;
4088         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4089         int error;
4090         user_ssize_t len, pktlen, delayed_copy_len = 0;
4091         struct protosw *pr = so->so_proto;
4092         user_ssize_t resid;
4093         struct proc *p = current_proc();
4094         struct uio *auio = NULL;
4095         int npkts = 0;
4096         int sblocked = 0;
4097         struct sockaddr **psa = NULL;
4098         struct mbuf **controlp = NULL;
4099         int can_delay;
4100         int flags;
4101         struct mbuf *free_others = NULL;
4102
4103         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4104             so, uiocnt,
4105             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4106
4107         /*
4108          * Sanity checks:
4109          * - Only supports don't wait flags
4110          * - Only support datagram sockets (could be extended to raw)
4111          * - Must be atomic
4112          * - Protocol must support packet chains
4113          * - The uio array is NULL (should we panic?)
4114          */
4115         if (flagsp != NULL) {
4116                 flags = *flagsp;
4117         } else {
4118                 flags = 0;
4119         }
4120         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4121             MSG_NBIO)) {
4122                 printf("%s invalid flags 0x%x\n", __func__, flags);
4123                 error = EINVAL;
4124                 goto out;
4125         }
4126         if (so->so_type != SOCK_DGRAM) {
4127                 error = EINVAL;
4128                 goto out;
4129         }
4130         if (sosendallatonce(so) == 0) {
4131                 error = EINVAL;
4132                 goto out;
4133         }
4134         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4135                 error = EPROTONOSUPPORT;
4136                 goto out;
4137         }
4138         if (msgarray == NULL) {
4139                 printf("%s uioarray is NULL\n", __func__);
4140                 error = EINVAL;
4141                 goto out;
4142         }
4143         if (uiocnt == 0) {
4144                 printf("%s uiocnt is 0\n", __func__);
4145                 error = EINVAL;
4146                 goto out;
4147         }
4148         /*
4149          * Sanity check on the length passed by caller as we are making 'int'
4150          * comparisons
4151          */
4152         resid = recv_msg_array_resid(msgarray, uiocnt);
4153         if (resid < 0 || resid > INT_MAX) {
4154                 error = EINVAL;
4155                 goto out;
4156         }
4157
4158         if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4159                 can_delay = 1;
4160         } else {
4161                 can_delay = 0;
4162         }
4163
4164         socket_lock(so, 1);
4165         so_update_last_owner_locked(so, p);
4166         so_update_policy(so);
4167
4168 #if NECP
4169         so_update_necp_policy(so, NULL, NULL);
4170 #endif /* NECP */
4171
4172         /*
4173          * If a recv attempt is made on a previously-accepted socket
4174          * that has been marked as inactive (disconnected), reject
4175          * the request.
4176          */
4177         if (so->so_flags & SOF_DEFUNCT) {
4178                 struct sockbuf *sb = &so->so_rcv;
4179
4180                 error = ENOTCONN;
4181                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4182                     __func__, proc_pid(p), proc_best_name(p),
4183                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4184                     SOCK_DOM(so), SOCK_TYPE(so), error);
4185                 /*
4186                  * This socket should have been disconnected and flushed
4187                  * prior to being returned from sodefunct(); there should
4188                  * be no data on its receive list, so panic otherwise.
4189                  */
4190                 if (so->so_state & SS_DEFUNCT) {
4191                         sb_empty_assert(sb, __func__);
4192                 }
4193                 goto release;
4194         }
4195
4196 next:
4197         /*
4198          * The uio may be empty
4199          */
4200         if (npkts >= uiocnt) {
4201                 error = 0;
4202                 goto release;
4203         }
4204 restart:
4205         /*
4206          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4207          * and if so just return to the caller.  This could happen when
4208          * soreceive() is called by a socket upcall function during the
4209          * time the socket is freed.  The socket buffer would have been
4210          * locked across the upcall, therefore we cannot put this thread
4211          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4212          * we may livelock), because the lock on the socket buffer will
4213          * only be released when the upcall routine returns to its caller.
4214          * Because the socket has been officially closed, there can be
4215          * no further read on it.
4216          */
4217         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4218             (SS_NOFDREF | SS_CANTRCVMORE)) {
4219                 error = 0;
4220                 goto release;
4221         }
4222
4223         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4224         if (error) {
4225                 goto release;
4226         }
4227         sblocked = 1;
4228
4229         m = so->so_rcv.sb_mb;
4230         /*
4231          * Block awaiting more datagram if needed
4232          */
4233         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4234             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4235             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4236                 /*
4237                  * Panic if we notice inconsistencies in the socket's
4238                  * receive list; both sb_mb and sb_cc should correctly
4239                  * reflect the contents of the list, otherwise we may
4240                  * end up with false positives during select() or poll()
4241                  * which could put the application in a bad state.
4242                  */
4243                 SB_MB_CHECK(&so->so_rcv);
4244
4245                 if (so->so_error) {
4246                         error = so->so_error;
4247                         if ((flags & MSG_PEEK) == 0) {
4248                                 so->so_error = 0;
4249                         }
4250                         goto release;
4251                 }
4252                 if (so->so_state & SS_CANTRCVMORE) {
4253                         goto release;
4254                 }
4255                 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4256                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4257                         error = ENOTCONN;
4258                         goto release;
4259                 }
4260                 if ((so->so_state & SS_NBIO) ||
4261                     (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4262                         error = EWOULDBLOCK;
4263                         goto release;
4264                 }
4265                 /*
4266                  * Do not block if we got some data
4267                  */
4268                 if (free_list != NULL) {
4269                         error = 0;
4270                         goto release;
4271                 }
4272
4273                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4274                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4275
4276                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4277                 sblocked = 0;
4278
4279                 error = sbwait(&so->so_rcv);
4280                 if (error) {
4281                         goto release;
4282                 }
4283                 goto restart;
4284         }
4285
4286         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4287         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4288         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4289
4290         /*
4291          * Consume the current uio index as we have a datagram
4292          */
4293         auio = msgarray[npkts].uio;
4294         resid = uio_resid(auio);
4295         msgarray[npkts].which |= SOCK_MSG_DATA;
4296         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4297             &msgarray[npkts].psa : NULL;
4298         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4299             &msgarray[npkts].controlp : NULL;
4300         npkts += 1;
4301         nextrecord = m->m_nextpkt;
4302
4303         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4304                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4305                 if (error == ERESTART) {
4306                         goto restart;
4307                 } else if (error != 0) {
4308                         goto release;
4309                 }
4310         }
4311
4312         if (m != NULL && m->m_type == MT_CONTROL) {
4313                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4314                 if (error != 0) {
4315                         goto release;
4316                 }
4317         }
4318
4319         if (m->m_pkthdr.len == 0) {
4320                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4321                     __func__, __LINE__,
4322                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4323                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4324                     m->m_type);
4325         }
4326
4327         /*
4328          * Loop to copy the mbufs of the current record
4329          * Support zero length packets
4330          */
4331         ml = NULL;
4332         pktlen = 0;
4333         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4334                 if (m->m_len == 0) {
4335                         panic("%p m_len zero", m);
4336                 }
4337                 if (m->m_type == 0) {
4338                         panic("%p m_type zero", m);
4339                 }
4340                 /*
4341                  * Clip to the residual length
4342                  */
4343                 if (len > m->m_len) {
4344                         len = m->m_len;
4345                 }
4346                 pktlen += len;
4347                 /*
4348                  * Copy the mbufs via the uio or delay the copy
4349                  * Sockbuf must be consistent here (points to current mbuf,
4350                  * it points to next record) when we drop priority;
4351                  * we must note any additions to the sockbuf when we
4352                  * block interrupts again.
4353                  */
4354                 if (len > 0 && can_delay == 0) {
4355                         socket_unlock(so, 0);
4356                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4357                         socket_lock(so, 0);
4358                         if (error) {
4359                                 goto release;
4360                         }
4361                 } else {
4362                         delayed_copy_len += len;
4363                 }
4364
4365                 if (len == m->m_len) {
4366                         /*
4367                          * m was entirely copied
4368                          */
4369                         sbfree(&so->so_rcv, m);
4370                         nextrecord = m->m_nextpkt;
4371                         m->m_nextpkt = NULL;
4372
4373                         /*
4374                          * Set the first packet to the head of the free list
4375                          */
4376                         if (free_list == NULL) {
4377                                 free_list = m;
4378                         }
4379                         /*
4380                          * Link current packet to tail of free list
4381                          */
4382                         if (ml == NULL) {
4383                                 if (free_tail != NULL) {
4384                                         free_tail->m_nextpkt = m;
4385                                 }
4386                                 free_tail = m;
4387                         }
4388                         /*
4389                          * Link current mbuf to last mbuf of current packet
4390                          */
4391                         if (ml != NULL) {
4392                                 ml->m_next = m;
4393                         }
4394                         ml = m;
4395
4396                         /*
4397                          * Move next buf to head of socket buffer
4398                          */
4399                         so->so_rcv.sb_mb = m = ml->m_next;
4400                         ml->m_next = NULL;
4401
4402                         if (m != NULL) {
4403                                 m->m_nextpkt = nextrecord;
4404                                 if (nextrecord == NULL) {
4405                                         so->so_rcv.sb_lastrecord = m;
4406                                 }
4407                         } else {
4408                                 so->so_rcv.sb_mb = nextrecord;
4409                                 SB_EMPTY_FIXUP(&so->so_rcv);
4410                         }
4411                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4412                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4413                 } else {
4414                         /*
4415                          * Stop the loop on partial copy
4416                          */
4417                         break;
4418                 }
4419         }
4420 #ifdef MORE_LOCKING_DEBUG
4421         if (so->so_usecount <= 1) {
4422                 panic("%s: after big while so=%llx ref=%d on socket\n",
4423                     __func__,
4424                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4425                 /* NOTREACHED */
4426         }
4427 #endif
4428         /*
4429          * Tell the caller we made a partial copy
4430          */
4431         if (m != NULL) {
4432                 if (so->so_options & SO_DONTTRUNC) {
4433                         /*
4434                          * Copyout first the freelist then the partial mbuf
4435                          */
4436                         socket_unlock(so, 0);
4437                         if (delayed_copy_len) {
4438                                 error = sodelayed_copy_list(so, msgarray,
4439                                     uiocnt, &free_list, &delayed_copy_len);
4440                         }
4441
4442                         if (error == 0) {
4443                                 error = uiomove(mtod(m, caddr_t), (int)len,
4444                                     auio);
4445                         }
4446                         socket_lock(so, 0);
4447                         if (error) {
4448                                 goto release;
4449                         }
4450
4451                         m->m_data += len;
4452                         m->m_len -= len;
4453                         so->so_rcv.sb_cc -= len;
4454                         flags |= MSG_RCVMORE;
4455                 } else {
4456                         (void) sbdroprecord(&so->so_rcv);
4457                         nextrecord = so->so_rcv.sb_mb;
4458                         m = NULL;
4459                         flags |= MSG_TRUNC;
4460                 }
4461         }
4462
4463         if (m == NULL) {
4464                 so->so_rcv.sb_mb = nextrecord;
4465                 /*
4466                  * First part is an inline SB_EMPTY_FIXUP().  Second
4467                  * part makes sure sb_lastrecord is up-to-date if
4468                  * there is still data in the socket buffer.
4469                  */
4470                 if (so->so_rcv.sb_mb == NULL) {
4471                         so->so_rcv.sb_mbtail = NULL;
4472                         so->so_rcv.sb_lastrecord = NULL;
4473                 } else if (nextrecord->m_nextpkt == NULL) {
4474                         so->so_rcv.sb_lastrecord = nextrecord;
4475                 }
4476                 SB_MB_CHECK(&so->so_rcv);
4477         }
4478         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4479         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4480
4481         /*
4482          * We can continue to the next packet as long as:
4483          * - We haven't exhausted the uio array
4484          * - There was no error
4485          * - A packet was not truncated
4486          * - We can still receive more data
4487          */
4488         if (npkts < uiocnt && error == 0 &&
4489             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4490             (so->so_state & SS_CANTRCVMORE) == 0) {
4491                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4492                 sblocked = 0;
4493
4494                 goto next;
4495         }
4496         if (flagsp != NULL) {
4497                 *flagsp |= flags;
4498         }
4499
4500 release:
4501         /*
4502          * pru_rcvd may cause more data to be received if the socket lock
4503          * is dropped so we set MSG_HAVEMORE now based on what we know.
4504          * That way the caller won't be surprised if it receives less data
4505          * than requested.
4506          */
4507         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4508                 flags |= MSG_HAVEMORE;
4509         }
4510
4511         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4512                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4513         }
4514
4515         if (sblocked) {
4516                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4517         } else {
4518                 socket_unlock(so, 1);
4519         }
4520
4521         if (delayed_copy_len) {
4522                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4523                     &free_list, &delayed_copy_len);
4524         }
4525 out:
4526         /*
4527          * Amortize the cost of freeing the mbufs
4528          */
4529         if (free_list != NULL) {
4530                 m_freem_list(free_list);
4531         }
4532         if (free_others != NULL) {
4533                 m_freem_list(free_others);
4534         }
4535
4536         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4537             0, 0, 0, 0);
4538         return error;
4539 }
4540
4541 static int
4542 so_statistics_event_to_nstat_event(int64_t *input_options,
4543     uint64_t *nstat_event)
4544 {
4545         int error = 0;
4546         switch (*input_options) {
4547         case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4548                 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4549                 break;
4550         case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4551                 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4552                 break;
4553 #if (DEBUG || DEVELOPMENT)
4554         case SO_STATISTICS_EVENT_RESERVED_1:
4555                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4556                 break;
4557         case SO_STATISTICS_EVENT_RESERVED_2:
4558                 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4559                 break;
4560 #endif /* (DEBUG || DEVELOPMENT) */
4561         default:
4562                 error = EINVAL;
4563                 break;
4564         }
4565         return error;
4566 }
4567
4568 /*
4569  * Returns:     0                       Success
4570  *              EINVAL
4571  *              ENOTCONN
4572  *      <pru_shutdown>:EINVAL
4573  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4574  *      <pru_shutdown>:ENOBUFS[TCP]
4575  *      <pru_shutdown>:EMSGSIZE[TCP]
4576  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4577  *      <pru_shutdown>:ENETUNREACH[TCP]
4578  *      <pru_shutdown>:ENETDOWN[TCP]
4579  *      <pru_shutdown>:ENOMEM[TCP]
4580  *      <pru_shutdown>:EACCES[TCP]
4581  *      <pru_shutdown>:EMSGSIZE[TCP]
4582  *      <pru_shutdown>:ENOBUFS[TCP]
4583  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4584  *      <pru_shutdown>:???              [other protocol families]
4585  */
4586 int
4587 soshutdown(struct socket *so, int how)
4588 {
4589         int error;
4590
4591         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4592
4593         switch (how) {
4594         case SHUT_RD:
4595         case SHUT_WR:
4596         case SHUT_RDWR:
4597                 socket_lock(so, 1);
4598                 if ((so->so_state &
4599                     (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4600                         error = ENOTCONN;
4601                 } else {
4602                         error = soshutdownlock(so, how);
4603                 }
4604                 socket_unlock(so, 1);
4605                 break;
4606         default:
4607                 error = EINVAL;
4608                 break;
4609         }
4610
4611         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4612
4613         return error;
4614 }
4615
4616 int
4617 soshutdownlock_final(struct socket *so, int how)
4618 {
4619         struct protosw *pr = so->so_proto;
4620         int error = 0;
4621
4622         sflt_notify(so, sock_evt_shutdown, &how);
4623
4624         if (how != SHUT_WR) {
4625                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4626                         /* read already shut down */
4627                         error = ENOTCONN;
4628                         goto done;
4629                 }
4630                 sorflush(so);
4631         }
4632         if (how != SHUT_RD) {
4633                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4634                         /* write already shut down */
4635                         error = ENOTCONN;
4636                         goto done;
4637                 }
4638                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4639         }
4640 done:
4641         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4642         return error;
4643 }
4644
/*
 * soshutdownlock
 *   Shut down a socket, giving an attached content filter (when
 *   CONTENT_FILTER is configured) the first chance to handle the request.
 *
 * Returns:     0                       Success, or the content filter
 *                                      took over (EJUSTRETURN)
 *      cfil_sock_shutdown:???          Any other non-zero filter result
 *      soshutdownlock_final:???
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * A content filter may delay the actual shutdown until it
         * has processed the pending data; it may also adjust "how".
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_error = cfil_sock_shutdown(so, &how);

                if (cfil_error == EJUSTRETURN) {
                        /* Filter owns the shutdown; report success */
                        return 0;
                }
                if (cfil_error != 0) {
                        return cfil_error;
                }
        }
#endif /* CONTENT_FILTER */

        return soshutdownlock_final(so, how);
}
4671
4672 void
4673 sowflush(struct socket *so)
4674 {
4675         struct sockbuf *sb = &so->so_snd;
4676
4677         /*
4678          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4679          * to prevent the socket buffer from being unexpectedly altered
4680          * while it is used by another thread in socket send/receive.
4681          *
4682          * sblock() must not fail here, hence the assertion.
4683          */
4684         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4685         VERIFY(sb->sb_flags & SB_LOCK);
4686
4687         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4688         sb->sb_flags            |= SB_DROP;
4689         sb->sb_upcall           = NULL;
4690         sb->sb_upcallarg        = NULL;
4691
4692         sbunlock(sb, TRUE);     /* keep socket locked */
4693
4694         selthreadclear(&sb->sb_sel);
4695         sbrelease(sb);
4696 }
4697
4698 void
4699 sorflush(struct socket *so)
4700 {
4701         struct sockbuf *sb = &so->so_rcv;
4702         struct protosw *pr = so->so_proto;
4703         struct sockbuf asb;
4704 #ifdef notyet
4705         lck_mtx_t *mutex_held;
4706         /*
4707          * XXX: This code is currently commented out, because we may get here
4708          * as part of sofreelastref(), and at that time, pr_getlock() may no
4709          * longer be able to return us the lock; this will be fixed in future.
4710          */
4711         if (so->so_proto->pr_getlock != NULL) {
4712                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4713         } else {
4714                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4715         }
4716
4717         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4718 #endif /* notyet */
4719
4720         sflt_notify(so, sock_evt_flush_read, NULL);
4721
4722         socantrcvmore(so);
4723
4724         /*
4725          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4726          * to prevent the socket buffer from being unexpectedly altered
4727          * while it is used by another thread in socket send/receive.
4728          *
4729          * sblock() must not fail here, hence the assertion.
4730          */
4731         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4732         VERIFY(sb->sb_flags & SB_LOCK);
4733
4734         /*
4735          * Copy only the relevant fields from "sb" to "asb" which we
4736          * need for sbrelease() to function.  In particular, skip
4737          * sb_sel as it contains the wait queue linkage, which would
4738          * wreak havoc if we were to issue selthreadclear() on "asb".
4739          * Make sure to not carry over SB_LOCK in "asb", as we need
4740          * to acquire it later as part of sbrelease().
4741          */
4742         bzero(&asb, sizeof(asb));
4743         asb.sb_cc               = sb->sb_cc;
4744         asb.sb_hiwat            = sb->sb_hiwat;
4745         asb.sb_mbcnt            = sb->sb_mbcnt;
4746         asb.sb_mbmax            = sb->sb_mbmax;
4747         asb.sb_ctl              = sb->sb_ctl;
4748         asb.sb_lowat            = sb->sb_lowat;
4749         asb.sb_mb               = sb->sb_mb;
4750         asb.sb_mbtail           = sb->sb_mbtail;
4751         asb.sb_lastrecord       = sb->sb_lastrecord;
4752         asb.sb_so               = sb->sb_so;
4753         asb.sb_flags            = sb->sb_flags;
4754         asb.sb_flags            &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4755         asb.sb_flags            |= SB_DROP;
4756
4757         /*
4758          * Ideally we'd bzero() these and preserve the ones we need;
4759          * but to do that we'd need to shuffle things around in the
4760          * sockbuf, and we can't do it now because there are KEXTS
4761          * that are directly referring to the socket structure.
4762          *
4763          * Setting SB_DROP acts as a barrier to prevent further appends.
4764          * Clearing SB_SEL is done for selthreadclear() below.
4765          */
4766         sb->sb_cc               = 0;
4767         sb->sb_hiwat            = 0;
4768         sb->sb_mbcnt            = 0;
4769         sb->sb_mbmax            = 0;
4770         sb->sb_ctl              = 0;
4771         sb->sb_lowat            = 0;
4772         sb->sb_mb               = NULL;
4773         sb->sb_mbtail           = NULL;
4774         sb->sb_lastrecord       = NULL;
4775         sb->sb_timeo.tv_sec     = 0;
4776         sb->sb_timeo.tv_usec    = 0;
4777         sb->sb_upcall           = NULL;
4778         sb->sb_upcallarg        = NULL;
4779         sb->sb_flags            &= ~(SB_SEL | SB_UPCALL);
4780         sb->sb_flags            |= SB_DROP;
4781
4782         sbunlock(sb, TRUE);     /* keep socket locked */
4783
4784         /*
4785          * Note that selthreadclear() is called on the original "sb" and
4786          * not the local "asb" because of the way wait queue linkage is
4787          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4788          * should no longer be set (cleared above.)
4789          */
4790         selthreadclear(&sb->sb_sel);
4791
4792         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4793                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4794         }
4795
4796         sbrelease(&asb);
4797 }
4798
4799 /*
4800  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4801  * an additional variant to handle the case where the option value needs
4802  * to be some kind of integer, but not a specific size.
4803  * In addition to their use here, these functions are also called by the
4804  * protocol-level pr_ctloutput() routines.
4805  *
4806  * Returns:     0                       Success
4807  *              EINVAL
4808  *      copyin:EFAULT
4809  */
4810 int
4811 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4812 {
4813         size_t  valsize;
4814
4815         /*
4816          * If the user gives us more than we wanted, we ignore it,
4817          * but if we don't get the minimum length the caller
4818          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4819          * is set to however much we actually retrieved.
4820          */
4821         if ((valsize = sopt->sopt_valsize) < minlen) {
4822                 return EINVAL;
4823         }
4824         if (valsize > len) {
4825                 sopt->sopt_valsize = valsize = len;
4826         }
4827
4828         if (sopt->sopt_p != kernproc) {
4829                 return copyin(sopt->sopt_val, buf, valsize);
4830         }
4831
4832         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4833         return 0;
4834 }
4835
4836 /*
4837  * sooptcopyin_timeval
4838  *   Copy in a timeval value into tv_p, and take into account whether the
4839  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4840  *   code here so that we can verify the 64-bit tv_sec value before we lose
4841  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4842  */
4843 static int
4844 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4845 {
4846         int                     error;
4847
4848         if (proc_is64bit(sopt->sopt_p)) {
4849                 struct user64_timeval   tv64;
4850
4851                 if (sopt->sopt_valsize < sizeof(tv64)) {
4852                         return EINVAL;
4853                 }
4854
4855                 sopt->sopt_valsize = sizeof(tv64);
4856                 if (sopt->sopt_p != kernproc) {
4857                         error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4858                         if (error != 0) {
4859                                 return error;
4860                         }
4861                 } else {
4862                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4863                             sizeof(tv64));
4864                 }
4865                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4866                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4867                         return EDOM;
4868                 }
4869
4870                 tv_p->tv_sec = tv64.tv_sec;
4871                 tv_p->tv_usec = tv64.tv_usec;
4872         } else {
4873                 struct user32_timeval   tv32;
4874
4875                 if (sopt->sopt_valsize < sizeof(tv32)) {
4876                         return EINVAL;
4877                 }
4878
4879                 sopt->sopt_valsize = sizeof(tv32);
4880                 if (sopt->sopt_p != kernproc) {
4881                         error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4882                         if (error != 0) {
4883                                 return error;
4884                         }
4885                 } else {
4886                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4887                             sizeof(tv32));
4888                 }
4889 #ifndef __LP64__
4890                 /*
4891                  * K64todo "comparison is always false due to
4892                  * limited range of data type"
4893                  */
4894                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4895                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4896                         return EDOM;
4897                 }
4898 #endif
4899                 tv_p->tv_sec = tv32.tv_sec;
4900                 tv_p->tv_usec = tv32.tv_usec;
4901         }
4902         return 0;
4903 }
4904
4905 int
4906 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4907     boolean_t ignore_delegate)
4908 {
4909         kauth_cred_t cred =  NULL;
4910         proc_t ep = PROC_NULL;
4911         uid_t uid;
4912         int error = 0;
4913
4914         if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4915                 ep = proc_find(so->e_pid);
4916                 if (ep) {
4917                         cred = kauth_cred_proc_ref(ep);
4918                 }
4919         }
4920
4921         uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4922
4923         /* uid is 0 for root */
4924         if (uid != 0 || !allow_root) {
4925                 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4926         }
4927         if (cred) {
4928                 kauth_cred_unref(&cred);
4929         }
4930         if (ep != PROC_NULL) {
4931                 proc_rele(ep);
4932         }
4933
4934         return error;
4935 }
4936
4937 /*
4938  * Returns:     0                       Success
4939  *              EINVAL
4940  *              ENOPROTOOPT
4941  *              ENOBUFS
4942  *              EDOM
4943  *      sooptcopyin:EINVAL
4944  *      sooptcopyin:EFAULT
4945  *      sooptcopyin_timeval:EINVAL
4946  *      sooptcopyin_timeval:EFAULT
4947  *      sooptcopyin_timeval:EDOM
4948  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4949  *      <pr_ctloutput>:???
4950  *      sflt_attach_private:???         [whatever a filter author chooses]
4951  *      <sf_setoption>:???              [whatever a filter author chooses]
4952  *
4953  * Notes:       Other <pr_ctloutput> returns depend on the protocol family; all
4954  *              <sf_setoption> returns depend on what the filter author causes
4955  *              their filter to return.
4956  */
4957 int
4958 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4959 {
4960         int     error, optval;
4961         int64_t long_optval;
4962         struct  linger l;
4963         struct  timeval tv;
4964
4965         if (sopt->sopt_dir != SOPT_SET) {
4966                 sopt->sopt_dir = SOPT_SET;
4967         }
4968
4969         if (dolock) {
4970                 socket_lock(so, 1);
4971         }
4972
4973         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4974             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4975             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4976                 /* the socket has been shutdown, no more sockopt's */
4977                 error = EINVAL;
4978                 goto out;
4979         }
4980
4981         error = sflt_setsockopt(so, sopt);
4982         if (error != 0) {
4983                 if (error == EJUSTRETURN) {
4984                         error = 0;
4985                 }
4986                 goto out;
4987         }
4988
4989         if (sopt->sopt_level != SOL_SOCKET) {
4990                 if (so->so_proto != NULL &&
4991                     so->so_proto->pr_ctloutput != NULL) {
4992                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4993                         goto out;
4994                 }
4995                 error = ENOPROTOOPT;
4996         } else {
4997                 /*
4998                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4999                  * the protocol layer, if needed.  A zero value returned from
5000                  * the handler means use default socket-level processing as
5001                  * done by the rest of this routine.  Otherwise, any other
5002                  * return value indicates that the option is unsupported.
5003                  */
5004                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5005                     pru_socheckopt(so, sopt)) != 0) {
5006                         goto out;
5007                 }
5008
5009                 error = 0;
5010                 switch (sopt->sopt_name) {
5011                 case SO_LINGER:
5012                 case SO_LINGER_SEC:
5013                         error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5014                         if (error != 0) {
5015                                 goto out;
5016                         }
5017
5018                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5019                             l.l_linger : l.l_linger * hz;
5020                         if (l.l_onoff != 0) {
5021                                 so->so_options |= SO_LINGER;
5022                         } else {
5023                                 so->so_options &= ~SO_LINGER;
5024                         }
5025                         break;
5026
5027                 case SO_DEBUG:
5028                 case SO_KEEPALIVE:
5029                 case SO_DONTROUTE:
5030                 case SO_USELOOPBACK:
5031                 case SO_BROADCAST:
5032                 case SO_REUSEADDR:
5033                 case SO_REUSEPORT:
5034                 case SO_OOBINLINE:
5035                 case SO_TIMESTAMP:
5036                 case SO_TIMESTAMP_MONOTONIC:
5037                 case SO_TIMESTAMP_CONTINUOUS:
5038                 case SO_DONTTRUNC:
5039                 case SO_WANTMORE:
5040                 case SO_WANTOOBFLAG:
5041                 case SO_NOWAKEFROMSLEEP:
5042                 case SO_NOAPNFALLBK:
5043                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5044                             sizeof(optval));
5045                         if (error != 0) {
5046                                 goto out;
5047                         }
5048                         if (optval) {
5049                                 so->so_options |= sopt->sopt_name;
5050                         } else {
5051                                 so->so_options &= ~sopt->sopt_name;
5052                         }
5053                         break;
5054
5055                 case SO_SNDBUF:
5056                 case SO_RCVBUF:
5057                 case SO_SNDLOWAT:
5058                 case SO_RCVLOWAT:
5059                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5060                             sizeof(optval));
5061                         if (error != 0) {
5062                                 goto out;
5063                         }
5064
5065                         /*
5066                          * Values < 1 make no sense for any of these
5067                          * options, so disallow them.
5068                          */
5069                         if (optval < 1) {
5070                                 error = EINVAL;
5071                                 goto out;
5072                         }
5073
5074                         switch (sopt->sopt_name) {
5075                         case SO_SNDBUF:
5076                         case SO_RCVBUF: {
5077                                 struct sockbuf *sb =
5078                                     (sopt->sopt_name == SO_SNDBUF) ?
5079                                     &so->so_snd : &so->so_rcv;
5080                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5081                                         error = ENOBUFS;
5082                                         goto out;
5083                                 }
5084                                 sb->sb_flags |= SB_USRSIZE;
5085                                 sb->sb_flags &= ~SB_AUTOSIZE;
5086                                 sb->sb_idealsize = (u_int32_t)optval;
5087                                 break;
5088                         }
5089                         /*
5090                          * Make sure the low-water is never greater than
5091                          * the high-water.
5092                          */
5093                         case SO_SNDLOWAT: {
5094                                 int space = sbspace(&so->so_snd);
5095                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
5096
5097                                 if (so->so_snd.sb_flags & SB_UNIX) {
5098                                         struct unpcb *unp =
5099                                             (struct unpcb *)(so->so_pcb);
5100                                         if (unp != NULL &&
5101                                             unp->unp_conn != NULL) {
5102                                                 hiwat += unp->unp_conn->unp_cc;
5103                                         }
5104                                 }
5105
5106                                 so->so_snd.sb_lowat =
5107                                     (optval > hiwat) ?
5108                                     hiwat : optval;
5109
5110                                 if (space >= so->so_snd.sb_lowat) {
5111                                         sowwakeup(so);
5112                                 }
5113                                 break;
5114                         }
5115                         case SO_RCVLOWAT: {
5116                                 int64_t data_len;
5117                                 so->so_rcv.sb_lowat =
5118                                     (optval > so->so_rcv.sb_hiwat) ?
5119                                     so->so_rcv.sb_hiwat : optval;
5120                                 data_len = so->so_rcv.sb_cc
5121                                     - so->so_rcv.sb_ctl;
5122                                 if (data_len >= so->so_rcv.sb_lowat) {
5123                                         sorwakeup(so);
5124                                 }
5125                                 break;
5126                         }
5127                         }
5128                         break;
5129
5130                 case SO_SNDTIMEO:
5131                 case SO_RCVTIMEO:
5132                         error = sooptcopyin_timeval(sopt, &tv);
5133                         if (error != 0) {
5134                                 goto out;
5135                         }
5136
5137                         switch (sopt->sopt_name) {
5138                         case SO_SNDTIMEO:
5139                                 so->so_snd.sb_timeo = tv;
5140                                 break;
5141                         case SO_RCVTIMEO:
5142                                 so->so_rcv.sb_timeo = tv;
5143                                 break;
5144                         }
5145                         break;
5146
5147                 case SO_NKE: {
5148                         struct so_nke nke;
5149
5150                         error = sooptcopyin(sopt, &nke, sizeof(nke),
5151                             sizeof(nke));
5152                         if (error != 0) {
5153                                 goto out;
5154                         }
5155
5156                         error = sflt_attach_internal(so, nke.nke_handle);
5157                         break;
5158                 }
5159
5160                 case SO_NOSIGPIPE:
5161                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5162                             sizeof(optval));
5163                         if (error != 0) {
5164                                 goto out;
5165                         }
5166                         if (optval != 0) {
5167                                 so->so_flags |= SOF_NOSIGPIPE;
5168                         } else {
5169                                 so->so_flags &= ~SOF_NOSIGPIPE;
5170                         }
5171                         break;
5172
5173                 case SO_NOADDRERR:
5174                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5175                             sizeof(optval));
5176                         if (error != 0) {
5177                                 goto out;
5178                         }
5179                         if (optval != 0) {
5180                                 so->so_flags |= SOF_NOADDRAVAIL;
5181                         } else {
5182                                 so->so_flags &= ~SOF_NOADDRAVAIL;
5183                         }
5184                         break;
5185
5186                 case SO_REUSESHAREUID:
5187                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5188                             sizeof(optval));
5189                         if (error != 0) {
5190                                 goto out;
5191                         }
5192                         if (optval != 0) {
5193                                 so->so_flags |= SOF_REUSESHAREUID;
5194                         } else {
5195                                 so->so_flags &= ~SOF_REUSESHAREUID;
5196                         }
5197                         break;
5198
5199                 case SO_NOTIFYCONFLICT:
5200                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5201                                 error = EPERM;
5202                                 goto out;
5203                         }
5204                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5205                             sizeof(optval));
5206                         if (error != 0) {
5207                                 goto out;
5208                         }
5209                         if (optval != 0) {
5210                                 so->so_flags |= SOF_NOTIFYCONFLICT;
5211                         } else {
5212                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5213                         }
5214                         break;
5215
5216                 case SO_RESTRICTIONS:
5217                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5218                             sizeof(optval));
5219                         if (error != 0) {
5220                                 goto out;
5221                         }
5222
5223                         error = so_set_restrictions(so, optval);
5224                         break;
5225
5226                 case SO_AWDL_UNRESTRICTED:
5227                         if (SOCK_DOM(so) != PF_INET &&
5228                             SOCK_DOM(so) != PF_INET6) {
5229                                 error = EOPNOTSUPP;
5230                                 goto out;
5231                         }
5232                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5233                             sizeof(optval));
5234                         if (error != 0) {
5235                                 goto out;
5236                         }
5237                         if (optval != 0) {
5238                                 error = soopt_cred_check(so,
5239                                     PRIV_NET_RESTRICTED_AWDL, false, false);
5240                                 if (error == 0) {
5241                                         inp_set_awdl_unrestricted(
5242                                                 sotoinpcb(so));
5243                                 }
5244                         } else {
5245                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
5246                         }
5247                         break;
5248                 case SO_INTCOPROC_ALLOW:
5249                         if (SOCK_DOM(so) != PF_INET6) {
5250                                 error = EOPNOTSUPP;
5251                                 goto out;
5252                         }
5253                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5254                             sizeof(optval));
5255                         if (error != 0) {
5256                                 goto out;
5257                         }
5258                         if (optval != 0 &&
5259                             inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5260                                 error = soopt_cred_check(so,
5261                                     PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5262                                 if (error == 0) {
5263                                         inp_set_intcoproc_allowed(
5264                                                 sotoinpcb(so));
5265                                 }
5266                         } else if (optval == 0) {
5267                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
5268                         }
5269                         break;
5270
5271                 case SO_LABEL:
5272                         error = EOPNOTSUPP;
5273                         break;
5274
5275                 case SO_UPCALLCLOSEWAIT:
5276                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5277                             sizeof(optval));
5278                         if (error != 0) {
5279                                 goto out;
5280                         }
5281                         if (optval != 0) {
5282                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5283                         } else {
5284                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5285                         }
5286                         break;
5287
5288                 case SO_RANDOMPORT:
5289                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5290                             sizeof(optval));
5291                         if (error != 0) {
5292                                 goto out;
5293                         }
5294                         if (optval != 0) {
5295                                 so->so_flags |= SOF_BINDRANDOMPORT;
5296                         } else {
5297                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
5298                         }
5299                         break;
5300
5301                 case SO_NP_EXTENSIONS: {
5302                         struct so_np_extensions sonpx;
5303
5304                         error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5305                             sizeof(sonpx));
5306                         if (error != 0) {
5307                                 goto out;
5308                         }
5309                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5310                                 error = EINVAL;
5311                                 goto out;
5312                         }
5313                         /*
5314                          * Only one bit defined for now
5315                          */
5316                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5317                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5318                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
5319                                 } else {
5320                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5321                                 }
5322                         }
5323                         break;
5324                 }
5325
5326                 case SO_TRAFFIC_CLASS: {
5327                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5328                             sizeof(optval));
5329                         if (error != 0) {
5330                                 goto out;
5331                         }
5332                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5333                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5334                                 error = so_set_net_service_type(so, netsvc);
5335                                 goto out;
5336                         }
5337                         error = so_set_traffic_class(so, optval);
5338                         if (error != 0) {
5339                                 goto out;
5340                         }
5341                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5342                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5343                         break;
5344                 }
5345
5346                 case SO_RECV_TRAFFIC_CLASS: {
5347                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5348                             sizeof(optval));
5349                         if (error != 0) {
5350                                 goto out;
5351                         }
5352                         if (optval == 0) {
5353                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5354                         } else {
5355                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5356                         }
5357                         break;
5358                 }
5359
5360 #if (DEVELOPMENT || DEBUG)
5361                 case SO_TRAFFIC_CLASS_DBG: {
5362                         struct so_tcdbg so_tcdbg;
5363
5364                         error = sooptcopyin(sopt, &so_tcdbg,
5365                             sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5366                         if (error != 0) {
5367                                 goto out;
5368                         }
5369                         error = so_set_tcdbg(so, &so_tcdbg);
5370                         if (error != 0) {
5371                                 goto out;
5372                         }
5373                         break;
5374                 }
5375 #endif /* (DEVELOPMENT || DEBUG) */
5376
5377                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5378                         error = priv_check_cred(kauth_cred_get(),
5379                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5380                         if (error != 0) {
5381                                 goto out;
5382                         }
5383                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5384                             sizeof(optval));
5385                         if (error != 0) {
5386                                 goto out;
5387                         }
5388                         if (optval == 0) {
5389                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5390                         } else {
5391                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5392                         }
5393                         break;
5394
5395 #if (DEVELOPMENT || DEBUG)
5396                 case SO_DEFUNCTIT:
5397                         error = sosetdefunct(current_proc(), so, 0, FALSE);
5398                         if (error == 0) {
5399                                 error = sodefunct(current_proc(), so, 0);
5400                         }
5401
5402                         break;
5403 #endif /* (DEVELOPMENT || DEBUG) */
5404
5405                 case SO_DEFUNCTOK:
5406                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5407                             sizeof(optval));
5408                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5409                                 if (error == 0) {
5410                                         error = EBADF;
5411                                 }
5412                                 goto out;
5413                         }
5414                         /*
5415                          * Any process can set SO_DEFUNCTOK (clear
5416                          * SOF_NODEFUNCT), but only root can clear
5417                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5418                          */
5419                         if (optval == 0 &&
5420                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5421                                 error = EPERM;
5422                                 goto out;
5423                         }
5424                         if (optval) {
5425                                 so->so_flags &= ~SOF_NODEFUNCT;
5426                         } else {
5427                                 so->so_flags |= SOF_NODEFUNCT;
5428                         }
5429
5430                         if (SOCK_DOM(so) == PF_INET ||
5431                             SOCK_DOM(so) == PF_INET6) {
5432                                 char s[MAX_IPv6_STR_LEN];
5433                                 char d[MAX_IPv6_STR_LEN];
5434                                 struct inpcb *inp = sotoinpcb(so);
5435
5436                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5437                                     "[%s %s:%d -> %s:%d] is now marked "
5438                                     "as %seligible for "
5439                                     "defunct\n", __func__, proc_selfpid(),
5440                                     proc_best_name(current_proc()),
5441                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5442                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5443                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5444                                     ((SOCK_DOM(so) == PF_INET) ?
5445                                     (void *)&inp->inp_laddr.s_addr :
5446                                     (void *)&inp->in6p_laddr), s, sizeof(s)),
5447                                     ntohs(inp->in6p_lport),
5448                                     inet_ntop(SOCK_DOM(so),
5449                                     (SOCK_DOM(so) == PF_INET) ?
5450                                     (void *)&inp->inp_faddr.s_addr :
5451                                     (void *)&inp->in6p_faddr, d, sizeof(d)),
5452                                     ntohs(inp->in6p_fport),
5453                                     (so->so_flags & SOF_NODEFUNCT) ?
5454                                     "not " : "");
5455                         } else {
5456                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5457                                     "is now marked as %seligible for "
5458                                     "defunct\n",
5459                                     __func__, proc_selfpid(),
5460                                     proc_best_name(current_proc()),
5461                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5462                                     SOCK_DOM(so), SOCK_TYPE(so),
5463                                     (so->so_flags & SOF_NODEFUNCT) ?
5464                                     "not " : "");
5465                         }
5466                         break;
5467
5468                 case SO_ISDEFUNCT:
5469                         /* This option is not settable */
5470                         error = EINVAL;
5471                         break;
5472
5473                 case SO_OPPORTUNISTIC:
5474                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5475                             sizeof(optval));
5476                         if (error == 0) {
5477                                 error = so_set_opportunistic(so, optval);
5478                         }
5479                         break;
5480
5481                 case SO_FLUSH:
5482                         /* This option is handled by lower layer(s) */
5483                         error = 0;
5484                         break;
5485
5486                 case SO_RECV_ANYIF:
5487                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5488                             sizeof(optval));
5489                         if (error == 0) {
5490                                 error = so_set_recv_anyif(so, optval);
5491                         }
5492                         break;
5493
5494                 case SO_TRAFFIC_MGT_BACKGROUND: {
5495                         /* This option is handled by lower layer(s) */
5496                         error = 0;
5497                         break;
5498                 }
5499
5500 #if FLOW_DIVERT
5501                 case SO_FLOW_DIVERT_TOKEN:
5502                         error = flow_divert_token_set(so, sopt);
5503                         break;
5504 #endif  /* FLOW_DIVERT */
5505
5506
5507                 case SO_DELEGATED:
5508                         if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5509                             sizeof(optval))) != 0) {
5510                                 break;
5511                         }
5512
5513                         error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5514                         break;
5515
5516                 case SO_DELEGATED_UUID: {
5517                         uuid_t euuid;
5518
5519                         if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5520                             sizeof(euuid))) != 0) {
5521                                 break;
5522                         }
5523
5524                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5525                         break;
5526                 }
5527
5528 #if NECP
5529                 case SO_NECP_ATTRIBUTES:
5530                         error = necp_set_socket_attributes(so, sopt);
5531                         break;
5532
5533                 case SO_NECP_CLIENTUUID: {
5534                         if (SOCK_DOM(so) == PF_MULTIPATH) {
5535                                 /* Handled by MPTCP itself */
5536                                 break;
5537                         }
5538
5539                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5540                                 error = EINVAL;
5541                                 goto out;
5542                         }
5543
5544                         struct inpcb *inp = sotoinpcb(so);
5545                         if (!uuid_is_null(inp->necp_client_uuid)) {
5546                                 // Clear out the old client UUID if present
5547                                 necp_inpcb_remove_cb(inp);
5548                         }
5549
5550                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5551                             sizeof(uuid_t), sizeof(uuid_t));
5552                         if (error != 0) {
5553                                 goto out;
5554                         }
5555
5556                         if (uuid_is_null(inp->necp_client_uuid)) {
5557                                 error = EINVAL;
5558                                 goto out;
5559                         }
5560
5561                         pid_t current_pid = proc_pid(current_proc());
5562                         error = necp_client_register_socket_flow(current_pid,
5563                             inp->necp_client_uuid, inp);
5564                         if (error != 0) {
5565                                 uuid_clear(inp->necp_client_uuid);
5566                                 goto out;
5567                         }
5568
5569                         if (inp->inp_lport != 0) {
5570                                 // There is a bound local port, so this is not
5571                                 // a fresh socket. Assign to the client.
5572                                 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5573                         }
5574
5575                         break;
5576                 }
5577                 case SO_NECP_LISTENUUID: {
5578                         if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5579                                 error = EINVAL;
5580                                 goto out;
5581                         }
5582
5583                         struct inpcb *inp = sotoinpcb(so);
5584                         if (!uuid_is_null(inp->necp_client_uuid)) {
5585                                 error = EINVAL;
5586                                 goto out;
5587                         }
5588
5589                         error = sooptcopyin(sopt, &inp->necp_client_uuid,
5590                             sizeof(uuid_t), sizeof(uuid_t));
5591                         if (error != 0) {
5592                                 goto out;
5593                         }
5594
5595                         if (uuid_is_null(inp->necp_client_uuid)) {
5596                                 error = EINVAL;
5597                                 goto out;
5598                         }
5599
5600                         error = necp_client_register_socket_listener(proc_pid(current_proc()),
5601                             inp->necp_client_uuid, inp);
5602                         if (error != 0) {
5603                                 uuid_clear(inp->necp_client_uuid);
5604                                 goto out;
5605                         }
5606
5607                         // Mark that the port registration is held by NECP
5608                         inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5609
5610                         break;
5611                 }
5612 #endif /* NECP */
5613
5614                 case SO_EXTENDED_BK_IDLE:
5615                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5616                             sizeof(optval));
5617                         if (error == 0) {
5618                                 error = so_set_extended_bk_idle(so, optval);
5619                         }
5620                         break;
5621
5622                 case SO_MARK_CELLFALLBACK:
5623                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5624                             sizeof(optval));
5625                         if (error != 0) {
5626                                 goto out;
5627                         }
5628                         if (optval < 0) {
5629                                 error = EINVAL;
5630                                 goto out;
5631                         }
5632                         if (optval == 0) {
5633                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5634                         } else {
5635                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5636                         }
5637                         break;
5638
5639                 case SO_STATISTICS_EVENT:
5640                         error = sooptcopyin(sopt, &long_optval,
5641                             sizeof(long_optval), sizeof(long_optval));
5642                         if (error != 0) {
5643                                 goto out;
5644                         }
5645                         u_int64_t nstat_event = 0;
5646                         error = so_statistics_event_to_nstat_event(
5647                                 &long_optval, &nstat_event);
5648                         if (error != 0) {
5649                                 goto out;
5650                         }
5651                         nstat_pcb_event(sotoinpcb(so), nstat_event);
5652                         break;
5653
5654                 case SO_NET_SERVICE_TYPE: {
5655                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5656                             sizeof(optval));
5657                         if (error != 0) {
5658                                 goto out;
5659                         }
5660                         error = so_set_net_service_type(so, optval);
5661                         break;
5662                 }
5663
5664                 case SO_QOSMARKING_POLICY_OVERRIDE:
5665                         error = priv_check_cred(kauth_cred_get(),
5666                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5667                         if (error != 0) {
5668                                 goto out;
5669                         }
5670                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5671                             sizeof(optval));
5672                         if (error != 0) {
5673                                 goto out;
5674                         }
5675                         if (optval == 0) {
5676                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5677                         } else {
5678                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5679                         }
5680                         break;
5681
5682                 case SO_MPKL_SEND_INFO: {
5683                         struct so_mpkl_send_info so_mpkl_send_info;
5684
5685                         error = sooptcopyin(sopt, &so_mpkl_send_info,
5686                             sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5687                         if (error != 0) {
5688                                 goto out;
5689                         }
5690                         uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5691                         so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5692
5693                         if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5694                                 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5695                         } else {
5696                                 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5697                         }
5698                         break;
5699                 }
5700                 case SO_WANT_KEV_SOCKET_CLOSED: {
5701                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5702                             sizeof(optval));
5703                         if (error != 0) {
5704                                 goto out;
5705                         }
5706                         if (optval == 0) {
5707                                 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5708                         } else {
5709                                 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5710                         }
5711                         break;
5712                 }
5713                 default:
5714                         error = ENOPROTOOPT;
5715                         break;
5716                 }
5717                 if (error == 0 && so->so_proto != NULL &&
5718                     so->so_proto->pr_ctloutput != NULL) {
5719                         (void) so->so_proto->pr_ctloutput(so, sopt);
5720                 }
5721         }
5722 out:
5723         if (dolock) {
5724                 socket_unlock(so, 1);
5725         }
5726         return error;
5727 }
5728
5729 /* Helper routines for getsockopt */
5730 int
5731 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5732 {
5733         int     error;
5734         size_t  valsize;
5735
5736         error = 0;
5737
5738         /*
5739          * Documented get behavior is that we always return a value,
5740          * possibly truncated to fit in the user's buffer.
5741          * Traditional behavior is that we always tell the user
5742          * precisely how much we copied, rather than something useful
5743          * like the total amount we had available for her.
5744          * Note that this interface is not idempotent; the entire answer must
5745          * generated ahead of time.
5746          */
5747         valsize = min(len, sopt->sopt_valsize);
5748         sopt->sopt_valsize = valsize;
5749         if (sopt->sopt_val != USER_ADDR_NULL) {
5750                 if (sopt->sopt_p != kernproc) {
5751                         error = copyout(buf, sopt->sopt_val, valsize);
5752                 } else {
5753                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5754                 }
5755         }
5756         return error;
5757 }
5758
5759 static int
5760 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5761 {
5762         int                     error;
5763         size_t                  len;
5764         struct user64_timeval   tv64 = {};
5765         struct user32_timeval   tv32 = {};
5766         const void *            val;
5767         size_t                  valsize;
5768
5769         error = 0;
5770         if (proc_is64bit(sopt->sopt_p)) {
5771                 len = sizeof(tv64);
5772                 tv64.tv_sec = tv_p->tv_sec;
5773                 tv64.tv_usec = tv_p->tv_usec;
5774                 val = &tv64;
5775         } else {
5776                 len = sizeof(tv32);
5777                 tv32.tv_sec = tv_p->tv_sec;
5778                 tv32.tv_usec = tv_p->tv_usec;
5779                 val = &tv32;
5780         }
5781         valsize = min(len, sopt->sopt_valsize);
5782         sopt->sopt_valsize = valsize;
5783         if (sopt->sopt_val != USER_ADDR_NULL) {
5784                 if (sopt->sopt_p != kernproc) {
5785                         error = copyout(val, sopt->sopt_val, valsize);
5786                 } else {
5787                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5788                 }
5789         }
5790         return error;
5791 }
5792
5793 /*
5794  * Return:      0                       Success
5795  *              ENOPROTOOPT
5796  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5797  *      <pr_ctloutput>:???
5798  *      <sf_getoption>:???
5799  */
5800 int
5801 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5802 {
5803         int     error, optval;
5804         struct  linger l;
5805         struct  timeval tv;
5806
5807         if (sopt->sopt_dir != SOPT_GET) {
5808                 sopt->sopt_dir = SOPT_GET;
5809         }
5810
5811         if (dolock) {
5812                 socket_lock(so, 1);
5813         }
5814
5815         error = sflt_getsockopt(so, sopt);
5816         if (error != 0) {
5817                 if (error == EJUSTRETURN) {
5818                         error = 0;
5819                 }
5820                 goto out;
5821         }
5822
5823         if (sopt->sopt_level != SOL_SOCKET) {
5824                 if (so->so_proto != NULL &&
5825                     so->so_proto->pr_ctloutput != NULL) {
5826                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5827                         goto out;
5828                 }
5829                 error = ENOPROTOOPT;
5830         } else {
5831                 /*
5832                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5833                  * the protocol layer, if needed.  A zero value returned from
5834                  * the handler means use default socket-level processing as
5835                  * done by the rest of this routine.  Otherwise, any other
5836                  * return value indicates that the option is unsupported.
5837                  */
5838                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5839                     pru_socheckopt(so, sopt)) != 0) {
5840                         goto out;
5841                 }
5842
5843                 error = 0;
5844                 switch (sopt->sopt_name) {
5845                 case SO_LINGER:
5846                 case SO_LINGER_SEC:
5847                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5848                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5849                             so->so_linger : so->so_linger / hz;
5850                         error = sooptcopyout(sopt, &l, sizeof(l));
5851                         break;
5852
5853                 case SO_USELOOPBACK:
5854                 case SO_DONTROUTE:
5855                 case SO_DEBUG:
5856                 case SO_KEEPALIVE:
5857                 case SO_REUSEADDR:
5858                 case SO_REUSEPORT:
5859                 case SO_BROADCAST:
5860                 case SO_OOBINLINE:
5861                 case SO_TIMESTAMP:
5862                 case SO_TIMESTAMP_MONOTONIC:
5863                 case SO_TIMESTAMP_CONTINUOUS:
5864                 case SO_DONTTRUNC:
5865                 case SO_WANTMORE:
5866                 case SO_WANTOOBFLAG:
5867                 case SO_NOWAKEFROMSLEEP:
5868                 case SO_NOAPNFALLBK:
5869                         optval = so->so_options & sopt->sopt_name;
5870 integer:
5871                         error = sooptcopyout(sopt, &optval, sizeof(optval));
5872                         break;
5873
5874                 case SO_TYPE:
5875                         optval = so->so_type;
5876                         goto integer;
5877
5878                 case SO_NREAD:
5879                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5880                                 int pkt_total;
5881                                 struct mbuf *m1;
5882
5883                                 pkt_total = 0;
5884                                 m1 = so->so_rcv.sb_mb;
5885                                 while (m1 != NULL) {
5886                                         if (m1->m_type == MT_DATA ||
5887                                             m1->m_type == MT_HEADER ||
5888                                             m1->m_type == MT_OOBDATA) {
5889                                                 pkt_total += m1->m_len;
5890                                         }
5891                                         m1 = m1->m_next;
5892                                 }
5893                                 optval = pkt_total;
5894                         } else {
5895                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5896                         }
5897                         goto integer;
5898
5899                 case SO_NUMRCVPKT:
5900                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5901                                 int cnt = 0;
5902                                 struct mbuf *m1;
5903
5904                                 m1 = so->so_rcv.sb_mb;
5905                                 while (m1 != NULL) {
5906                                         cnt += 1;
5907                                         m1 = m1->m_nextpkt;
5908                                 }
5909                                 optval = cnt;
5910                                 goto integer;
5911                         } else {
5912                                 error = ENOPROTOOPT;
5913                                 break;
5914                         }
5915
5916                 case SO_NWRITE:
5917                         optval = so->so_snd.sb_cc;
5918                         goto integer;
5919
5920                 case SO_ERROR:
5921                         optval = so->so_error;
5922                         so->so_error = 0;
5923                         goto integer;
5924
5925                 case SO_SNDBUF: {
5926                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5927
5928                         if (so->so_snd.sb_flags & SB_UNIX) {
5929                                 struct unpcb *unp =
5930                                     (struct unpcb *)(so->so_pcb);
5931                                 if (unp != NULL && unp->unp_conn != NULL) {
5932                                         hiwat += unp->unp_conn->unp_cc;
5933                                 }
5934                         }
5935
5936                         optval = hiwat;
5937                         goto integer;
5938                 }
5939                 case SO_RCVBUF:
5940                         optval = so->so_rcv.sb_hiwat;
5941                         goto integer;
5942
5943                 case SO_SNDLOWAT:
5944                         optval = so->so_snd.sb_lowat;
5945                         goto integer;
5946
5947                 case SO_RCVLOWAT:
5948                         optval = so->so_rcv.sb_lowat;
5949                         goto integer;
5950
5951                 case SO_SNDTIMEO:
5952                 case SO_RCVTIMEO:
5953                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5954                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5955
5956                         error = sooptcopyout_timeval(sopt, &tv);
5957                         break;
5958
5959                 case SO_NOSIGPIPE:
5960                         optval = (so->so_flags & SOF_NOSIGPIPE);
5961                         goto integer;
5962
5963                 case SO_NOADDRERR:
5964                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5965                         goto integer;
5966
5967                 case SO_REUSESHAREUID:
5968                         optval = (so->so_flags & SOF_REUSESHAREUID);
5969                         goto integer;
5970
5971
5972                 case SO_NOTIFYCONFLICT:
5973                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5974                         goto integer;
5975
5976                 case SO_RESTRICTIONS:
5977                         optval = so_get_restrictions(so);
5978                         goto integer;
5979
5980                 case SO_AWDL_UNRESTRICTED:
5981                         if (SOCK_DOM(so) == PF_INET ||
5982                             SOCK_DOM(so) == PF_INET6) {
5983                                 optval = inp_get_awdl_unrestricted(
5984                                         sotoinpcb(so));
5985                                 goto integer;
5986                         } else {
5987                                 error = EOPNOTSUPP;
5988                         }
5989                         break;
5990
5991                 case SO_INTCOPROC_ALLOW:
5992                         if (SOCK_DOM(so) == PF_INET6) {
5993                                 optval = inp_get_intcoproc_allowed(
5994                                         sotoinpcb(so));
5995                                 goto integer;
5996                         } else {
5997                                 error = EOPNOTSUPP;
5998                         }
5999                         break;
6000
6001                 case SO_LABEL:
6002                         error = EOPNOTSUPP;
6003                         break;
6004
6005                 case SO_PEERLABEL:
6006                         error = EOPNOTSUPP;
6007                         break;
6008
6009 #ifdef __APPLE_API_PRIVATE
6010                 case SO_UPCALLCLOSEWAIT:
6011                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6012                         goto integer;
6013 #endif
6014                 case SO_RANDOMPORT:
6015                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
6016                         goto integer;
6017
6018                 case SO_NP_EXTENSIONS: {
6019                         struct so_np_extensions sonpx = {};
6020
6021                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6022                             SONPX_SETOPTSHUT : 0;
6023                         sonpx.npx_mask = SONPX_MASK_VALID;
6024
6025                         error = sooptcopyout(sopt, &sonpx,
6026                             sizeof(struct so_np_extensions));
6027                         break;
6028                 }
6029
6030                 case SO_TRAFFIC_CLASS:
6031                         optval = so->so_traffic_class;
6032                         goto integer;
6033
6034                 case SO_RECV_TRAFFIC_CLASS:
6035                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6036                         goto integer;
6037
6038 #if (DEVELOPMENT || DEBUG)
6039                 case SO_TRAFFIC_CLASS_DBG:
6040                         error = sogetopt_tcdbg(so, sopt);
6041                         break;
6042 #endif /* (DEVELOPMENT || DEBUG) */
6043
6044                 case SO_PRIVILEGED_TRAFFIC_CLASS:
6045                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6046                         goto integer;
6047
6048                 case SO_DEFUNCTOK:
6049                         optval = !(so->so_flags & SOF_NODEFUNCT);
6050                         goto integer;
6051
6052                 case SO_ISDEFUNCT:
6053                         optval = (so->so_flags & SOF_DEFUNCT);
6054                         goto integer;
6055
6056                 case SO_OPPORTUNISTIC:
6057                         optval = so_get_opportunistic(so);
6058                         goto integer;
6059
6060                 case SO_FLUSH:
6061                         /* This option is not gettable */
6062                         error = EINVAL;
6063                         break;
6064
6065                 case SO_RECV_ANYIF:
6066                         optval = so_get_recv_anyif(so);
6067                         goto integer;
6068
6069                 case SO_TRAFFIC_MGT_BACKGROUND:
6070                         /* This option is handled by lower layer(s) */
6071                         if (so->so_proto != NULL &&
6072                             so->so_proto->pr_ctloutput != NULL) {
6073                                 (void) so->so_proto->pr_ctloutput(so, sopt);
6074                         }
6075                         break;
6076
6077 #if FLOW_DIVERT
6078                 case SO_FLOW_DIVERT_TOKEN:
6079                         error = flow_divert_token_get(so, sopt);
6080                         break;
6081 #endif  /* FLOW_DIVERT */
6082
6083 #if NECP
6084                 case SO_NECP_ATTRIBUTES:
6085                         error = necp_get_socket_attributes(so, sopt);
6086                         break;
6087
6088                 case SO_NECP_CLIENTUUID: {
6089                         uuid_t *ncu;
6090
6091                         if (SOCK_DOM(so) == PF_MULTIPATH) {
6092                                 ncu = &mpsotomppcb(so)->necp_client_uuid;
6093                         } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6094                                 ncu = &sotoinpcb(so)->necp_client_uuid;
6095                         } else {
6096                                 error = EINVAL;
6097                                 goto out;
6098                         }
6099
6100                         error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6101                         break;
6102                 }
6103
6104                 case SO_NECP_LISTENUUID: {
6105                         uuid_t *nlu;
6106
6107                         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6108                                 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6109                                         nlu = &sotoinpcb(so)->necp_client_uuid;
6110                                 } else {
6111                                         error = ENOENT;
6112                                         goto out;
6113                                 }
6114                         } else {
6115                                 error = EINVAL;
6116                                 goto out;
6117                         }
6118
6119                         error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6120                         break;
6121                 }
6122 #endif /* NECP */
6123
6124 #if CONTENT_FILTER
6125                 case SO_CFIL_SOCK_ID: {
6126                         cfil_sock_id_t sock_id;
6127
6128                         sock_id = cfil_sock_id_from_socket(so);
6129
6130                         error = sooptcopyout(sopt, &sock_id,
6131                             sizeof(cfil_sock_id_t));
6132                         break;
6133                 }
6134 #endif  /* CONTENT_FILTER */
6135
6136                 case SO_EXTENDED_BK_IDLE:
6137                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6138                         goto integer;
6139                 case SO_MARK_CELLFALLBACK:
6140                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6141                             ? 1 : 0;
6142                         goto integer;
6143                 case SO_NET_SERVICE_TYPE: {
6144                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6145                                 optval = so->so_netsvctype;
6146                         } else {
6147                                 optval = NET_SERVICE_TYPE_BE;
6148                         }
6149                         goto integer;
6150                 }
6151                 case SO_NETSVC_MARKING_LEVEL:
6152                         optval = so_get_netsvc_marking_level(so);
6153                         goto integer;
6154
6155                 case SO_MPKL_SEND_INFO: {
6156                         struct so_mpkl_send_info so_mpkl_send_info;
6157
6158                         uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6159                         so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6160                         error = sooptcopyout(sopt, &so_mpkl_send_info,
6161                             sizeof(struct so_mpkl_send_info));
6162                         break;
6163                 }
6164                 default:
6165                         error = ENOPROTOOPT;
6166                         break;
6167                 }
6168         }
6169 out:
6170         if (dolock) {
6171                 socket_unlock(so, 1);
6172         }
6173         return error;
6174 }
6175
6176 /*
6177  * The size limits on our soopt_getm is different from that on FreeBSD.
6178  * We limit the size of options to MCLBYTES. This will have to change
6179  * if we need to define options that need more space than MCLBYTES.
6180  */
6181 int
6182 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6183 {
6184         struct mbuf *m, *m_prev;
6185         int sopt_size = sopt->sopt_valsize;
6186         int how;
6187
6188         if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6189                 return EMSGSIZE;
6190         }
6191
6192         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6193         MGET(m, how, MT_DATA);
6194         if (m == NULL) {
6195                 return ENOBUFS;
6196         }
6197         if (sopt_size > MLEN) {
6198                 MCLGET(m, how);
6199                 if ((m->m_flags & M_EXT) == 0) {
6200                         m_free(m);
6201                         return ENOBUFS;
6202                 }
6203                 m->m_len = min(MCLBYTES, sopt_size);
6204         } else {
6205                 m->m_len = min(MLEN, sopt_size);
6206         }
6207         sopt_size -= m->m_len;
6208         *mp = m;
6209         m_prev = m;
6210
6211         while (sopt_size > 0) {
6212                 MGET(m, how, MT_DATA);
6213                 if (m == NULL) {
6214                         m_freem(*mp);
6215                         return ENOBUFS;
6216                 }
6217                 if (sopt_size > MLEN) {
6218                         MCLGET(m, how);
6219                         if ((m->m_flags & M_EXT) == 0) {
6220                                 m_freem(*mp);
6221                                 m_freem(m);
6222                                 return ENOBUFS;
6223                         }
6224                         m->m_len = min(MCLBYTES, sopt_size);
6225                 } else {
6226                         m->m_len = min(MLEN, sopt_size);
6227                 }
6228                 sopt_size -= m->m_len;
6229                 m_prev->m_next = m;
6230                 m_prev = m;
6231         }
6232         return 0;
6233 }
6234
6235 /* copyin sopt data into mbuf chain */
6236 int
6237 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6238 {
6239         struct mbuf *m0 = m;
6240
6241         if (sopt->sopt_val == USER_ADDR_NULL) {
6242                 return 0;
6243         }
6244         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6245                 if (sopt->sopt_p != kernproc) {
6246                         int error;
6247
6248                         error = copyin(sopt->sopt_val, mtod(m, char *),
6249                             m->m_len);
6250                         if (error != 0) {
6251                                 m_freem(m0);
6252                                 return error;
6253                         }
6254                 } else {
6255                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6256                             mtod(m, char *), m->m_len);
6257                 }
6258                 sopt->sopt_valsize -= m->m_len;
6259                 sopt->sopt_val += m->m_len;
6260                 m = m->m_next;
6261         }
6262         /* should be allocated enoughly at ip6_sooptmcopyin() */
6263         if (m != NULL) {
6264                 panic("soopt_mcopyin");
6265                 /* NOTREACHED */
6266         }
6267         return 0;
6268 }
6269
6270 /* copyout mbuf chain data into soopt */
6271 int
6272 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6273 {
6274         struct mbuf *m0 = m;
6275         size_t valsize = 0;
6276
6277         if (sopt->sopt_val == USER_ADDR_NULL) {
6278                 return 0;
6279         }
6280         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6281                 if (sopt->sopt_p != kernproc) {
6282                         int error;
6283
6284                         error = copyout(mtod(m, char *), sopt->sopt_val,
6285                             m->m_len);
6286                         if (error != 0) {
6287                                 m_freem(m0);
6288                                 return error;
6289                         }
6290                 } else {
6291                         bcopy(mtod(m, char *),
6292                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6293                 }
6294                 sopt->sopt_valsize -= m->m_len;
6295                 sopt->sopt_val += m->m_len;
6296                 valsize += m->m_len;
6297                 m = m->m_next;
6298         }
6299         if (m != NULL) {
6300                 /* enough soopt buffer should be given from user-land */
6301                 m_freem(m0);
6302                 return EINVAL;
6303         }
6304         sopt->sopt_valsize = valsize;
6305         return 0;
6306 }
6307
6308 void
6309 sohasoutofband(struct socket *so)
6310 {
6311         if (so->so_pgid < 0) {
6312                 gsignal(-so->so_pgid, SIGURG);
6313         } else if (so->so_pgid > 0) {
6314                 proc_signal(so->so_pgid, SIGURG);
6315         }
6316         selwakeup(&so->so_rcv.sb_sel);
6317         if (so->so_rcv.sb_flags & SB_KNOTE) {
6318                 KNOTE(&so->so_rcv.sb_sel.si_note,
6319                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
6320         }
6321 }
6322
6323 int
6324 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6325 {
6326 #pragma unused(cred)
6327         struct proc *p = current_proc();
6328         int revents = 0;
6329
6330         socket_lock(so, 1);
6331         so_update_last_owner_locked(so, PROC_NULL);
6332         so_update_policy(so);
6333
6334         if (events & (POLLIN | POLLRDNORM)) {
6335                 if (soreadable(so)) {
6336                         revents |= events & (POLLIN | POLLRDNORM);
6337                 }
6338         }
6339
6340         if (events & (POLLOUT | POLLWRNORM)) {
6341                 if (sowriteable(so)) {
6342                         revents |= events & (POLLOUT | POLLWRNORM);
6343                 }
6344         }
6345
6346         if (events & (POLLPRI | POLLRDBAND)) {
6347                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6348                         revents |= events & (POLLPRI | POLLRDBAND);
6349                 }
6350         }
6351
6352         if (revents == 0) {
6353                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6354                         /*
6355                          * Darwin sets the flag first,
6356                          * BSD calls selrecord first
6357                          */
6358                         so->so_rcv.sb_flags |= SB_SEL;
6359                         selrecord(p, &so->so_rcv.sb_sel, wql);
6360                 }
6361
6362                 if (events & (POLLOUT | POLLWRNORM)) {
6363                         /*
6364                          * Darwin sets the flag first,
6365                          * BSD calls selrecord first
6366                          */
6367                         so->so_snd.sb_flags |= SB_SEL;
6368                         selrecord(p, &so->so_snd.sb_sel, wql);
6369                 }
6370         }
6371
6372         socket_unlock(so, 1);
6373         return revents;
6374 }
6375
6376 int
6377 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6378 {
6379         struct socket *so = (struct socket *)fp->fp_glob->fg_data;
6380         int result;
6381
6382         socket_lock(so, 1);
6383         so_update_last_owner_locked(so, PROC_NULL);
6384         so_update_policy(so);
6385
6386         switch (kn->kn_filter) {
6387         case EVFILT_READ:
6388                 kn->kn_filtid = EVFILTID_SOREAD;
6389                 break;
6390         case EVFILT_WRITE:
6391                 kn->kn_filtid = EVFILTID_SOWRITE;
6392                 break;
6393         case EVFILT_SOCK:
6394                 kn->kn_filtid = EVFILTID_SCK;
6395                 break;
6396         case EVFILT_EXCEPT:
6397                 kn->kn_filtid = EVFILTID_SOEXCEPT;
6398                 break;
6399         default:
6400                 socket_unlock(so, 1);
6401                 knote_set_error(kn, EINVAL);
6402                 return 0;
6403         }
6404
6405         /*
6406          * call the appropriate sub-filter attach
6407          * with the socket still locked
6408          */
6409         result = knote_fops(kn)->f_attach(kn, kev);
6410
6411         socket_unlock(so, 1);
6412
6413         return result;
6414 }
6415
6416 static int
6417 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6418 {
6419         int retval = 0;
6420         int64_t data = 0;
6421
6422         if (so->so_options & SO_ACCEPTCONN) {
6423                 /*
6424                  * Radar 6615193 handle the listen case dynamically
6425                  * for kqueue read filter. This allows to call listen()
6426                  * after registering the kqueue EVFILT_READ.
6427                  */
6428
6429                 retval = !TAILQ_EMPTY(&so->so_comp);
6430                 data = so->so_qlen;
6431                 goto out;
6432         }
6433
6434         /* socket isn't a listener */
6435         /*
6436          * NOTE_LOWAT specifies new low water mark in data, i.e.
6437          * the bytes of protocol data. We therefore exclude any
6438          * control bytes.
6439          */
6440         data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6441
6442         if (kn->kn_sfflags & NOTE_OOB) {
6443                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6444                         kn->kn_fflags |= NOTE_OOB;
6445                         data -= so->so_oobmark;
6446                         retval = 1;
6447                         goto out;
6448                 }
6449         }
6450
6451         if ((so->so_state & SS_CANTRCVMORE)
6452 #if CONTENT_FILTER
6453             && cfil_sock_data_pending(&so->so_rcv) == 0
6454 #endif /* CONTENT_FILTER */
6455             ) {
6456                 kn->kn_flags |= EV_EOF;
6457                 kn->kn_fflags = so->so_error;
6458                 retval = 1;
6459                 goto out;
6460         }
6461
6462         if (so->so_error) {     /* temporary udp error */
6463                 retval = 1;
6464                 goto out;
6465         }
6466
6467         int64_t lowwat = so->so_rcv.sb_lowat;
6468         /*
6469          * Ensure that when NOTE_LOWAT is used, the derived
6470          * low water mark is bounded by socket's rcv buf's
6471          * high and low water mark values.
6472          */
6473         if (kn->kn_sfflags & NOTE_LOWAT) {
6474                 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6475                         lowwat = so->so_rcv.sb_hiwat;
6476                 } else if (kn->kn_sdata > lowwat) {
6477                         lowwat = kn->kn_sdata;
6478                 }
6479         }
6480
6481         /*
6482          * While the `data` field is the amount of data to read,
6483          * 0-sized packets need to wake up the kqueue, see 58140856,
6484          * so we need to take control bytes into account too.
6485          */
6486         retval = (so->so_rcv.sb_cc >= lowwat);
6487
6488 out:
6489         if (retval && kev) {
6490                 knote_fill_kevent(kn, kev, data);
6491         }
6492         return retval;
6493 }
6494
6495 static int
6496 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6497 {
6498         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6499
6500         /* socket locked */
6501
6502         /*
6503          * If the caller explicitly asked for OOB results (e.g. poll())
6504          * from EVFILT_READ, then save that off in the hookid field
6505          * and reserve the kn_flags EV_OOBAND bit for output only.
6506          */
6507         if (kn->kn_filter == EVFILT_READ &&
6508             kn->kn_flags & EV_OOBAND) {
6509                 kn->kn_flags &= ~EV_OOBAND;
6510                 kn->kn_hook32 = EV_OOBAND;
6511         } else {
6512                 kn->kn_hook32 = 0;
6513         }
6514         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6515                 so->so_rcv.sb_flags |= SB_KNOTE;
6516         }
6517
6518         /* indicate if event is already fired */
6519         return filt_soread_common(kn, NULL, so);
6520 }
6521
6522 static void
6523 filt_sordetach(struct knote *kn)
6524 {
6525         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6526
6527         socket_lock(so, 1);
6528         if (so->so_rcv.sb_flags & SB_KNOTE) {
6529                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6530                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6531                 }
6532         }
6533         socket_unlock(so, 1);
6534 }
6535
6536 /*ARGSUSED*/
6537 static int
6538 filt_soread(struct knote *kn, long hint)
6539 {
6540         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6541         int retval;
6542
6543         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6544                 socket_lock(so, 1);
6545         }
6546
6547         retval = filt_soread_common(kn, NULL, so);
6548
6549         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6550                 socket_unlock(so, 1);
6551         }
6552
6553         return retval;
6554 }
6555
6556 static int
6557 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6558 {
6559         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6560         int retval;
6561
6562         socket_lock(so, 1);
6563
6564         /* save off the new input fflags and data */
6565         kn->kn_sfflags = kev->fflags;
6566         kn->kn_sdata = kev->data;
6567
6568         /* determine if changes result in fired events */
6569         retval = filt_soread_common(kn, NULL, so);
6570
6571         socket_unlock(so, 1);
6572
6573         return retval;
6574 }
6575
6576 static int
6577 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6578 {
6579         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6580         int retval;
6581
6582         socket_lock(so, 1);
6583         retval = filt_soread_common(kn, kev, so);
6584         socket_unlock(so, 1);
6585
6586         return retval;
6587 }
6588
6589 int
6590 so_wait_for_if_feedback(struct socket *so)
6591 {
6592         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6593             (so->so_state & SS_ISCONNECTED)) {
6594                 struct inpcb *inp = sotoinpcb(so);
6595                 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6596                         return 1;
6597                 }
6598         }
6599         return 0;
6600 }
6601
/*
 * Shared evaluation of a socket write knote.
 *
 * Returns non-zero when the knote should fire.  When it fires and
 * `kev' is non-NULL, the event is filled in through
 * knote_fill_kevent() with the send buffer space sampled on entry.
 */
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
        int ret = 0;
        /* Space in the send buffer; this is the value reported as data */
        int64_t data = sbspace(&so->so_snd);

        /* Send side shut down: report EOF along with so_error */
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
                ret = 1;
                goto out;
        }

        if (so->so_error) {     /* temporary udp error */
                ret = 1;
                goto out;
        }

        /* Never fire while socanwrite() says the socket cannot be written */
        if (!socanwrite(so)) {
                ret = 0;
                goto out;
        }

        /* SOF1_PRECONNECT_DATA set: fire without consulting the low water mark */
        if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
                ret = 1;
                goto out;
        }

        int64_t lowwat = so->so_snd.sb_lowat;

        /*
         * With NOTE_LOWAT, kn_sdata can raise the threshold above
         * sb_lowat, but it is capped at sb_hiwat.
         */
        if (kn->kn_sfflags & NOTE_LOWAT) {
                if (kn->kn_sdata > so->so_snd.sb_hiwat) {
                        lowwat = so->so_snd.sb_hiwat;
                } else if (kn->kn_sdata > lowwat) {
                        lowwat = kn->kn_sdata;
                }
        }

        if (data >= lowwat) {
                if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
                    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
                    ) {
                        /*
                         * SOF_NOTSENT_LOWAT is set: for PF_INET/PF_INET6
                         * stream sockets (and PF_MULTIPATH TCP when
                         * MPTCP is built) the decision is delegated to
                         * the protocol's notsent-lowat check.
                         */
                        if ((SOCK_DOM(so) == PF_INET ||
                            SOCK_DOM(so) == PF_INET6) &&
                            so->so_type == SOCK_STREAM) {
                                ret = tcp_notsent_lowat_check(so);
                        }
#if MPTCP
                        else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
                            (SOCK_PROTO(so) == IPPROTO_TCP)) {
                                ret = mptcp_notsent_lowat_check(so);
                        }
#endif
                        else {
                                /*
                                 * Any other socket fires right away;
                                 * note that this jump skips the
                                 * so_wait_for_if_feedback() check below.
                                 */
                                ret = 1;
                                goto out;
                        }
                } else {
                        ret = 1;
                }
        }
        /* Suppress the event while so_wait_for_if_feedback() is true */
        if (so_wait_for_if_feedback(so)) {
                ret = 0;
        }

out:
        if (ret && kev) {
                knote_fill_kevent(kn, kev, data);
        }
        return ret;
}
6675
6676 static int
6677 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6678 {
6679         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6680
6681         /* socket locked */
6682         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6683                 so->so_snd.sb_flags |= SB_KNOTE;
6684         }
6685
6686         /* determine if its already fired */
6687         return filt_sowrite_common(kn, NULL, so);
6688 }
6689
6690 static void
6691 filt_sowdetach(struct knote *kn)
6692 {
6693         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6694         socket_lock(so, 1);
6695
6696         if (so->so_snd.sb_flags & SB_KNOTE) {
6697                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6698                         so->so_snd.sb_flags &= ~SB_KNOTE;
6699                 }
6700         }
6701         socket_unlock(so, 1);
6702 }
6703
6704 /*ARGSUSED*/
6705 static int
6706 filt_sowrite(struct knote *kn, long hint)
6707 {
6708         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6709         int ret;
6710
6711         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6712                 socket_lock(so, 1);
6713         }
6714
6715         ret = filt_sowrite_common(kn, NULL, so);
6716
6717         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6718                 socket_unlock(so, 1);
6719         }
6720
6721         return ret;
6722 }
6723
6724 static int
6725 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6726 {
6727         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6728         int ret;
6729
6730         socket_lock(so, 1);
6731
6732         /*save off the new input fflags and data */
6733         kn->kn_sfflags = kev->fflags;
6734         kn->kn_sdata = kev->data;
6735
6736         /* determine if these changes result in a triggered event */
6737         ret = filt_sowrite_common(kn, NULL, so);
6738
6739         socket_unlock(so, 1);
6740
6741         return ret;
6742 }
6743
6744 static int
6745 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6746 {
6747         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6748         int ret;
6749
6750         socket_lock(so, 1);
6751         ret = filt_sowrite_common(kn, kev, so);
6752         socket_unlock(so, 1);
6753
6754         return ret;
6755 }
6756
6757 static int
6758 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6759     struct socket *so, long ev_hint)
6760 {
6761         int ret = 0;
6762         int64_t data = 0;
6763         uint32_t level_trigger = 0;
6764
6765         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6766                 kn->kn_fflags |= NOTE_CONNRESET;
6767         }
6768         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6769                 kn->kn_fflags |= NOTE_TIMEOUT;
6770         }
6771         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6772                 kn->kn_fflags |= NOTE_NOSRCADDR;
6773         }
6774         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6775                 kn->kn_fflags |= NOTE_IFDENIED;
6776         }
6777         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6778                 kn->kn_fflags |= NOTE_KEEPALIVE;
6779         }
6780         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6781                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6782         }
6783         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6784                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6785         }
6786         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6787             (so->so_state & SS_ISCONNECTED)) {
6788                 kn->kn_fflags |= NOTE_CONNECTED;
6789                 level_trigger |= NOTE_CONNECTED;
6790         }
6791         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6792             (so->so_state & SS_ISDISCONNECTED)) {
6793                 kn->kn_fflags |= NOTE_DISCONNECTED;
6794                 level_trigger |= NOTE_DISCONNECTED;
6795         }
6796         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6797                 if (so->so_proto != NULL &&
6798                     (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6799                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6800                 }
6801         }
6802
6803         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6804             tcp_notify_ack_active(so)) {
6805                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6806         }
6807
6808         if ((so->so_state & SS_CANTRCVMORE)
6809 #if CONTENT_FILTER
6810             && cfil_sock_data_pending(&so->so_rcv) == 0
6811 #endif /* CONTENT_FILTER */
6812             ) {
6813                 kn->kn_fflags |= NOTE_READCLOSED;
6814                 level_trigger |= NOTE_READCLOSED;
6815         }
6816
6817         if (so->so_state & SS_CANTSENDMORE) {
6818                 kn->kn_fflags |= NOTE_WRITECLOSED;
6819                 level_trigger |= NOTE_WRITECLOSED;
6820         }
6821
6822         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6823             (so->so_flags & SOF_SUSPENDED)) {
6824                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6825
6826                 /* If resume event was delivered before, reset it */
6827                 kn->kn_hook32 &= ~NOTE_RESUME;
6828
6829                 kn->kn_fflags |= NOTE_SUSPEND;
6830                 level_trigger |= NOTE_SUSPEND;
6831         }
6832
6833         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6834             (so->so_flags & SOF_SUSPENDED) == 0) {
6835                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6836
6837                 /* If suspend event was delivered before, reset it */
6838                 kn->kn_hook32 &= ~NOTE_SUSPEND;
6839
6840                 kn->kn_fflags |= NOTE_RESUME;
6841                 level_trigger |= NOTE_RESUME;
6842         }
6843
6844         if (so->so_error != 0) {
6845                 ret = 1;
6846                 data = so->so_error;
6847                 kn->kn_flags |= EV_EOF;
6848         } else {
6849                 u_int32_t data32 = 0;
6850                 get_sockev_state(so, &data32);
6851                 data = data32;
6852         }
6853
6854         /* Reset any events that are not requested on this knote */
6855         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6856         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6857
6858         /* Find the level triggerred events that are already delivered */
6859         level_trigger &= kn->kn_hook32;
6860         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6861
6862         /* Do not deliver level triggerred events more than once */
6863         if ((kn->kn_fflags & ~level_trigger) != 0) {
6864                 ret = 1;
6865         }
6866
6867         if (ret && kev) {
6868                 /*
6869                  * Store the state of the events being delivered. This
6870                  * state can be used to deliver level triggered events
6871                  * ateast once and still avoid waking up the application
6872                  * multiple times as long as the event is active.
6873                  */
6874                 if (kn->kn_fflags != 0) {
6875                         kn->kn_hook32 |= (kn->kn_fflags &
6876                             EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6877                 }
6878
6879                 /*
6880                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6881                  * only one of them and remember the last one that was
6882                  * delivered last
6883                  */
6884                 if (kn->kn_fflags & NOTE_SUSPEND) {
6885                         kn->kn_hook32 &= ~NOTE_RESUME;
6886                 }
6887                 if (kn->kn_fflags & NOTE_RESUME) {
6888                         kn->kn_hook32 &= ~NOTE_SUSPEND;
6889                 }
6890
6891                 knote_fill_kevent(kn, kev, data);
6892         }
6893         return ret;
6894 }
6895
6896 static int
6897 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6898 {
6899         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6900
6901         /* socket locked */
6902         kn->kn_hook32 = 0;
6903         if (KNOTE_ATTACH(&so->so_klist, kn)) {
6904                 so->so_flags |= SOF_KNOTE;
6905         }
6906
6907         /* determine if event already fired */
6908         return filt_sockev_common(kn, NULL, so, 0);
6909 }
6910
6911 static void
6912 filt_sockdetach(struct knote *kn)
6913 {
6914         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6915         socket_lock(so, 1);
6916
6917         if ((so->so_flags & SOF_KNOTE) != 0) {
6918                 if (KNOTE_DETACH(&so->so_klist, kn)) {
6919                         so->so_flags &= ~SOF_KNOTE;
6920                 }
6921         }
6922         socket_unlock(so, 1);
6923 }
6924
6925 static int
6926 filt_sockev(struct knote *kn, long hint)
6927 {
6928         int ret = 0, locked = 0;
6929         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6930         long ev_hint = (hint & SO_FILT_HINT_EV);
6931
6932         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6933                 socket_lock(so, 1);
6934                 locked = 1;
6935         }
6936
6937         ret = filt_sockev_common(kn, NULL, so, ev_hint);
6938
6939         if (locked) {
6940                 socket_unlock(so, 1);
6941         }
6942
6943         return ret;
6944 }
6945
6946
6947
6948 /*
6949  *      filt_socktouch - update event state
6950  */
6951 static int
6952 filt_socktouch(
6953         struct knote *kn,
6954         struct kevent_qos_s *kev)
6955 {
6956         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6957         uint32_t changed_flags;
6958         int ret;
6959
6960         socket_lock(so, 1);
6961
6962         /* save off the [result] data and fflags */
6963         changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
6964
6965         /* save off the new input fflags and data */
6966         kn->kn_sfflags = kev->fflags;
6967         kn->kn_sdata = kev->data;
6968
6969         /* restrict the current results to the (smaller?) set of new interest */
6970         /*
6971          * For compatibility with previous implementations, we leave kn_fflags
6972          * as they were before.
6973          */
6974         //kn->kn_fflags &= kev->fflags;
6975
6976         /*
6977          * Since we keep track of events that are already
6978          * delivered, if any of those events are not requested
6979          * anymore the state related to them can be reset
6980          */
6981         kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6982
6983         /* determine if we have events to deliver */
6984         ret = filt_sockev_common(kn, NULL, so, 0);
6985
6986         socket_unlock(so, 1);
6987
6988         return ret;
6989 }
6990
6991 /*
6992  *      filt_sockprocess - query event fired state and return data
6993  */
6994 static int
6995 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
6996 {
6997         struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6998         int ret = 0;
6999
7000         socket_lock(so, 1);
7001
7002         ret = filt_sockev_common(kn, kev, so, 0);
7003
7004         socket_unlock(so, 1);
7005
7006         return ret;
7007 }
7008
7009 void
7010 get_sockev_state(struct socket *so, u_int32_t *statep)
7011 {
7012         u_int32_t state = *(statep);
7013
7014         /*
7015          * If the state variable is already used by a previous event,
7016          * reset it.
7017          */
7018         if (state != 0) {
7019                 return;
7020         }
7021
7022         if (so->so_state & SS_ISCONNECTED) {
7023                 state |= SOCKEV_CONNECTED;
7024         } else {
7025                 state &= ~(SOCKEV_CONNECTED);
7026         }
7027         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7028         *(statep) = state;
7029 }
7030
7031 #define SO_LOCK_HISTORY_STR_LEN \
7032         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7033
7034 __private_extern__ const char *
7035 solockhistory_nr(struct socket *so)
7036 {
7037         size_t n = 0;
7038         int i;
7039         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7040
7041         bzero(lock_history_str, sizeof(lock_history_str));
7042         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7043                 n += scnprintf(lock_history_str + n,
7044                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7045                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7046                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7047         }
7048         return lock_history_str;
7049 }
7050
7051 lck_mtx_t *
7052 socket_getlock(struct socket *so, int flags)
7053 {
7054         if (so->so_proto->pr_getlock != NULL) {
7055                 return (*so->so_proto->pr_getlock)(so, flags);
7056         } else {
7057                 return so->so_proto->pr_domain->dom_mtx;
7058         }
7059 }
7060
7061 void
7062 socket_lock(struct socket *so, int refcount)
7063 {
7064         void *lr_saved;
7065
7066         lr_saved = __builtin_return_address(0);
7067
7068         if (so->so_proto->pr_lock) {
7069                 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7070         } else {
7071 #ifdef MORE_LOCKING_DEBUG
7072                 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7073                     LCK_MTX_ASSERT_NOTOWNED);
7074 #endif
7075                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7076                 if (refcount) {
7077                         so->so_usecount++;
7078                 }
7079                 so->lock_lr[so->next_lock_lr] = lr_saved;
7080                 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7081         }
7082 }
7083
7084 void
7085 socket_lock_assert_owned(struct socket *so)
7086 {
7087         lck_mtx_t *mutex_held;
7088
7089         if (so->so_proto->pr_getlock != NULL) {
7090                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7091         } else {
7092                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7093         }
7094
7095         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7096 }
7097
7098 int
7099 socket_try_lock(struct socket *so)
7100 {
7101         lck_mtx_t *mtx;
7102
7103         if (so->so_proto->pr_getlock != NULL) {
7104                 mtx = (*so->so_proto->pr_getlock)(so, 0);
7105         } else {
7106                 mtx = so->so_proto->pr_domain->dom_mtx;
7107         }
7108
7109         return lck_mtx_try_lock(mtx);
7110 }
7111
7112 void
7113 socket_unlock(struct socket *so, int refcount)
7114 {
7115         void *lr_saved;
7116         lck_mtx_t *mutex_held;
7117
7118         lr_saved = __builtin_return_address(0);
7119
7120         if (so == NULL || so->so_proto == NULL) {
7121                 panic("%s: null so_proto so=%p\n", __func__, so);
7122                 /* NOTREACHED */
7123         }
7124
7125         if (so->so_proto->pr_unlock) {
7126                 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7127         } else {
7128                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7129 #ifdef MORE_LOCKING_DEBUG
7130                 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7131 #endif
7132                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7133                 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7134
7135                 if (refcount) {
7136                         if (so->so_usecount <= 0) {
7137                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7138                                     "lrh=%s", __func__, so->so_usecount, so,
7139                                     SOCK_DOM(so), so->so_type,
7140                                     SOCK_PROTO(so), solockhistory_nr(so));
7141                                 /* NOTREACHED */
7142                         }
7143
7144                         so->so_usecount--;
7145                         if (so->so_usecount == 0) {
7146                                 sofreelastref(so, 1);
7147                         }
7148                 }
7149                 lck_mtx_unlock(mutex_held);
7150         }
7151 }
7152
7153 /* Called with socket locked, will unlock socket */
7154 void
7155 sofree(struct socket *so)
7156 {
7157         lck_mtx_t *mutex_held;
7158
7159         if (so->so_proto->pr_getlock != NULL) {
7160                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7161         } else {
7162                 mutex_held = so->so_proto->pr_domain->dom_mtx;
7163         }
7164         LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7165
7166         sofreelastref(so, 0);
7167 }
7168
/*
 * Take a reference on a socket: lock it with the refcount flag set,
 * then unlock it with the flag clear so the reference is kept.
 */
void
soreference(struct socket *so)
{
        /* lock and take one reference on the socket */
        socket_lock(so, 1);

        /* release only the lock */
        socket_unlock(so, 0);
}
7175
/*
 * Drop a reference on a socket: lock it without the refcount flag,
 * then unlock it with the flag set so one reference is released.
 */
void
sodereference(struct socket *so)
{
        /* lock only */
        socket_lock(so, 0);

        /* unlock and release one reference */
        socket_unlock(so, 1);
}
7182
7183 /*
7184  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7185  * possibility of using jumbo clusters.  Caller must ensure to hold
7186  * the socket lock.
7187  */
7188 void
7189 somultipages(struct socket *so, boolean_t set)
7190 {
7191         if (set) {
7192                 so->so_flags |= SOF_MULTIPAGES;
7193         } else {
7194                 so->so_flags &= ~SOF_MULTIPAGES;
7195         }
7196 }
7197
7198 void
7199 soif2kcl(struct socket *so, boolean_t set)
7200 {
7201         if (set) {
7202                 so->so_flags1 |= SOF1_IF_2KCL;
7203         } else {
7204                 so->so_flags1 &= ~SOF1_IF_2KCL;
7205         }
7206 }
7207
7208 int
7209 so_isdstlocal(struct socket *so)
7210 {
7211         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7212
7213         if (SOCK_DOM(so) == PF_INET) {
7214                 return inaddr_local(inp->inp_faddr);
7215         } else if (SOCK_DOM(so) == PF_INET6) {
7216                 return in6addr_local(&inp->in6p_faddr);
7217         }
7218
7219         return 0;
7220 }
7221
7222 int
7223 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7224 {
7225         struct sockbuf *rcv, *snd;
7226         int err = 0, defunct;
7227
7228         rcv = &so->so_rcv;
7229         snd = &so->so_snd;
7230
7231         defunct = (so->so_flags & SOF_DEFUNCT);
7232         if (defunct) {
7233                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7234                         panic("%s: SB_DROP not set", __func__);
7235                         /* NOTREACHED */
7236                 }
7237                 goto done;
7238         }
7239
7240         if (so->so_flags & SOF_NODEFUNCT) {
7241                 if (noforce) {
7242                         err = EOPNOTSUPP;
7243                         if (p != PROC_NULL) {
7244                                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7245                                     "name %s level %d) so 0x%llx [%d,%d] "
7246                                     "is not eligible for defunct "
7247                                     "(%d)\n", __func__, proc_selfpid(),
7248                                     proc_best_name(current_proc()), proc_pid(p),
7249                                     proc_best_name(p), level,
7250                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7251                                     SOCK_DOM(so), SOCK_TYPE(so), err);
7252                         }
7253                         return err;
7254                 }
7255                 so->so_flags &= ~SOF_NODEFUNCT;
7256                 if (p != PROC_NULL) {
7257                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7258                             "name %s level %d) so 0x%llx [%d,%d] "
7259                             "defunct by force "
7260                             "(%d)\n", __func__, proc_selfpid(),
7261                             proc_best_name(current_proc()), proc_pid(p),
7262                             proc_best_name(p), level,
7263                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7264                             SOCK_DOM(so), SOCK_TYPE(so), err);
7265                 }
7266         } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7267                 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7268                 struct ifnet *ifp = inp->inp_last_outifp;
7269
7270                 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7271                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7272                 } else if (so->so_flags & SOF_DELEGATED) {
7273                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7274                 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7275                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7276                 } else if (noforce && p != PROC_NULL) {
7277                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7278
7279                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7280                         so->so_extended_bk_start = net_uptime();
7281                         OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7282
7283                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7284
7285                         err = EOPNOTSUPP;
7286                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7287                             "name %s level %d) so 0x%llx [%d,%d] "
7288                             "extend bk idle "
7289                             "(%d)\n", __func__, proc_selfpid(),
7290                             proc_best_name(current_proc()), proc_pid(p),
7291                             proc_best_name(p), level,
7292                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7293                             SOCK_DOM(so), SOCK_TYPE(so), err);
7294                         return err;
7295                 } else {
7296                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7297                 }
7298         }
7299
7300         so->so_flags |= SOF_DEFUNCT;
7301
7302         /* Prevent further data from being appended to the socket buffers */
7303         snd->sb_flags |= SB_DROP;
7304         rcv->sb_flags |= SB_DROP;
7305
7306         /* Flush any existing data in the socket buffers */
7307         if (rcv->sb_cc != 0) {
7308                 rcv->sb_flags &= ~SB_SEL;
7309                 selthreadclear(&rcv->sb_sel);
7310                 sbrelease(rcv);
7311         }
7312         if (snd->sb_cc != 0) {
7313                 snd->sb_flags &= ~SB_SEL;
7314                 selthreadclear(&snd->sb_sel);
7315                 sbrelease(snd);
7316         }
7317
7318 done:
7319         if (p != PROC_NULL) {
7320                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7321                     "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7322                     proc_selfpid(), proc_best_name(current_proc()),
7323                     proc_pid(p), proc_best_name(p), level,
7324                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7325                     SOCK_TYPE(so), defunct ? "is already" : "marked as",
7326                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7327                     " extbkidle" : "");
7328         }
7329         return err;
7330 }
7331
7332 int
7333 sodefunct(struct proc *p, struct socket *so, int level)
7334 {
7335         struct sockbuf *rcv, *snd;
7336
7337         if (!(so->so_flags & SOF_DEFUNCT)) {
7338                 panic("%s improperly called", __func__);
7339                 /* NOTREACHED */
7340         }
7341         if (so->so_state & SS_DEFUNCT) {
7342                 goto done;
7343         }
7344
7345         rcv = &so->so_rcv;
7346         snd = &so->so_snd;
7347
7348         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7349                 char s[MAX_IPv6_STR_LEN];
7350                 char d[MAX_IPv6_STR_LEN];
7351                 struct inpcb *inp = sotoinpcb(so);
7352
7353                 if (p != PROC_NULL) {
7354                         SODEFUNCTLOG(
7355                                 "%s[%d, %s]: (target pid %d name %s level %d) "
7356                                 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7357                                 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7358                                 " snd_fl 0x%x]\n", __func__,
7359                                 proc_selfpid(), proc_best_name(current_proc()),
7360                                 proc_pid(p), proc_best_name(p), level,
7361                                 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7362                                 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7363                                 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7364                                 (void *)&inp->inp_laddr.s_addr :
7365                                 (void *)&inp->in6p_laddr),
7366                                 s, sizeof(s)), ntohs(inp->in6p_lport),
7367                                 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7368                                 (void *)&inp->inp_faddr.s_addr :
7369                                 (void *)&inp->in6p_faddr,
7370                                 d, sizeof(d)), ntohs(inp->in6p_fport),
7371                                 (uint32_t)rcv->sb_sel.si_flags,
7372                                 (uint32_t)snd->sb_sel.si_flags,
7373                                 rcv->sb_flags, snd->sb_flags);
7374                 }
7375         } else if (p != PROC_NULL) {
7376                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7377                     "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7378                     "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7379                     proc_selfpid(), proc_best_name(current_proc()),
7380                     proc_pid(p), proc_best_name(p), level,
7381                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7382                     SOCK_DOM(so), SOCK_TYPE(so),
7383                     (uint32_t)rcv->sb_sel.si_flags,
7384                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7385                     snd->sb_flags);
7386         }
7387
7388         /*
7389          * Unwedge threads blocked on sbwait() and sb_lock().
7390          */
7391         sbwakeup(rcv);
7392         sbwakeup(snd);
7393
7394         so->so_flags1 |= SOF1_DEFUNCTINPROG;
7395         if (rcv->sb_flags & SB_LOCK) {
7396                 sbunlock(rcv, TRUE);    /* keep socket locked */
7397         }
7398         if (snd->sb_flags & SB_LOCK) {
7399                 sbunlock(snd, TRUE);    /* keep socket locked */
7400         }
7401         /*
7402          * Flush the buffers and disconnect.  We explicitly call shutdown
7403          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7404          * states are set for the socket.  This would also flush out data
7405          * hanging off the receive list of this socket.
7406          */
7407         (void) soshutdownlock_final(so, SHUT_RD);
7408         (void) soshutdownlock_final(so, SHUT_WR);
7409         (void) sodisconnectlocked(so);
7410
7411         /*
7412          * Explicitly handle connectionless-protocol disconnection
7413          * and release any remaining data in the socket buffers.
7414          */
7415         if (!(so->so_state & SS_ISDISCONNECTED)) {
7416                 (void) soisdisconnected(so);
7417         }
7418
7419         if (so->so_error == 0) {
7420                 so->so_error = EBADF;
7421         }
7422
7423         if (rcv->sb_cc != 0) {
7424                 rcv->sb_flags &= ~SB_SEL;
7425                 selthreadclear(&rcv->sb_sel);
7426                 sbrelease(rcv);
7427         }
7428         if (snd->sb_cc != 0) {
7429                 snd->sb_flags &= ~SB_SEL;
7430                 selthreadclear(&snd->sb_sel);
7431                 sbrelease(snd);
7432         }
7433         so->so_state |= SS_DEFUNCT;
7434         OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7435
7436 done:
7437         return 0;
7438 }
7439
7440 int
7441 soresume(struct proc *p, struct socket *so, int locked)
7442 {
7443         if (locked == 0) {
7444                 socket_lock(so, 1);
7445         }
7446
7447         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7448                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7449                     "[%d,%d] resumed from bk idle\n",
7450                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7451                     proc_pid(p), proc_best_name(p),
7452                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7453                     SOCK_DOM(so), SOCK_TYPE(so));
7454
7455                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7456                 so->so_extended_bk_start = 0;
7457                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7458
7459                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7460                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7461                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7462         }
7463         if (locked == 0) {
7464                 socket_unlock(so, 1);
7465         }
7466
7467         return 0;
7468 }
7469
7470 /*
7471  * Does not attempt to account for sockets that are delegated from
7472  * the current process
7473  */
7474 int
7475 so_set_extended_bk_idle(struct socket *so, int optval)
7476 {
7477         int error = 0;
7478
7479         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7480             SOCK_PROTO(so) != IPPROTO_TCP) {
7481                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7482                 error = EOPNOTSUPP;
7483         } else if (optval == 0) {
7484                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7485
7486                 soresume(current_proc(), so, 1);
7487         } else {
7488                 struct proc *p = current_proc();
7489                 struct fileproc *fp;
7490                 int count = 0;
7491
7492                 /*
7493                  * Unlock socket to avoid lock ordering issue with
7494                  * the proc fd table lock
7495                  */
7496                 socket_unlock(so, 0);
7497
7498                 proc_fdlock(p);
7499                 fdt_foreach(fp, p) {
7500                         struct socket *so2;
7501
7502                         if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7503                                 continue;
7504                         }
7505
7506                         so2 = (struct socket *)fp->fp_glob->fg_data;
7507                         if (so != so2 &&
7508                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7509                                 count++;
7510                         }
7511                         if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7512                                 break;
7513                         }
7514                 }
7515                 proc_fdunlock(p);
7516
7517                 socket_lock(so, 0);
7518
7519                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7520                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7521                         error = EBUSY;
7522                 } else if (so->so_flags & SOF_DELEGATED) {
7523                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7524                         error = EBUSY;
7525                 } else {
7526                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7527                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7528                 }
7529                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7530                     "%s marked for extended bk idle\n",
7531                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7532                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7533                     SOCK_DOM(so), SOCK_TYPE(so),
7534                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7535                     "is" : "not");
7536         }
7537
7538         return error;
7539 }
7540
7541 static void
7542 so_stop_extended_bk_idle(struct socket *so)
7543 {
7544         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7545         so->so_extended_bk_start = 0;
7546
7547         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7548         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7549         /*
7550          * Force defunct
7551          */
7552         sosetdefunct(current_proc(), so,
7553             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7554         if (so->so_flags & SOF_DEFUNCT) {
7555                 sodefunct(current_proc(), so,
7556                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7557         }
7558 }
7559
7560 void
7561 so_drain_extended_bk_idle(struct socket *so)
7562 {
7563         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7564                 /*
7565                  * Only penalize sockets that have outstanding data
7566                  */
7567                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7568                         so_stop_extended_bk_idle(so);
7569
7570                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7571                 }
7572         }
7573 }
7574
7575 /*
7576  * Return values tells if socket is still in extended background idle
7577  */
7578 int
7579 so_check_extended_bk_idle_time(struct socket *so)
7580 {
7581         int ret = 1;
7582
7583         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7584                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7585                     __func__, proc_selfpid(), proc_best_name(current_proc()),
7586                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7587                     SOCK_DOM(so), SOCK_TYPE(so));
7588                 if (net_uptime() - so->so_extended_bk_start >
7589                     soextbkidlestat.so_xbkidle_time) {
7590                         so_stop_extended_bk_idle(so);
7591
7592                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7593
7594                         ret = 0;
7595                 } else {
7596                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
7597
7598                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7599                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7600                 }
7601         }
7602
7603         return ret;
7604 }
7605
7606 void
7607 resume_proc_sockets(proc_t p)
7608 {
7609         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7610                 struct fileproc *fp;
7611                 struct socket *so;
7612
7613                 proc_fdlock(p);
7614                 fdt_foreach(fp, p) {
7615                         if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7616                                 continue;
7617                         }
7618
7619                         so = (struct socket *)fp->fp_glob->fg_data;
7620                         (void) soresume(p, so, 0);
7621                 }
7622                 proc_fdunlock(p);
7623
7624                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7625         }
7626 }
7627
7628 __private_extern__ int
7629 so_set_recv_anyif(struct socket *so, int optval)
7630 {
7631         int ret = 0;
7632
7633         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7634                 if (optval) {
7635                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7636                 } else {
7637                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7638                 }
7639         }
7640
7641
7642         return ret;
7643 }
7644
7645 __private_extern__ int
7646 so_get_recv_anyif(struct socket *so)
7647 {
7648         int ret = 0;
7649
7650         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7651                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7652         }
7653
7654         return ret;
7655 }
7656
7657 int
7658 so_set_restrictions(struct socket *so, uint32_t vals)
7659 {
7660         int nocell_old, nocell_new;
7661         int noexpensive_old, noexpensive_new;
7662         int noconstrained_old, noconstrained_new;
7663
7664         /*
7665          * Deny-type restrictions are trapdoors; once set they cannot be
7666          * unset for the lifetime of the socket.  This allows them to be
7667          * issued by a framework on behalf of the application without
7668          * having to worry that they can be undone.
7669          *
7670          * Note here that socket-level restrictions overrides any protocol
7671          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7672          * socket restriction issued on the socket has a higher precendence
7673          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7674          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7675          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7676          */
7677         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7678         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7679         noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7680         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7681             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7682             SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7683         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7684         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7685         noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7686
7687         /* we can only set, not clear restrictions */
7688         if ((nocell_new - nocell_old) == 0 &&
7689             (noexpensive_new - noexpensive_old) == 0 &&
7690             (noconstrained_new - noconstrained_old) == 0) {
7691                 return 0;
7692         }
7693         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7694                 if (nocell_new - nocell_old != 0) {
7695                         /*
7696                          * if deny cellular is now set, do what's needed
7697                          * for INPCB
7698                          */
7699                         inp_set_nocellular(sotoinpcb(so));
7700                 }
7701                 if (noexpensive_new - noexpensive_old != 0) {
7702                         inp_set_noexpensive(sotoinpcb(so));
7703                 }
7704                 if (noconstrained_new - noconstrained_old != 0) {
7705                         inp_set_noconstrained(sotoinpcb(so));
7706                 }
7707         }
7708
7709         if (SOCK_DOM(so) == PF_MULTIPATH) {
7710                 mptcp_set_restrictions(so);
7711         }
7712
7713         return 0;
7714 }
7715
7716 uint32_t
7717 so_get_restrictions(struct socket *so)
7718 {
7719         return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7720                SO_RESTRICT_DENY_OUT |
7721                SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7722 }
7723
7724 int
7725 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7726 {
7727         struct proc *ep = PROC_NULL;
7728         int error = 0;
7729
7730         /* pid 0 is reserved for kernel */
7731         if (epid == 0) {
7732                 error = EINVAL;
7733                 goto done;
7734         }
7735
7736         /*
7737          * If this is an in-kernel socket, prevent its delegate
7738          * association from changing unless the socket option is
7739          * coming from within the kernel itself.
7740          */
7741         if (so->last_pid == 0 && p != kernproc) {
7742                 error = EACCES;
7743                 goto done;
7744         }
7745
7746         /*
7747          * If this is issued by a process that's recorded as the
7748          * real owner of the socket, or if the pid is the same as
7749          * the process's own pid, then proceed.  Otherwise ensure
7750          * that the issuing process has the necessary privileges.
7751          */
7752         if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7753                 if ((error = priv_check_cred(kauth_cred_get(),
7754                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7755                         error = EACCES;
7756                         goto done;
7757                 }
7758         }
7759
7760         /* Find the process that corresponds to the effective pid */
7761         if ((ep = proc_find(epid)) == PROC_NULL) {
7762                 error = ESRCH;
7763                 goto done;
7764         }
7765
7766         /*
7767          * If a process tries to delegate the socket to itself, then
7768          * there's really nothing to do; treat it as a way for the
7769          * delegate association to be cleared.  Note that we check
7770          * the passed-in proc rather than calling proc_selfpid(),
7771          * as we need to check the process issuing the socket option
7772          * which could be kernproc.  Given that we don't allow 0 for
7773          * effective pid, it means that a delegated in-kernel socket
7774          * stays delegated during its lifetime (which is probably OK.)
7775          */
7776         if (epid == proc_pid(p)) {
7777                 so->so_flags &= ~SOF_DELEGATED;
7778                 so->e_upid = 0;
7779                 so->e_pid = 0;
7780                 uuid_clear(so->e_uuid);
7781         } else {
7782                 so->so_flags |= SOF_DELEGATED;
7783                 so->e_upid = proc_uniqueid(ep);
7784                 so->e_pid = proc_pid(ep);
7785                 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7786
7787 #if defined(XNU_TARGET_OS_OSX)
7788                 if (ep->p_responsible_pid != so->e_pid) {
7789                         proc_t rp = proc_find(ep->p_responsible_pid);
7790                         if (rp != PROC_NULL) {
7791                                 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7792                                 so->so_rpid = ep->p_responsible_pid;
7793                                 proc_rele(rp);
7794                         } else {
7795                                 uuid_clear(so->so_ruuid);
7796                                 so->so_rpid = -1;
7797                         }
7798                 }
7799 #endif
7800         }
7801         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7802                 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7803         }
7804 done:
7805         if (error == 0 && net_io_policy_log) {
7806                 uuid_string_t buf;
7807
7808                 uuid_unparse(so->e_uuid, buf);
7809                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7810                     "euuid %s%s\n", __func__, proc_name_address(p),
7811                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7812                     SOCK_DOM(so), SOCK_TYPE(so),
7813                     so->e_pid, proc_name_address(ep), buf,
7814                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7815         } else if (error != 0 && net_io_policy_log) {
7816                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7817                     "ERROR (%d)\n", __func__, proc_name_address(p),
7818                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7819                     SOCK_DOM(so), SOCK_TYPE(so),
7820                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7821                     proc_name_address(ep), error);
7822         }
7823
7824         /* Update this socket's policy upon success */
7825         if (error == 0) {
7826                 so->so_policy_gencnt *= -1;
7827                 so_update_policy(so);
7828 #if NECP
7829                 so_update_necp_policy(so, NULL, NULL);
7830 #endif /* NECP */
7831         }
7832
7833         if (ep != PROC_NULL) {
7834                 proc_rele(ep);
7835         }
7836
7837         return error;
7838 }
7839
7840 int
7841 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7842 {
7843         uuid_string_t buf;
7844         uuid_t uuid;
7845         int error = 0;
7846
7847         /* UUID must not be all-zeroes (reserved for kernel) */
7848         if (uuid_is_null(euuid)) {
7849                 error = EINVAL;
7850                 goto done;
7851         }
7852
7853         /*
7854          * If this is an in-kernel socket, prevent its delegate
7855          * association from changing unless the socket option is
7856          * coming from within the kernel itself.
7857          */
7858         if (so->last_pid == 0 && p != kernproc) {
7859                 error = EACCES;
7860                 goto done;
7861         }
7862
7863         /* Get the UUID of the issuing process */
7864         proc_getexecutableuuid(p, uuid, sizeof(uuid));
7865
7866         /*
7867          * If this is issued by a process that's recorded as the
7868          * real owner of the socket, or if the uuid is the same as
7869          * the process's own uuid, then proceed.  Otherwise ensure
7870          * that the issuing process has the necessary privileges.
7871          */
7872         if (check_cred &&
7873             (uuid_compare(euuid, so->last_uuid) != 0 ||
7874             uuid_compare(euuid, uuid) != 0)) {
7875                 if ((error = priv_check_cred(kauth_cred_get(),
7876                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7877                         error = EACCES;
7878                         goto done;
7879                 }
7880         }
7881
7882         /*
7883          * If a process tries to delegate the socket to itself, then
7884          * there's really nothing to do; treat it as a way for the
7885          * delegate association to be cleared.  Note that we check
7886          * the uuid of the passed-in proc rather than that of the
7887          * current process, as we need to check the process issuing
7888          * the socket option which could be kernproc itself.  Given
7889          * that we don't allow 0 for effective uuid, it means that
7890          * a delegated in-kernel socket stays delegated during its
7891          * lifetime (which is okay.)
7892          */
7893         if (uuid_compare(euuid, uuid) == 0) {
7894                 so->so_flags &= ~SOF_DELEGATED;
7895                 so->e_upid = 0;
7896                 so->e_pid = 0;
7897                 uuid_clear(so->e_uuid);
7898         } else {
7899                 so->so_flags |= SOF_DELEGATED;
7900                 /*
7901                  * Unlike so_set_effective_pid(), we only have the UUID
7902                  * here and the process ID is not known.  Inherit the
7903                  * real {pid,upid} of the socket.
7904                  */
7905                 so->e_upid = so->last_upid;
7906                 so->e_pid = so->last_pid;
7907                 uuid_copy(so->e_uuid, euuid);
7908         }
7909         /*
7910          * The following will clear the effective process name as it's the same
7911          * as the real process
7912          */
7913         if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7914                 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7915         }
7916 done:
7917         if (error == 0 && net_io_policy_log) {
7918                 uuid_unparse(so->e_uuid, buf);
7919                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7920                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7921                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7922                     SOCK_TYPE(so), so->e_pid, buf,
7923                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7924         } else if (error != 0 && net_io_policy_log) {
7925                 uuid_unparse(euuid, buf);
7926                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7927                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7928                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7929                     SOCK_TYPE(so), buf, error);
7930         }
7931
7932         /* Update this socket's policy upon success */
7933         if (error == 0) {
7934                 so->so_policy_gencnt *= -1;
7935                 so_update_policy(so);
7936 #if NECP
7937                 so_update_necp_policy(so, NULL, NULL);
7938 #endif /* NECP */
7939         }
7940
7941         return error;
7942 }
7943
7944 void
7945 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7946     uint32_t ev_datalen)
7947 {
7948         struct kev_msg ev_msg;
7949
7950         /*
7951          * A netpolicy event always starts with a netpolicy_event_data
7952          * structure, but the caller can provide for a longer event
7953          * structure to post, depending on the event code.
7954          */
7955         VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
7956
7957         bzero(&ev_msg, sizeof(ev_msg));
7958         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7959         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7960         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7961         ev_msg.event_code       = ev_code;
7962
7963         ev_msg.dv[0].data_ptr   = ev_data;
7964         ev_msg.dv[0].data_length = ev_datalen;
7965
7966         kev_post_msg(&ev_msg);
7967 }
7968
7969 void
7970 socket_post_kev_msg(uint32_t ev_code,
7971     struct kev_socket_event_data *ev_data,
7972     uint32_t ev_datalen)
7973 {
7974         struct kev_msg ev_msg;
7975
7976         bzero(&ev_msg, sizeof(ev_msg));
7977         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7978         ev_msg.kev_class = KEV_NETWORK_CLASS;
7979         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7980         ev_msg.event_code = ev_code;
7981
7982         ev_msg.dv[0].data_ptr = ev_data;
7983         ev_msg.dv[0].data_length = ev_datalen;
7984
7985         kev_post_msg(&ev_msg);
7986 }
7987
7988 void
7989 socket_post_kev_msg_closed(struct socket *so)
7990 {
7991         struct kev_socket_closed ev = {};
7992         struct sockaddr *socksa = NULL, *peersa = NULL;
7993         int err;
7994
7995         if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
7996                 return;
7997         }
7998         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7999         if (err == 0) {
8000                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8001                     &peersa);
8002                 if (err == 0) {
8003                         memcpy(&ev.ev_data.kev_sockname, socksa,
8004                             min(socksa->sa_len,
8005                             sizeof(ev.ev_data.kev_sockname)));
8006                         memcpy(&ev.ev_data.kev_peername, peersa,
8007                             min(peersa->sa_len,
8008                             sizeof(ev.ev_data.kev_peername)));
8009                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
8010                             &ev.ev_data, sizeof(ev));
8011                 }
8012         }
8013         if (socksa != NULL) {
8014                 FREE(socksa, M_SONAME);
8015         }
8016         if (peersa != NULL) {
8017                 FREE(peersa, M_SONAME);
8018         }
8019 }