bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/ntstat.h>
 102 #include <net/content_filter.h>
 103 #include <netinet/in.h>
 104 #include <netinet/in_pcb.h>
 105 #include <netinet/in_tclass.h>
 106 #include <netinet/tcp_var.h>
 107 #include <netinet/ip6.h>
 108 #include <netinet6/ip6_var.h>
 109 #include <netinet/flow_divert.h>
 110 #include <kern/zalloc.h>
 111 #include <kern/locks.h>
 112 #include <machine/limits.h>
 113 #include <libkern/OSAtomic.h>
 114 #include <pexpert/pexpert.h>
 115 #include <kern/assert.h>
 116 #include <kern/task.h>
 117 #include <kern/policy_internal.h>
 118
 119 #include <sys/kpi_mbuf.h>
 120 #include <sys/mcache.h>
 121 #include <sys/unpcb.h>
 122
 123 #if CONFIG_MACF
 124 #include <security/mac.h>
 125 #include <security/mac_framework.h>
 126 #endif /* MAC */
 127
 128 #if MULTIPATH
 129 #include <netinet/mp_pcb.h>
 130 #include <netinet/mptcp_var.h>
 131 #endif /* MULTIPATH */
 132
 133 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
 134
 135 #if DEBUG || DEVELOPMENT
 136 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
 137 #else
 138 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
 139 #endif
 140
 141 /* TODO: this should be in a header file somewhere */
 142 extern char *proc_name_address(void *p);
 143 extern char *proc_best_name(proc_t);
 144
 145 static u_int32_t        so_cache_hw;    /* High water mark for socache */
 146 static u_int32_t        so_cache_timeouts;      /* number of timeouts */
 147 static u_int32_t        so_cache_max_freed;     /* max freed per timeout */
 148 static u_int32_t        cached_sock_count = 0;
 149 STAILQ_HEAD(, socket)   so_cache_head;
 150 int     max_cached_sock_count = MAX_CACHED_SOCKETS;
 151 static u_int32_t        so_cache_time;
 152 static int              socketinit_done;
 153 static struct zone      *so_cache_zone;
 154
 155 static lck_grp_t        *so_cache_mtx_grp;
 156 static lck_attr_t       *so_cache_mtx_attr;
 157 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 158 static lck_mtx_t        *so_cache_mtx;
 159
 160 #include <machine/limits.h>
 161
 162 static int      filt_sorattach(struct knote *kn);
 163 static void     filt_sordetach(struct knote *kn);
 164 static int      filt_soread(struct knote *kn, long hint);
 165 static int      filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
 166 static int      filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 167
 168 static int      filt_sowattach(struct knote *kn);
 169 static void     filt_sowdetach(struct knote *kn);
 170 static int      filt_sowrite(struct knote *kn, long hint);
 171 static int      filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
 172 static int      filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 173
 174 static int      filt_sockattach(struct knote *kn);
 175 static void     filt_sockdetach(struct knote *kn);
 176 static int      filt_sockev(struct knote *kn, long hint);
 177 static int      filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
 178 static int      filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
 179
 180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
 181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
 182
 183 struct filterops soread_filtops = {
 184         .f_isfd = 1,
 185         .f_attach = filt_sorattach,
 186         .f_detach = filt_sordetach,
 187         .f_event = filt_soread,
 188         .f_touch = filt_sortouch,
 189         .f_process = filt_sorprocess,
 190 };
 191
 192 struct filterops sowrite_filtops = {
 193         .f_isfd = 1,
 194         .f_attach = filt_sowattach,
 195         .f_detach = filt_sowdetach,
 196         .f_event = filt_sowrite,
 197         .f_touch = filt_sowtouch,
 198         .f_process = filt_sowprocess,
 199 };
 200
 201 struct filterops sock_filtops = {
 202         .f_isfd = 1,
 203         .f_attach = filt_sockattach,
 204         .f_detach = filt_sockdetach,
 205         .f_event = filt_sockev,
 206         .f_touch = filt_socktouch,
 207         .f_process = filt_sockprocess,
 208 };
 209
 210 struct filterops soexcept_filtops = {
 211         .f_isfd = 1,
 212         .f_attach = filt_sorattach,
 213         .f_detach = filt_sordetach,
 214         .f_event = filt_soread,
 215         .f_touch = filt_sortouch,
 216         .f_process = filt_sorprocess,
 217 };
 218
 219 SYSCTL_DECL(_kern_ipc);
 220
 221 #define EVEN_MORE_LOCKING_DEBUG 0
 222
 223 int socket_debug = 0;
 224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
 225         CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
 226
 227 static unsigned long sodefunct_calls = 0;
 228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
 229     &sodefunct_calls, "");
 230
 231 static int socket_zone = M_SOCKET;
 232 so_gen_t        so_gencnt;      /* generation count for sockets */
 233
 234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 236
 237 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 238 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 239 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 240 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 241 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 242 #define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
 243 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 244 #define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
 245 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 246
 247 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 248
 249 int somaxconn = SOMAXCONN;
 250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
 251         CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 252
 253 /* Should we get a maximum also ??? */
 254 static int sosendmaxchain = 65536;
 255 static int sosendminchain = 16384;
 256 static int sorecvmincopy  = 16384;
 257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
 258         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
 259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
 260         CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
 261
 262 /*
 263  * Set to enable jumbo clusters (if available) for large writes when
 264  * the socket is marked with SOF_MULTIPAGES; see below.
 265  */
 266 int sosendjcl = 1;
 267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
 268         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 269
 270 /*
 271  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 272  * writes on the socket for all protocols on any network interfaces,
 273  * depending upon sosendjcl above.  Be extra careful when setting this
 274  * to 1, because sending down packets that cross physical pages down to
 275  * broken drivers (those that falsely assume that the physical pages
 276  * are contiguous) might lead to system panics or silent data corruption.
 277  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 278  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 279  * capable.  Set this to 1 only for testing/debugging purposes.
 280  */
 281 int sosendjcl_ignore_capab = 0;
 282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
 283         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
 284
 285 /*
 286  * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 287  * writes on the socket for all protocols on any network interfaces.
 288  * Be extra careful when setting this to 1, because sending down packets with
 289  * clusters larger that 2 KB might lead to system panics or data corruption.
 290  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 291  * on the outgoing interface
 292  * Set this to 1  for testing/debugging purposes only.
 293  */
 294 int sosendbigcl_ignore_capab = 0;
 295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
 296         CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
 297
 298 int sodefunctlog = 0;
 299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
 300         &sodefunctlog, 0, "");
 301
 302 int sothrottlelog = 0;
 303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
 304         &sothrottlelog, 0, "");
 305
 306 int sorestrictrecv = 1;
 307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
 308         &sorestrictrecv, 0, "Enable inbound interface restrictions");
 309
 310 int sorestrictsend = 1;
 311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
 312         &sorestrictsend, 0, "Enable outbound interface restrictions");
 313
 314 int soreserveheadroom = 1;
 315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
 316         &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
 317
 318 #if (DEBUG || DEVELOPMENT)
 319 int so_notsent_lowat_check = 1;
 320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
 321     &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
 322 #endif /* DEBUG || DEVELOPMENT */
 323
 324 extern struct inpcbinfo tcbinfo;
 325
 326 /* TODO: these should be in header file */
 327 extern int get_inpcb_str_size(void);
 328 extern int get_tcp_str_size(void);
 329
 330 static unsigned int sl_zone_size;               /* size of sockaddr_list */
 331 static struct zone *sl_zone;                    /* zone for sockaddr_list */
 332
 333 static unsigned int se_zone_size;               /* size of sockaddr_entry */
 334 static struct zone *se_zone;                    /* zone for sockaddr_entry */
 335
 336 vm_size_t       so_cache_zone_element_size;
 337
 338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
 339     user_ssize_t *);
 340 static void cached_sock_alloc(struct socket **, int);
 341 static void cached_sock_free(struct socket *);
 342
 343 /*
 344  * Maximum of extended background idle sockets per process
 345  * Set to zero to disable further setting of the option
 346  */
 347
 348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
 349 #define SO_IDLE_BK_IDLE_TIME            600
 350 #define SO_IDLE_BK_IDLE_RCV_HIWAT       131072
 351
 352 struct soextbkidlestat soextbkidlestat;
 353
 354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
 355         CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
 356         "Maximum of extended background idle sockets per process");
 357
 358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
 359         &soextbkidlestat.so_xbkidle_time, 0,
 360         "Time in seconds to keep extended background idle sockets");
 361
 362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
 363         &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
 364         "High water mark for extended background idle sockets");
 365
 366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
 367         &soextbkidlestat, soextbkidlestat, "");
 368
 369 int so_set_extended_bk_idle(struct socket *, int);
 370
 371 /*
 372  * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 373  * setting the DSCP code on the packet based on the service class; see
 374  * <rdar://problem/11277343> for details.
 375  */
 376 __private_extern__ u_int32_t sotcdb = 0;
 377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
 378         &sotcdb, 0, "");
 379
 380 void
 381 socketinit(void)
 382 {
 383         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 384         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 385
 386 #ifdef __LP64__
 387         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
 388         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
 389         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
 390         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
 391         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
 392         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
 393 #else
 394         _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
 395         _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
 396         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
 397         _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
 398         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
 399         _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
 400 #endif
 401
 402         if (socketinit_done) {
 403                 printf("socketinit: already called...\n");
 404                 return;
 405         }
 406         socketinit_done = 1;
 407
 408         PE_parse_boot_argn("socket_debug", &socket_debug,
 409             sizeof (socket_debug));
 410
 411         /*
 412          * allocate lock group attribute and group for socket cache mutex
 413          */
 414         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 415         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 416             so_cache_mtx_grp_attr);
 417
 418         /*
 419          * allocate the lock attribute for socket cache mutex
 420          */
 421         so_cache_mtx_attr = lck_attr_alloc_init();
 422
 423         /* cached sockets mutex */
 424         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 425         if (so_cache_mtx == NULL) {
 426                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 427                 /* NOTREACHED */
 428         }
 429         STAILQ_INIT(&so_cache_head);
 430
 431         so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
 432             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 433
 434         so_cache_zone = zinit(so_cache_zone_element_size,
 435             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 436         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 437         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 438
 439         sl_zone_size = sizeof (struct sockaddr_list);
 440         if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
 441             "sockaddr_list")) == NULL) {
 442                 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
 443                 /* NOTREACHED */
 444         }
 445         zone_change(sl_zone, Z_CALLERACCT, FALSE);
 446         zone_change(sl_zone, Z_EXPAND, TRUE);
 447
 448         se_zone_size = sizeof (struct sockaddr_entry);
 449         if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
 450             "sockaddr_entry")) == NULL) {
 451                 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
 452                 /* NOTREACHED */
 453         }
 454         zone_change(se_zone, Z_CALLERACCT, FALSE);
 455         zone_change(se_zone, Z_EXPAND, TRUE);
 456
 457         bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
 458         soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
 459         soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
 460         soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
 461
 462         in_pcbinit();
 463         sflt_init();
 464         socket_tclass_init();
 465 #if MULTIPATH
 466         mp_pcbinit();
 467 #endif /* MULTIPATH */
 468 }
 469
 470 static void
 471 cached_sock_alloc(struct socket **so, int waitok)
 472 {
 473         caddr_t temp;
 474         uintptr_t offset;
 475
 476         lck_mtx_lock(so_cache_mtx);
 477
 478         if (!STAILQ_EMPTY(&so_cache_head)) {
 479                 VERIFY(cached_sock_count > 0);
 480
 481                 *so = STAILQ_FIRST(&so_cache_head);
 482                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 483                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 484
 485                 cached_sock_count--;
 486                 lck_mtx_unlock(so_cache_mtx);
 487
 488                 temp = (*so)->so_saved_pcb;
 489                 bzero((caddr_t)*so, sizeof (struct socket));
 490
 491                 (*so)->so_saved_pcb = temp;
 492         } else {
 493
 494                 lck_mtx_unlock(so_cache_mtx);
 495
 496                 if (waitok)
 497                         *so = (struct socket *)zalloc(so_cache_zone);
 498                 else
 499                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 500
 501                 if (*so == NULL)
 502                         return;
 503
 504                 bzero((caddr_t)*so, sizeof (struct socket));
 505
 506                 /*
 507                  * Define offsets for extra structures into our
 508                  * single block of memory. Align extra structures
 509                  * on longword boundaries.
 510                  */
 511
 512                 offset = (uintptr_t)*so;
 513                 offset += sizeof (struct socket);
 514
 515                 offset = ALIGN(offset);
 516
 517                 (*so)->so_saved_pcb = (caddr_t)offset;
 518                 offset += get_inpcb_str_size();
 519
 520                 offset = ALIGN(offset);
 521
 522                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 523                     (caddr_t)offset;
 524         }
 525
 526         OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
 527 }
 528
 529 static void
 530 cached_sock_free(struct socket *so)
 531 {
 532
 533         lck_mtx_lock(so_cache_mtx);
 534
 535         so_cache_time = net_uptime();
 536         if (++cached_sock_count > max_cached_sock_count) {
 537                 --cached_sock_count;
 538                 lck_mtx_unlock(so_cache_mtx);
 539                 zfree(so_cache_zone, so);
 540         } else {
 541                 if (so_cache_hw < cached_sock_count)
 542                         so_cache_hw = cached_sock_count;
 543
 544                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 545
 546                 so->cache_timestamp = so_cache_time;
 547                 lck_mtx_unlock(so_cache_mtx);
 548         }
 549 }
 550
 551 void
 552 so_update_last_owner_locked(struct socket *so, proc_t self)
 553 {
 554         if (so->last_pid != 0) {
 555                 /*
 556                  * last_pid and last_upid should remain zero for sockets
 557                  * created using sock_socket. The check above achieves that
 558                  */
 559                 if (self == PROC_NULL)
 560                         self = current_proc();
 561
 562                 if (so->last_upid != proc_uniqueid(self) ||
 563                     so->last_pid != proc_pid(self)) {
 564                         so->last_upid = proc_uniqueid(self);
 565                         so->last_pid = proc_pid(self);
 566                         proc_getexecutableuuid(self, so->last_uuid,
 567                             sizeof (so->last_uuid));
 568                 }
 569                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 570         }
 571 }
 572
 573 void
 574 so_update_policy(struct socket *so)
 575 {
 576         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 577                 (void) inp_update_policy(sotoinpcb(so));
 578 }
 579
 580 #if NECP
 581 static void
 582 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
 583     struct sockaddr *override_remote_addr)
 584 {
 585         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 586                 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
 587                     override_remote_addr, 0);
 588 }
 589 #endif /* NECP */
 590
 591 boolean_t
 592 so_cache_timer(void)
 593 {
 594         struct socket   *p;
 595         int             n_freed = 0;
 596         boolean_t rc = FALSE;
 597
 598         lck_mtx_lock(so_cache_mtx);
 599         so_cache_timeouts++;
 600         so_cache_time = net_uptime();
 601
 602         while (!STAILQ_EMPTY(&so_cache_head)) {
 603                 VERIFY(cached_sock_count > 0);
 604                 p = STAILQ_FIRST(&so_cache_head);
 605                 if ((so_cache_time - p->cache_timestamp) <
 606                         SO_CACHE_TIME_LIMIT)
 607                         break;
 608
 609                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 610                 --cached_sock_count;
 611
 612                 zfree(so_cache_zone, p);
 613
 614                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 615                         so_cache_max_freed++;
 616                         break;
 617                 }
 618         }
 619
 620         /* Schedule again if there is more to cleanup */
 621         if (!STAILQ_EMPTY(&so_cache_head))
 622                 rc = TRUE;
 623
 624         lck_mtx_unlock(so_cache_mtx);
 625         return (rc);
 626 }
 627
 628 /*
 629  * Get a socket structure from our zone, and initialize it.
 630  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 631  * Note that it would probably be better to allocate socket
 632  * and PCB at the same time, but I'm not convinced that all
 633  * the protocols can be easily modified to do this.
 634  */
 635 struct socket *
 636 soalloc(int waitok, int dom, int type)
 637 {
 638         struct socket *so;
 639
 640         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 641                 cached_sock_alloc(&so, waitok);
 642         } else {
 643                 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
 644                     M_WAITOK);
 645                 if (so != NULL)
 646                         bzero(so, sizeof (*so));
 647         }
 648         if (so != NULL) {
 649                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 650                 so->so_zone = socket_zone;
 651 #if CONFIG_MACF_SOCKET
 652                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 653                 if (mac_socket_label_init(so, !waitok) != 0) {
 654                         sodealloc(so);
 655                         return (NULL);
 656                 }
 657 #endif /* MAC_SOCKET */
 658         }
 659
 660         return (so);
 661 }
 662
 663 int
 664 socreate_internal(int dom, struct socket **aso, int type, int proto,
 665     struct proc *p, uint32_t flags, struct proc *ep)
 666 {
 667         struct protosw *prp;
 668         struct socket *so;
 669         int error = 0;
 670
 671 #if TCPDEBUG
 672         extern int tcpconsdebug;
 673 #endif
 674
 675         VERIFY(aso != NULL);
 676         *aso = NULL;
 677
 678         if (proto != 0)
 679                 prp = pffindproto(dom, proto, type);
 680         else
 681                 prp = pffindtype(dom, type);
 682
 683         if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
 684                 if (pffinddomain(dom) == NULL)
 685                         return (EAFNOSUPPORT);
 686                 if (proto != 0) {
 687                         if (pffindprotonotype(dom, proto) != NULL)
 688                                 return (EPROTOTYPE);
 689                 }
 690                 return (EPROTONOSUPPORT);
 691         }
 692         if (prp->pr_type != type)
 693                 return (EPROTOTYPE);
 694         so = soalloc(1, dom, type);
 695         if (so == NULL)
 696                 return (ENOBUFS);
 697
 698         if (flags & SOCF_ASYNC)
 699                 so->so_state |= SS_NBIO;
 700 #if MULTIPATH
 701         if (flags & SOCF_MP_SUBFLOW) {
 702                 /*
 703                  * A multipath subflow socket is used internally in the kernel,
 704                  * therefore it does not have a file desciptor associated by
 705                  * default.
 706                  */
 707                 so->so_state |= SS_NOFDREF;
 708                 so->so_flags |= SOF_MP_SUBFLOW;
 709         }
 710 #endif /* MULTIPATH */
 711
 712         TAILQ_INIT(&so->so_incomp);
 713         TAILQ_INIT(&so->so_comp);
 714         so->so_type = type;
 715         so->last_upid = proc_uniqueid(p);
 716         so->last_pid = proc_pid(p);
 717         proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
 718         proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 719
 720         if (ep != PROC_NULL && ep != p) {
 721                 so->e_upid = proc_uniqueid(ep);
 722                 so->e_pid = proc_pid(ep);
 723                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
 724                 so->so_flags |= SOF_DELEGATED;
 725         }
 726
 727         so->so_cred = kauth_cred_proc_ref(p);
 728         if (!suser(kauth_cred_get(), NULL))
 729                 so->so_state |= SS_PRIV;
 730
 731         so->so_proto = prp;
 732         so->so_rcv.sb_flags |= SB_RECV;
 733         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 734         so->next_lock_lr = 0;
 735         so->next_unlock_lr = 0;
 736
 737 #if CONFIG_MACF_SOCKET
 738         mac_socket_label_associate(kauth_cred_get(), so);
 739 #endif /* MAC_SOCKET */
 740
 741         /*
 742          * Attachment will create the per pcb lock if necessary and
 743          * increase refcount for creation, make sure it's done before
 744          * socket is inserted in lists.
 745          */
 746         so->so_usecount++;
 747
 748         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 749         if (error != 0) {
 750                 /*
 751                  * Warning:
 752                  * If so_pcb is not zero, the socket will be leaked,
 753                  * so protocol attachment handler must be coded carefuly
 754                  */
 755                 so->so_state |= SS_NOFDREF;
 756                 so->so_usecount--;
 757                 sofreelastref(so, 1);   /* will deallocate the socket */
 758                 return (error);
 759         }
 760
 761         atomic_add_32(&prp->pr_domain->dom_refs, 1);
 762         TAILQ_INIT(&so->so_evlist);
 763
 764         /* Attach socket filters for this protocol */
 765         sflt_initsock(so);
 766 #if TCPDEBUG
 767         if (tcpconsdebug == 2)
 768                 so->so_options |= SO_DEBUG;
 769 #endif
 770         so_set_default_traffic_class(so);
 771
 772         /*
 773          * If this thread or task is marked to create backgrounded sockets,
 774          * mark the socket as background.
 775          */
 776         if (proc_get_effective_thread_policy(current_thread(),
 777             TASK_POLICY_NEW_SOCKETS_BG)) {
 778                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 779                 so->so_background_thread = current_thread();
 780         }
 781
 782         switch (dom) {
 783         /*
 784          * Don't mark Unix domain, system or multipath sockets as
 785          * eligible for defunct by default.
 786          */
 787         case PF_LOCAL:
 788         case PF_SYSTEM:
 789         case PF_MULTIPATH:
 790                 so->so_flags |= SOF_NODEFUNCT;
 791                 break;
 792         default:
 793                 break;
 794         }
 795
 796         /*
 797          * Entitlements can't be checked at socket creation time except if the
 798          * application requested a feature guarded by a privilege (c.f., socket
 799          * delegation).
 800          * The priv(9) and the Sandboxing APIs are designed with the idea that
 801          * a privilege check should only be triggered by a userland request.
 802          * A privilege check at socket creation time is time consuming and
 803          * could trigger many authorisation error messages from the security
 804          * APIs.
 805          */
 806
 807         *aso = so;
 808
 809         return (0);
 810 }
 811
 812 /*
 813  * Returns:     0                       Success
 814  *              EAFNOSUPPORT
 815  *              EPROTOTYPE
 816  *              EPROTONOSUPPORT
 817  *              ENOBUFS
 818  *      <pru_attach>:ENOBUFS[AF_UNIX]
 819  *      <pru_attach>:ENOBUFS[TCP]
 820  *      <pru_attach>:ENOMEM[TCP]
 821  *      <pru_attach>:???                [other protocol families, IPSEC]
 822  */
 823 int
 824 socreate(int dom, struct socket **aso, int type, int proto)
 825 {
 826         return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
 827             PROC_NULL));
 828 }
 829
 830 int
 831 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 832 {
 833         int error = 0;
 834         struct proc *ep = PROC_NULL;
 835
 836         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 837                 error = ESRCH;
 838                 goto done;
 839         }
 840
 841         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 842
 843         /*
 844          * It might not be wise to hold the proc reference when calling
 845          * socreate_internal since it calls soalloc with M_WAITOK
 846          */
 847 done:
 848         if (ep != PROC_NULL)
 849                 proc_rele(ep);
 850
 851         return (error);
 852 }
 853
 854 /*
 855  * Returns:     0                       Success
 856  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 857  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 858  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 859  *      <pru_bind>:EINVAL               Invalid argument
 860  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 861  *      <pru_bind>:EACCES               Permission denied
 862  *      <pru_bind>:EADDRINUSE           Address in use
 863  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 864  *      <pru_bind>:EPERM                Operation not permitted
 865  *      <pru_bind>:???
 866  *      <sf_bind>:???
 867  *
 868  * Notes:       It's not possible to fully enumerate the return codes above,
 869  *              since socket filter authors and protocol family authors may
 870  *              not choose to limit their error returns to those listed, even
 871  *              though this may result in some software operating incorrectly.
 872  *
 873  *              The error codes which are enumerated above are those known to
 874  *              be returned by the tcp_usr_bind function supplied.
 875  */
 876 int
 877 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 878 {
 879         struct proc *p = current_proc();
 880         int error = 0;
 881
 882         if (dolock)
 883                 socket_lock(so, 1);
 884         VERIFY(so->so_usecount > 1);
 885
 886         so_update_last_owner_locked(so, p);
 887         so_update_policy(so);
 888
 889 #if NECP
 890         so_update_necp_policy(so, nam, NULL);
 891 #endif /* NECP */
 892
 893         /*
 894          * If this is a bind request on a socket that has been marked
 895          * as inactive, reject it now before we go any further.
 896          */
 897         if (so->so_flags & SOF_DEFUNCT) {
 898                 error = EINVAL;
 899                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
 900                     __func__, proc_pid(p), proc_best_name(p),
 901                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
 902                     SOCK_DOM(so), SOCK_TYPE(so), error);
 903                 goto out;
 904         }
 905
 906         /* Socket filter */
 907         error = sflt_bind(so, nam);
 908
 909         if (error == 0)
 910                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 911 out:
 912         if (dolock)
 913                 socket_unlock(so, 1);
 914
 915         if (error == EJUSTRETURN)
 916                 error = 0;
 917
 918         return (error);
 919 }
 920
 921 void
 922 sodealloc(struct socket *so)
 923 {
 924         kauth_cred_unref(&so->so_cred);
 925
 926         /* Remove any filters */
 927         sflt_termsock(so);
 928
 929 #if CONTENT_FILTER
 930         cfil_sock_detach(so);
 931 #endif /* CONTENT_FILTER */
 932
 933         /* Delete the state allocated for msg queues on a socket */
 934         if (so->so_flags & SOF_ENABLE_MSGS) {
 935                 FREE(so->so_msg_state, M_TEMP);
 936                 so->so_msg_state = NULL;
 937         }
 938         VERIFY(so->so_msg_state == NULL);
 939
 940         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 941
 942 #if CONFIG_MACF_SOCKET
 943         mac_socket_label_destroy(so);
 944 #endif /* MAC_SOCKET */
 945
 946         if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
 947                 cached_sock_free(so);
 948         } else {
 949                 FREE_ZONE(so, sizeof (*so), so->so_zone);
 950         }
 951 }
 952
 953 /*
 954  * Returns:     0                       Success
 955  *              EINVAL
 956  *              EOPNOTSUPP
 957  *      <pru_listen>:EINVAL[AF_UNIX]
 958  *      <pru_listen>:EINVAL[TCP]
 959  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 960  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 961  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 962  *      <pru_listen>:EACCES[TCP]        Permission denied
 963  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 964  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 965  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 966  *      <sf_listen>:???
 967  *
 968  * Notes:       Other <pru_listen> returns depend on the protocol family; all
 969  *              <sf_listen> returns depend on what the filter author causes
 970  *              their filter to return.
 971  */
 972 int
 973 solisten(struct socket *so, int backlog)
 974 {
 975         struct proc *p = current_proc();
 976         int error = 0;
 977
 978         socket_lock(so, 1);
 979
 980         so_update_last_owner_locked(so, p);
 981         so_update_policy(so);
 982
 983 #if NECP
 984         so_update_necp_policy(so, NULL, NULL);
 985 #endif /* NECP */
 986
 987         if (so->so_proto == NULL) {
 988                 error = EINVAL;
 989                 goto out;
 990         }
 991         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 992                 error = EOPNOTSUPP;
 993                 goto out;
 994         }
 995
 996         /*
 997          * If the listen request is made on a socket that is not fully
 998          * disconnected, or on a socket that has been marked as inactive,
 999          * reject the request now.
1000          */
1001         if ((so->so_state &
1002             (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1003             (so->so_flags & SOF_DEFUNCT)) {
1004                 error = EINVAL;
1005                 if (so->so_flags & SOF_DEFUNCT) {
1006                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1007                             "(%d)\n", __func__, proc_pid(p),
1008                             proc_best_name(p),
1009                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1010                             SOCK_DOM(so), SOCK_TYPE(so), error);
1011                 }
1012                 goto out;
1013         }
1014
1015         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1016                 error = EPERM;
1017                 goto out;
1018         }
1019
1020         error = sflt_listen(so);
1021         if (error == 0)
1022                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1023
1024         if (error) {
1025                 if (error == EJUSTRETURN)
1026                         error = 0;
1027                 goto out;
1028         }
1029
1030         if (TAILQ_EMPTY(&so->so_comp))
1031                 so->so_options |= SO_ACCEPTCONN;
1032         /*
1033          * POSIX: The implementation may have an upper limit on the length of
1034          * the listen queue-either global or per accepting socket. If backlog
1035          * exceeds this limit, the length of the listen queue is set to the
1036          * limit.
1037          *
1038          * If listen() is called with a backlog argument value that is less
1039          * than 0, the function behaves as if it had been called with a backlog
1040          * argument value of 0.
1041          *
1042          * A backlog argument of 0 may allow the socket to accept connections,
1043          * in which case the length of the listen queue may be set to an
1044          * implementation-defined minimum value.
1045          */
1046         if (backlog <= 0 || backlog > somaxconn)
1047                 backlog = somaxconn;
1048
1049         so->so_qlimit = backlog;
1050 out:
1051         socket_unlock(so, 1);
1052         return (error);
1053 }
1054
1055 void
1056 sofreelastref(struct socket *so, int dealloc)
1057 {
1058         struct socket *head = so->so_head;
1059
1060         /* Assume socket is locked */
1061
1062         if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1063                 selthreadclear(&so->so_snd.sb_sel);
1064                 selthreadclear(&so->so_rcv.sb_sel);
1065                 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1066                 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1067                 so->so_event = sonullevent;
1068                 return;
1069         }
1070         if (head != NULL) {
1071                 socket_lock(head, 1);
1072                 if (so->so_state & SS_INCOMP) {
1073                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
1074                         head->so_incqlen--;
1075                 } else if (so->so_state & SS_COMP) {
1076                         /*
1077                          * We must not decommission a socket that's
1078                          * on the accept(2) queue.  If we do, then
1079                          * accept(2) may hang after select(2) indicated
1080                          * that the listening socket was ready.
1081                          */
1082                         selthreadclear(&so->so_snd.sb_sel);
1083                         selthreadclear(&so->so_rcv.sb_sel);
1084                         so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1085                         so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1086                         so->so_event = sonullevent;
1087                         socket_unlock(head, 1);
1088                         return;
1089                 } else {
1090                         panic("sofree: not queued");
1091                 }
1092                 head->so_qlen--;
1093                 so->so_state &= ~SS_INCOMP;
1094                 so->so_head = NULL;
1095                 socket_unlock(head, 1);
1096         }
1097         sowflush(so);
1098         sorflush(so);
1099
1100 #if FLOW_DIVERT
1101         if (so->so_flags & SOF_FLOW_DIVERT) {
1102                 flow_divert_detach(so);
1103         }
1104 #endif  /* FLOW_DIVERT */
1105
1106         /* 3932268: disable upcall */
1107         so->so_rcv.sb_flags &= ~SB_UPCALL;
1108         so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1109         so->so_event = sonullevent;
1110
1111         if (dealloc)
1112                 sodealloc(so);
1113 }
1114
1115 void
1116 soclose_wait_locked(struct socket *so)
1117 {
1118         lck_mtx_t *mutex_held;
1119
1120         if (so->so_proto->pr_getlock != NULL)
1121                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1122         else
1123                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1124         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1125
1126         /*
1127          * Double check here and return if there's no outstanding upcall;
1128          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1129          */
1130         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1131                 return;
1132         so->so_rcv.sb_flags &= ~SB_UPCALL;
1133         so->so_snd.sb_flags &= ~SB_UPCALL;
1134         so->so_flags |= SOF_CLOSEWAIT;
1135         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1136             "soclose_wait_locked", NULL);
1137         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1138         so->so_flags &= ~SOF_CLOSEWAIT;
1139 }
1140
1141 /*
1142  * Close a socket on last file table reference removal.
1143  * Initiate disconnect if connected.
1144  * Free socket when disconnect complete.
1145  */
1146 int
1147 soclose_locked(struct socket *so)
1148 {
1149         int error = 0;
1150         lck_mtx_t *mutex_held;
1151         struct timespec ts;
1152
1153         if (so->so_usecount == 0) {
1154                 panic("soclose: so=%p refcount=0\n", so);
1155                 /* NOTREACHED */
1156         }
1157
1158         sflt_notify(so, sock_evt_closing, NULL);
1159
1160         if (so->so_upcallusecount)
1161                 soclose_wait_locked(so);
1162
1163 #if CONTENT_FILTER
1164         /*
1165          * We have to wait until the content filters are done
1166          */
1167         if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1168                 cfil_sock_close_wait(so);
1169                 cfil_sock_is_closed(so);
1170                 cfil_sock_detach(so);
1171         }
1172 #endif /* CONTENT_FILTER */
1173
1174         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1175                 soresume(current_proc(), so, 1);
1176                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1177         }
1178
1179         if ((so->so_options & SO_ACCEPTCONN)) {
1180                 struct socket *sp, *sonext;
1181                 int socklock = 0;
1182
1183                 /*
1184                  * We do not want new connection to be added
1185                  * to the connection queues
1186                  */
1187                 so->so_options &= ~SO_ACCEPTCONN;
1188
1189                 for (sp = TAILQ_FIRST(&so->so_incomp);
1190                     sp != NULL; sp = sonext) {
1191                         sonext = TAILQ_NEXT(sp, so_list);
1192
1193                         /*
1194                          * Radar 5350314
1195                          * skip sockets thrown away by tcpdropdropblreq
1196                          * they will get cleanup by the garbage collection.
1197                          * otherwise, remove the incomp socket from the queue
1198                          * and let soabort trigger the appropriate cleanup.
1199                          */
1200                         if (sp->so_flags & SOF_OVERFLOW)
1201                                 continue;
1202
1203                         if (so->so_proto->pr_getlock != NULL) {
1204                                 /*
1205                                  * Lock ordering for consistency with the
1206                                  * rest of the stack, we lock the socket
1207                                  * first and then grabb the head.
1208                                  */
1209                                 socket_unlock(so, 0);
1210                                 socket_lock(sp, 1);
1211                                 socket_lock(so, 0);
1212                                 socklock = 1;
1213                         }
1214
1215                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1216                         so->so_incqlen--;
1217
1218                         if (sp->so_state & SS_INCOMP) {
1219                                 sp->so_state &= ~SS_INCOMP;
1220                                 sp->so_head = NULL;
1221
1222                                 (void) soabort(sp);
1223                         }
1224
1225                         if (socklock)
1226                                 socket_unlock(sp, 1);
1227                 }
1228
1229                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1230                         /* Dequeue from so_comp since sofree() won't do it */
1231                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
1232                         so->so_qlen--;
1233
1234                         if (so->so_proto->pr_getlock != NULL) {
1235                                 socket_unlock(so, 0);
1236                                 socket_lock(sp, 1);
1237                         }
1238
1239                         if (sp->so_state & SS_COMP) {
1240                                 sp->so_state &= ~SS_COMP;
1241                                 sp->so_head = NULL;
1242
1243                                 (void) soabort(sp);
1244                         }
1245
1246                         if (so->so_proto->pr_getlock != NULL) {
1247                                 socket_unlock(sp, 1);
1248                                 socket_lock(so, 0);
1249                         }
1250                 }
1251         }
1252         if (so->so_pcb == NULL) {
1253                 /* 3915887: mark the socket as ready for dealloc */
1254                 so->so_flags |= SOF_PCBCLEARING;
1255                 goto discard;
1256         }
1257         if (so->so_state & SS_ISCONNECTED) {
1258                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1259                         error = sodisconnectlocked(so);
1260                         if (error)
1261                                 goto drop;
1262                 }
1263                 if (so->so_options & SO_LINGER) {
1264                         if ((so->so_state & SS_ISDISCONNECTING) &&
1265                             (so->so_state & SS_NBIO))
1266                                 goto drop;
1267                         if (so->so_proto->pr_getlock != NULL)
1268                                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1269                         else
1270                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1271                         while (so->so_state & SS_ISCONNECTED) {
1272                                 ts.tv_sec = (so->so_linger/100);
1273                                 ts.tv_nsec = (so->so_linger % 100) *
1274                                     NSEC_PER_USEC * 1000 * 10;
1275                                 error = msleep((caddr_t)&so->so_timeo,
1276                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
1277                                 if (error) {
1278                                         /*
1279                                          * It's OK when the time fires,
1280                                          * don't report an error
1281                                          */
1282                                         if (error == EWOULDBLOCK)
1283                                                 error = 0;
1284                                         break;
1285                                 }
1286                         }
1287                 }
1288         }
1289 drop:
1290         if (so->so_usecount == 0) {
1291                 panic("soclose: usecount is zero so=%p\n", so);
1292                 /* NOTREACHED */
1293         }
1294         if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1295                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1296                 if (error == 0)
1297                         error = error2;
1298         }
1299         if (so->so_usecount <= 0) {
1300                 panic("soclose: usecount is zero so=%p\n", so);
1301                 /* NOTREACHED */
1302         }
1303 discard:
1304         if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1305             (so->so_state & SS_NOFDREF)) {
1306                 panic("soclose: NOFDREF");
1307                 /* NOTREACHED */
1308         }
1309         so->so_state |= SS_NOFDREF;
1310
1311         if (so->so_flags & SOF_MP_SUBFLOW)
1312                 so->so_flags &= ~SOF_MP_SUBFLOW;
1313
1314         if ((so->so_flags & SOF_KNOTE) != 0)
1315                 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1316
1317         atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1318         evsofree(so);
1319
1320         so->so_usecount--;
1321         sofree(so);
1322         return (error);
1323 }
1324
1325 int
1326 soclose(struct socket *so)
1327 {
1328         int error = 0;
1329         socket_lock(so, 1);
1330
1331         if (so->so_retaincnt == 0) {
1332                 error = soclose_locked(so);
1333         } else {
1334                 /*
1335                  * if the FD is going away, but socket is
1336                  * retained in kernel remove its reference
1337                  */
1338                 so->so_usecount--;
1339                 if (so->so_usecount < 2)
1340                         panic("soclose: retaincnt non null and so=%p "
1341                             "usecount=%d\n", so, so->so_usecount);
1342         }
1343         socket_unlock(so, 1);
1344         return (error);
1345 }
1346
1347 /*
1348  * Must be called at splnet...
1349  */
1350 /* Should already be locked */
1351 int
1352 soabort(struct socket *so)
1353 {
1354         int error;
1355
1356 #ifdef MORE_LOCKING_DEBUG
1357         lck_mtx_t *mutex_held;
1358
1359         if (so->so_proto->pr_getlock != NULL)
1360                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1361         else
1362                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1363         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1364 #endif
1365
1366         if ((so->so_flags & SOF_ABORTED) == 0) {
1367                 so->so_flags |= SOF_ABORTED;
1368                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1369                 if (error) {
1370                         sofree(so);
1371                         return (error);
1372                 }
1373         }
1374         return (0);
1375 }
1376
1377 int
1378 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1379 {
1380         int error;
1381
1382         if (dolock)
1383                 socket_lock(so, 1);
1384
1385         so_update_last_owner_locked(so, PROC_NULL);
1386         so_update_policy(so);
1387 #if NECP
1388         so_update_necp_policy(so, NULL, NULL);
1389 #endif /* NECP */
1390
1391         if ((so->so_state & SS_NOFDREF) == 0)
1392                 panic("soaccept: !NOFDREF");
1393         so->so_state &= ~SS_NOFDREF;
1394         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1395
1396         if (dolock)
1397                 socket_unlock(so, 1);
1398         return (error);
1399 }
1400
/*
 * Locking variant of soacceptlock(): the socket lock is taken and
 * released on the caller's behalf (dolock == 1).
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
        int error;

        error = soacceptlock(so, nam, 1);

        return (error);
}
1406
1407 int
1408 soacceptfilter(struct socket *so)
1409 {
1410         struct sockaddr *local = NULL, *remote = NULL;
1411         int error = 0;
1412         struct socket *head = so->so_head;
1413
1414         /*
1415          * Hold the lock even if this socket has not been made visible
1416          * to the filter(s).  For sockets with global locks, this protects
1417          * against the head or peer going away
1418          */
1419         socket_lock(so, 1);
1420         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1421             sogetaddr_locked(so, &local, 0) != 0) {
1422                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1423                 so->so_head = NULL;
1424                 socket_unlock(so, 1);
1425                 soclose(so);
1426                 /* Out of resources; try it again next time */
1427                 error = ECONNABORTED;
1428                 goto done;
1429         }
1430
1431         error = sflt_accept(head, so, local, remote);
1432
1433         /*
1434          * If we get EJUSTRETURN from one of the filters, mark this socket
1435          * as inactive and return it anyway.  This newly accepted socket
1436          * will be disconnected later before we hand it off to the caller.
1437          */
1438         if (error == EJUSTRETURN) {
1439                 error = 0;
1440                 (void) sosetdefunct(current_proc(), so,
1441                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1442         }
1443
1444         if (error != 0) {
1445                 /*
1446                  * This may seem like a duplication to the above error
1447                  * handling part when we return ECONNABORTED, except
1448                  * the following is done while holding the lock since
1449                  * the socket has been exposed to the filter(s) earlier.
1450                  */
1451                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1452                 so->so_head = NULL;
1453                 socket_unlock(so, 1);
1454                 soclose(so);
1455                 /* Propagate socket filter's error code to the caller */
1456         } else {
1457                 socket_unlock(so, 1);
1458         }
1459 done:
1460         /* Callee checks for NULL pointer */
1461         sock_freeaddr(remote);
1462         sock_freeaddr(local);
1463         return (error);
1464 }
1465
1466 /*
1467  * Returns:     0                       Success
1468  *              EOPNOTSUPP              Operation not supported on socket
1469  *              EISCONN                 Socket is connected
1470  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1471  *      <pru_connect>:EINVAL            Invalid argument
1472  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1473  *      <pru_connect>:EACCES            Permission denied
1474  *      <pru_connect>:EADDRINUSE        Address in use
1475  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1476  *      <pru_connect>:EPERM             Operation not permitted
1477  *      <sf_connect_out>:???            [anything a filter writer might set]
1478  */
1479 int
1480 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1481 {
1482         int error;
1483         struct proc *p = current_proc();
1484
1485         if (dolock)
1486                 socket_lock(so, 1);
1487
1488         so_update_last_owner_locked(so, p);
1489         so_update_policy(so);
1490
1491 #if NECP
1492         so_update_necp_policy(so, NULL, nam);
1493 #endif /* NECP */
1494
1495         /*
1496          * If this is a listening socket or if this is a previously-accepted
1497          * socket that has been marked as inactive, reject the connect request.
1498          */
1499         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1500                 error = EOPNOTSUPP;
1501                 if (so->so_flags & SOF_DEFUNCT) {
1502                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1503                             "(%d)\n", __func__, proc_pid(p),
1504                             proc_best_name(p),
1505                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1506                             SOCK_DOM(so), SOCK_TYPE(so), error);
1507                 }
1508                 if (dolock)
1509                         socket_unlock(so, 1);
1510                 return (error);
1511         }
1512
1513         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1514                 if (dolock)
1515                         socket_unlock(so, 1);
1516                 return (EPERM);
1517         }
1518
1519         /*
1520          * If protocol is connection-based, can only connect once.
1521          * Otherwise, if connected, try to disconnect first.
1522          * This allows user to disconnect by connecting to, e.g.,
1523          * a null address.
1524          */
1525         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1526             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1527             (error = sodisconnectlocked(so)))) {
1528                 error = EISCONN;
1529         } else {
1530                 /*
1531                  * Run connect filter before calling protocol:
1532                  *  - non-blocking connect returns before completion;
1533                  */
1534                 error = sflt_connectout(so, nam);
1535                 if (error != 0) {
1536                         if (error == EJUSTRETURN)
1537                                 error = 0;
1538                 } else {
1539                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1540                             (so, nam, p);
1541                 }
1542         }
1543         if (dolock)
1544                 socket_unlock(so, 1);
1545         return (error);
1546 }
1547
/*
 * Locking variant of soconnectlock(): the socket lock is taken and
 * released on the caller's behalf (dolock == 1).
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
        int error;

        error = soconnectlock(so, nam, 1);

        return (error);
}
1553
1554 /*
1555  * Returns:     0                       Success
1556  *      <pru_connect2>:EINVAL[AF_UNIX]
1557  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1558  *      <pru_connect2>:???              [other protocol families]
1559  *
1560  * Notes:       <pru_connect2> is not supported by [TCP].
1561  */
1562 int
1563 soconnect2(struct socket *so1, struct socket *so2)
1564 {
1565         int error;
1566
1567         socket_lock(so1, 1);
1568         if (so2->so_proto->pr_lock)
1569                 socket_lock(so2, 1);
1570
1571         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1572
1573         socket_unlock(so1, 1);
1574         if (so2->so_proto->pr_lock)
1575                 socket_unlock(so2, 1);
1576         return (error);
1577 }
1578
1579 int
1580 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1581     struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1582     sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1583     uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1584 {
1585         int error;
1586
1587         so_update_last_owner_locked(so, p);
1588         so_update_policy(so);
1589
1590         /*
1591          * If this is a listening socket or if this is a previously-accepted
1592          * socket that has been marked as inactive, reject the connect request.
1593          */
1594         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1595                 error = EOPNOTSUPP;
1596                 if (so->so_flags & SOF_DEFUNCT) {
1597                         SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1598                             "(%d)\n", __func__, proc_pid(p),
1599                             proc_best_name(p),
1600                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1601                             SOCK_DOM(so), SOCK_TYPE(so), error);
1602                 }
1603                 return (error);
1604         }
1605
1606         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1607                 return (EPERM);
1608
1609         /*
1610          * If protocol is connection-based, can only connect once
1611          * unless PR_MULTICONN is set.  Otherwise, if connected,
1612          * try to disconnect first.  This allows user to disconnect
1613          * by connecting to, e.g., a null address.
1614          */
1615         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1616             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1617             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1618             (error = sodisconnectlocked(so)) != 0)) {
1619                 error = EISCONN;
1620         } else {
1621                 /*
1622                  * Run connect filter before calling protocol:
1623                  *  - non-blocking connect returns before completion;
1624                  */
1625                 error = sflt_connectxout(so, dst_sl);
1626                 if (error != 0) {
1627                         /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1628                         so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1629                         if (error == EJUSTRETURN)
1630                                 error = 0;
1631                 } else {
1632                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1633                             (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1634                             flags, arg, arglen, auio, bytes_written);
1635                 }
1636         }
1637
1638         return (error);
1639 }
1640
1641 int
1642 sodisconnectlocked(struct socket *so)
1643 {
1644         int error;
1645
1646         if ((so->so_state & SS_ISCONNECTED) == 0) {
1647                 error = ENOTCONN;
1648                 goto bad;
1649         }
1650         if (so->so_state & SS_ISDISCONNECTING) {
1651                 error = EALREADY;
1652                 goto bad;
1653         }
1654
1655         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1656         if (error == 0)
1657                 sflt_notify(so, sock_evt_disconnected, NULL);
1658
1659 bad:
1660         return (error);
1661 }
1662
/*
 * Locking version of sodisconnectlocked(): takes the socket lock,
 * performs the disconnect, and drops the lock again.
 *
 * Returns:     <sodisconnectlocked>:???
 */
int
sodisconnect(struct socket *so)
{
        int rc;

        socket_lock(so, 1);
        rc = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return (rc);
}
1674
1675 int
1676 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1677 {
1678         int error;
1679
1680         /*
1681          * Call the protocol disconnectx handler; let it handle all
1682          * matters related to the connection state of this session.
1683          */
1684         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1685         if (error == 0) {
1686                 /*
1687                  * The event applies only for the session, not for
1688                  * the disconnection of individual subflows.
1689                  */
1690                 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1691                         sflt_notify(so, sock_evt_disconnected, NULL);
1692         }
1693         return (error);
1694 }
1695
1696 int
1697 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1698 {
1699         int error;
1700
1701         socket_lock(so, 1);
1702         error = sodisconnectxlocked(so, aid, cid);
1703         socket_unlock(so, 1);
1704         return (error);
1705 }
1706
1707 int
1708 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1709 {
1710         return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1711 }
1712
/*
 * Translate MSG_* flags into the wait argument passed to sblock():
 * SBL_WAIT unless the caller asked for MSG_DONTWAIT, in which case 0.
 */
#define SBLOCKWAIT(f)   ((((f) & MSG_DONTWAIT) == 0) ? SBL_WAIT : 0)
1714
1715 /*
1716  * sosendcheck will lock the socket buffer if it isn't locked and
1717  * verify that there is space for the data being inserted.
1718  *
1719  * Returns:     0                       Success
1720  *              EPIPE
1721  *      sblock:EWOULDBLOCK
1722  *      sblock:EINTR
1723  *      sbwait:EBADF
1724  *      sbwait:EINTR
1725  *      [so_error]:???
1726  */
1727 int
1728 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1729     int32_t clen, int32_t atomic, int flags, int *sblocked,
1730     struct mbuf *control)
1731 {
1732         int     error = 0;
1733         int32_t space;
1734         int     assumelock = 0;
1735
1736 restart:
1737         if (*sblocked == 0) {
1738                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1739                     so->so_send_filt_thread != 0 &&
1740                     so->so_send_filt_thread == current_thread()) {
1741                         /*
1742                          * We're being called recursively from a filter,
1743                          * allow this to continue. Radar 4150520.
1744                          * Don't set sblocked because we don't want
1745                          * to perform an unlock later.
1746                          */
1747                         assumelock = 1;
1748                 } else {
1749                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1750                         if (error) {
1751                                 if (so->so_flags & SOF_DEFUNCT)
1752                                         goto defunct;
1753                                 return (error);
1754                         }
1755                         *sblocked = 1;
1756                 }
1757         }
1758
1759         /*
1760          * If a send attempt is made on a socket that has been marked
1761          * as inactive (disconnected), reject the request.
1762          */
1763         if (so->so_flags & SOF_DEFUNCT) {
1764 defunct:
1765                 error = EPIPE;
1766                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1767                     __func__, proc_selfpid(), proc_best_name(current_proc()),
1768                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1769                     SOCK_DOM(so), SOCK_TYPE(so), error);
1770                 return (error);
1771         }
1772
1773         if (so->so_state & SS_CANTSENDMORE) {
1774 #if CONTENT_FILTER
1775                 /*
1776                  * Can re-inject data of half closed connections
1777                  */
1778                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1779                         so->so_snd.sb_cfil_thread == current_thread() &&
1780                         cfil_sock_data_pending(&so->so_snd) != 0)
1781                         CFIL_LOG(LOG_INFO,
1782                                 "so %llx ignore SS_CANTSENDMORE",
1783                                 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1784                 else
1785 #endif /* CONTENT_FILTER */
1786                         return (EPIPE);
1787         }
1788         if (so->so_error) {
1789                 error = so->so_error;
1790                 so->so_error = 0;
1791                 return (error);
1792         }
1793
1794         if ((so->so_state & SS_ISCONNECTED) == 0) {
1795                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1796                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1797                             (resid != 0 || clen == 0) &&
1798                             !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1799 #if MPTCP
1800                                 /*
1801                                  * MPTCP Fast Join sends data before the
1802                                  * socket is truly connected.
1803                                  */
1804                                 if ((so->so_flags & (SOF_MP_SUBFLOW |
1805                                         SOF_MPTCP_FASTJOIN)) !=
1806                                     (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1807 #endif /* MPTCP */
1808                                 return (ENOTCONN);
1809                         }
1810                 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1811                         return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1812                             ENOTCONN : EDESTADDRREQ);
1813                 }
1814         }
1815
1816         if (so->so_flags & SOF_ENABLE_MSGS)
1817                 space = msgq_sbspace(so, control);
1818         else
1819                 space = sbspace(&so->so_snd);
1820
1821         if (flags & MSG_OOB)
1822                 space += 1024;
1823         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1824             clen > so->so_snd.sb_hiwat)
1825                 return (EMSGSIZE);
1826
1827         if ((space < resid + clen &&
1828             (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1829             space < clen)) ||
1830             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1831                 /*
1832                  * don't block the connectx call when there's more data
1833                  * than can be copied.
1834                  */
1835                 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1836                         if (space == 0) {
1837                                 return (EWOULDBLOCK);
1838                         }
1839                         if (space < (int32_t)so->so_snd.sb_lowat) {
1840                                 return (0);
1841                         }
1842                 }
1843                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1844                     assumelock) {
1845                         return (EWOULDBLOCK);
1846                 }
1847                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
1848                 *sblocked = 0;
1849                 error = sbwait(&so->so_snd);
1850                 if (error) {
1851                         if (so->so_flags & SOF_DEFUNCT)
1852                                 goto defunct;
1853                         return (error);
1854                 }
1855                 goto restart;
1856         }
1857         return (0);
1858 }
1859
1860 /*
1861  * Send on a socket.
1862  * If send must go all at once and message is larger than
1863  * send buffering, then hard error.
1864  * Lock against other senders.
1865  * If must go all at once and not enough room now, then
1866  * inform user that this would block and do nothing.
1867  * Otherwise, if nonblocking, send as much as possible.
1868  * The data to be sent is described by "uio" if nonzero,
1869  * otherwise by the mbuf chain "top" (which must be null
1870  * if uio is not).  Data provided in mbuf chain must be small
1871  * enough to send all at once.
1872  *
1873  * Returns nonzero on error, timeout or signal; callers
1874  * must check for short counts if EINTR/ERESTART are returned.
1875  * Data and control buffers are freed on return.
1876  * Experiment:
1877  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1878  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1879  *  point at the mbuf chain being constructed and go from there.
1880  *
1881  * Returns:     0                       Success
1882  *              EOPNOTSUPP
1883  *              EINVAL
1884  *              ENOBUFS
1885  *      uiomove:EFAULT
1886  *      sosendcheck:EPIPE
1887  *      sosendcheck:EWOULDBLOCK
1888  *      sosendcheck:EINTR
1889  *      sosendcheck:EBADF
1890  *      sosendcheck:EINTR
1891  *      sosendcheck:???                 [value from so_error]
1892  *      <pru_send>:ECONNRESET[TCP]
1893  *      <pru_send>:EINVAL[TCP]
1894  *      <pru_send>:ENOBUFS[TCP]
1895  *      <pru_send>:EADDRINUSE[TCP]
1896  *      <pru_send>:EADDRNOTAVAIL[TCP]
1897  *      <pru_send>:EAFNOSUPPORT[TCP]
1898  *      <pru_send>:EACCES[TCP]
1899  *      <pru_send>:EAGAIN[TCP]
1900  *      <pru_send>:EPERM[TCP]
1901  *      <pru_send>:EMSGSIZE[TCP]
1902  *      <pru_send>:EHOSTUNREACH[TCP]
1903  *      <pru_send>:ENETUNREACH[TCP]
1904  *      <pru_send>:ENETDOWN[TCP]
1905  *      <pru_send>:ENOMEM[TCP]
1906  *      <pru_send>:ENOBUFS[TCP]
1907  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
1908  *      <pru_send>:EINVAL[AF_UNIX]
1909  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
1910  *      <pru_send>:EPIPE[AF_UNIX]
1911  *      <pru_send>:ENOTCONN[AF_UNIX]
1912  *      <pru_send>:EISCONN[AF_UNIX]
1913  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
1914  *      <sf_data_out>:???               [whatever a filter author chooses]
1915  *
1916  * Notes:       Other <pru_send> returns depend on the protocol family; all
1917  *              <sf_data_out> returns depend on what the filter author causes
1918  *              their filter to return.
1919  */
1920 int
1921 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1922     struct mbuf *top, struct mbuf *control, int flags)
1923 {
1924         struct mbuf **mp;
1925         struct mbuf *m, *freelist = NULL;
1926         user_ssize_t space, len, resid, orig_resid;
1927         int clen = 0, error, dontroute, mlen, sendflags;
1928         int atomic = sosendallatonce(so) || top;
1929         int sblocked = 0;
1930         struct proc *p = current_proc();
1931         struct mbuf *control_copy = NULL;
1932         uint16_t headroom = 0;
1933         boolean_t en_tracing = FALSE;
1934
1935         if (uio != NULL)
1936                 resid = uio_resid(uio);
1937         else
1938                 resid = top->m_pkthdr.len;
1939
1940         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1941             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1942
1943         socket_lock(so, 1);
1944
1945         /*
1946          * trace if tracing & network (vs. unix) sockets & and
1947          * non-loopback
1948          */
1949         if (ENTR_SHOULDTRACE &&
1950             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1951                 struct inpcb *inp = sotoinpcb(so);
1952                 if (inp->inp_last_outifp != NULL &&
1953                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1954                         en_tracing = TRUE;
1955                         KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1956                             VM_KERNEL_ADDRPERM(so),
1957                             ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1958                             (int64_t)resid);
1959                         orig_resid = resid;
1960                 }
1961         }
1962
1963         /*
1964          * Re-injection should not affect process accounting
1965          */
1966         if ((flags & MSG_SKIPCFIL) == 0) {
1967                 so_update_last_owner_locked(so, p);
1968                 so_update_policy(so);
1969
1970 #if NECP
1971                 so_update_necp_policy(so, NULL, addr);
1972 #endif /* NECP */
1973         }
1974
1975         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1976                 error = EOPNOTSUPP;
1977                 socket_unlock(so, 1);
1978                 goto out;
1979         }
1980
1981         /*
1982          * In theory resid should be unsigned.
1983          * However, space must be signed, as it might be less than 0
1984          * if we over-committed, and we must use a signed comparison
1985          * of space and resid.  On the other hand, a negative resid
1986          * causes us to loop sending 0-length segments to the protocol.
1987          *
1988          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1989          * But it will be used by sockets doing message delivery.
1990          *
1991          * Note: We limit resid to be a positive int value as we use
1992          * imin() to set bytes_to_copy -- radr://14558484
1993          */
1994         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1995             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1996                 error = EINVAL;
1997                 socket_unlock(so, 1);
1998                 goto out;
1999         }
2000
2001         dontroute = (flags & MSG_DONTROUTE) &&
2002             (so->so_options & SO_DONTROUTE) == 0 &&
2003             (so->so_proto->pr_flags & PR_ATOMIC);
2004         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2005
2006         if (control != NULL)
2007                 clen = control->m_len;
2008
2009         if (soreserveheadroom != 0)
2010                 headroom = so->so_pktheadroom;
2011
2012         do {
2013                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2014                     &sblocked, control);
2015                 if (error)
2016                         goto release;
2017
2018                 mp = &top;
2019                 if (so->so_flags & SOF_ENABLE_MSGS)
2020                         space = msgq_sbspace(so, control);
2021                 else
2022                         space = sbspace(&so->so_snd) - clen;
2023                 space += ((flags & MSG_OOB) ? 1024 : 0);
2024
2025                 do {
2026                         if (uio == NULL) {
2027                                 /*
2028                                  * Data is prepackaged in "top".
2029                                  */
2030                                 resid = 0;
2031                                 if (flags & MSG_EOR)
2032                                         top->m_flags |= M_EOR;
2033                         } else {
2034                                 int chainlength;
2035                                 int bytes_to_copy;
2036                                 boolean_t jumbocl;
2037                                 boolean_t bigcl;
2038                                 int bytes_to_alloc;
2039
2040                                 bytes_to_copy = imin(resid, space);
2041
2042                                 bytes_to_alloc = bytes_to_copy;
2043                                 if (top == NULL)
2044                                         bytes_to_alloc += headroom;
2045
2046                                 if (sosendminchain > 0)
2047                                         chainlength = 0;
2048                                 else
2049                                         chainlength = sosendmaxchain;
2050
2051                                 /*
2052                                  * Use big 4 KB cluster when the outgoing interface
2053                                  * does not prefer 2 KB clusters
2054                                  */
2055                                 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2056                                     sosendbigcl_ignore_capab;
2057
2058                                 /*
2059                                  * Attempt to use larger than system page-size
2060                                  * clusters for large writes only if there is
2061                                  * a jumbo cluster pool and if the socket is
2062                                  * marked accordingly.
2063                                  */
2064                                 jumbocl = sosendjcl && njcl > 0 &&
2065                                     ((so->so_flags & SOF_MULTIPAGES) ||
2066                                     sosendjcl_ignore_capab) &&
2067                                     bigcl;
2068
2069                                 socket_unlock(so, 0);
2070
2071                                 do {
2072                                         int num_needed;
2073                                         int hdrs_needed = (top == NULL) ? 1 : 0;
2074
2075                                         /*
2076                                          * Try to maintain a local cache of mbuf
2077                                          * clusters needed to complete this
2078                                          * write.  The list is further limited to
2079                                          * the number that are currently needed
2080                                          * to fill the socket.  This mechanism
2081                                          * allows a large number of mbufs/
2082                                          * clusters to be grabbed under a single
2083                                          * mbuf lock.  If we can't get any
2084                                          * clusters, then fall back to trying
2085                                          * for mbufs.  If we fail early (or
2086                                          * miscalculate the number needed), make
2087                                          * sure to release any clusters we
2088                                          * haven't yet consumed.
2089                                          */
2090                                         if (freelist == NULL &&
2091                                             bytes_to_alloc > MBIGCLBYTES &&
2092                                             jumbocl) {
2093                                                 num_needed =
2094                                                     bytes_to_alloc / M16KCLBYTES;
2095
2096                                                 if ((bytes_to_alloc -
2097                                                     (num_needed * M16KCLBYTES))
2098                                                     >= MINCLSIZE)
2099                                                         num_needed++;
2100
2101                                                 freelist =
2102                                                     m_getpackets_internal(
2103                                                     (unsigned int *)&num_needed,
2104                                                     hdrs_needed, M_WAIT, 0,
2105                                                     M16KCLBYTES);
2106                                                 /*
2107                                                  * Fall back to 4K cluster size
2108                                                  * if allocation failed
2109                                                  */
2110                                         }
2111
2112                                         if (freelist == NULL &&
2113                                             bytes_to_alloc > MCLBYTES &&
2114                                             bigcl) {
2115                                                 num_needed =
2116                                                     bytes_to_alloc / MBIGCLBYTES;
2117
2118                                                 if ((bytes_to_alloc -
2119                                                     (num_needed * MBIGCLBYTES)) >=
2120                                                     MINCLSIZE)
2121                                                         num_needed++;
2122
2123                                                 freelist =
2124                                                     m_getpackets_internal(
2125                                                     (unsigned int *)&num_needed,
2126                                                     hdrs_needed, M_WAIT, 0,
2127                                                     MBIGCLBYTES);
2128                                                 /*
2129                                                  * Fall back to cluster size
2130                                                  * if allocation failed
2131                                                  */
2132                                         }
2133
2134                                         /*
2135                                          * Allocate a cluster, as we want to
2136                                          * avoid splitting the data into more
2137                                          * than one segment, and using MINCLSIZE
2138                                          * would lead us to allocate two mbufs.
2139                                          */
2140                                         if (soreserveheadroom != 0 &&
2141                                             freelist == NULL &&
2142                                             ((top == NULL &&
2143                                             bytes_to_alloc > _MHLEN) ||
2144                                             bytes_to_alloc > _MLEN)) {
2145                                                 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2146                                                     MCLBYTES;
2147                                                 freelist =
2148                                                     m_getpackets_internal(
2149                                                     (unsigned int *)&num_needed,
2150                                                     hdrs_needed, M_WAIT, 0,
2151                                                     MCLBYTES);
2152                                                 /*
2153                                                  * Fall back to a single mbuf
2154                                                  * if allocation failed
2155                                                  */
2156                                         } else if (freelist == NULL &&
2157                                             bytes_to_alloc > MINCLSIZE) {
2158                                                 num_needed =
2159                                                     bytes_to_alloc / MCLBYTES;
2160
2161                                                 if ((bytes_to_alloc -
2162                                                     (num_needed * MCLBYTES)) >=
2163                                                     MINCLSIZE)
2164                                                         num_needed++;
2165
2166                                                 freelist =
2167                                                     m_getpackets_internal(
2168                                                     (unsigned int *)&num_needed,
2169                                                     hdrs_needed, M_WAIT, 0,
2170                                                     MCLBYTES);
2171                                                 /*
2172                                                  * Fall back to a single mbuf
2173                                                  * if allocation failed
2174                                                  */
2175                                         }
2176                                         /*
2177                                          * For datagram protocols, leave
2178                                          * headroom for protocol headers
2179                                          * in the first cluster of the chain
2180                                          */
2181                                         if (freelist != NULL && atomic &&
2182                                             top == NULL && headroom > 0) {
2183                                                 freelist->m_data += headroom;
2184                                         }
2185
2186                                         /*
2187                                          * Fall back to regular mbufs without
2188                                          * reserving the socket headroom
2189                                          */
2190                                         if (freelist == NULL) {
2191                                                 if (top == NULL)
2192                                                         MGETHDR(freelist,
2193                                                             M_WAIT, MT_DATA);
2194                                                 else
2195                                                         MGET(freelist,
2196                                                             M_WAIT, MT_DATA);
2197
2198                                                 if (freelist == NULL) {
2199                                                         error = ENOBUFS;
2200                                                         socket_lock(so, 0);
2201                                                         goto release;
2202                                                 }
2203                                                 /*
2204                                                  * For datagram protocols,
2205                                                  * leave room for protocol
2206                                                  * headers in first mbuf.
2207                                                  */
2208                                                 if (atomic && top == NULL &&
2209                                                     bytes_to_copy < MHLEN) {
2210                                                         MH_ALIGN(freelist,
2211                                                             bytes_to_copy);
2212                                                 }
2213                                         }
2214                                         m = freelist;
2215                                         freelist = m->m_next;
2216                                         m->m_next = NULL;
2217
2218                                         if ((m->m_flags & M_EXT))
2219                                                 mlen = m->m_ext.ext_size -
2220                                                     m_leadingspace(m);
2221                                         else if ((m->m_flags & M_PKTHDR))
2222                                                 mlen =
2223                                                     MHLEN - m_leadingspace(m);
2224                                         else
2225                                                 mlen = MLEN - m_leadingspace(m);
2226                                         len = imin(mlen, bytes_to_copy);
2227
2228                                         chainlength += len;
2229
2230                                         space -= len;
2231
2232                                         error = uiomove(mtod(m, caddr_t),
2233                                             len, uio);
2234
2235                                         resid = uio_resid(uio);
2236
2237                                         m->m_len = len;
2238                                         *mp = m;
2239                                         top->m_pkthdr.len += len;
2240                                         if (error)
2241                                                 break;
2242                                         mp = &m->m_next;
2243                                         if (resid <= 0) {
2244                                                 if (flags & MSG_EOR)
2245                                                         top->m_flags |= M_EOR;
2246                                                 break;
2247                                         }
2248                                         bytes_to_copy = min(resid, space);
2249
2250                                 } while (space > 0 &&
2251                                     (chainlength < sosendmaxchain || atomic ||
2252                                     resid < MINCLSIZE));
2253
2254                                 socket_lock(so, 0);
2255
2256                                 if (error)
2257                                         goto release;
2258                         }
2259
2260                         if (flags & (MSG_HOLD|MSG_SEND)) {
2261                                 /* Enqueue for later, go away if HOLD */
2262                                 struct mbuf *mb1;
2263                                 if (so->so_temp && (flags & MSG_FLUSH)) {
2264                                         m_freem(so->so_temp);
2265                                         so->so_temp = NULL;
2266                                 }
2267                                 if (so->so_temp)
2268                                         so->so_tail->m_next = top;
2269                                 else
2270                                         so->so_temp = top;
2271                                 mb1 = top;
2272                                 while (mb1->m_next)
2273                                         mb1 = mb1->m_next;
2274                                 so->so_tail = mb1;
2275                                 if (flags & MSG_HOLD) {
2276                                         top = NULL;
2277                                         goto release;
2278                                 }
2279                                 top = so->so_temp;
2280                         }
2281                         if (dontroute)
2282                                 so->so_options |= SO_DONTROUTE;
2283
2284                         /*
2285                          * Compute flags here, for pru_send and NKEs
2286                          *
2287                          * If the user set MSG_EOF, the protocol
2288                          * understands this flag and nothing left to
2289                          * send then use PRU_SEND_EOF instead of PRU_SEND.
2290                          */
2291                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2292                             ((flags & MSG_EOF) &&
2293                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2294                             (resid <= 0)) ? PRUS_EOF :
2295                             /* If there is more to send set PRUS_MORETOCOME */
2296                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2297
2298                         if ((flags & MSG_SKIPCFIL) == 0) {
2299                                 /*
2300                                  * Socket filter processing
2301                                  */
2302                                 error = sflt_data_out(so, addr, &top,
2303                                     &control, (sendflags & MSG_OOB) ?
2304                                     sock_data_filt_flag_oob : 0);
2305                                 if (error) {
2306                                         if (error == EJUSTRETURN) {
2307                                                 error = 0;
2308                                                 clen = 0;
2309                                                 control = NULL;
2310                                                 top = NULL;
2311                                         }
2312                                         goto release;
2313                                 }
2314 #if CONTENT_FILTER
2315                                 /*
2316                                  * Content filter processing
2317                                  */
2318                                 error = cfil_sock_data_out(so, addr, top,
2319                                     control, (sendflags & MSG_OOB) ?
2320                                     sock_data_filt_flag_oob : 0);
2321                                 if (error) {
2322                                         if (error == EJUSTRETURN) {
2323                                                 error = 0;
2324                                                 clen = 0;
2325                                                 control = NULL;
2326                                                 top = NULL;
2327                                                 }
2328                                         goto release;
2329                                 }
2330 #endif /* CONTENT_FILTER */
2331                         }
2332                         if (so->so_flags & SOF_ENABLE_MSGS) {
2333                                 /*
2334                                  * Make a copy of control mbuf,
2335                                  * so that msg priority can be
2336                                  * passed to subsequent mbufs.
2337                                  */
2338                                 control_copy = m_dup(control, M_NOWAIT);
2339                         }
2340                         error = (*so->so_proto->pr_usrreqs->pru_send)
2341                             (so, sendflags, top, addr, control, p);
2342
2343                         if (flags & MSG_SEND)
2344                                 so->so_temp = NULL;
2345
2346                         if (dontroute)
2347                                 so->so_options &= ~SO_DONTROUTE;
2348
2349                         clen = 0;
2350                         control = control_copy;
2351                         control_copy = NULL;
2352                         top = NULL;
2353                         mp = &top;
2354                         if (error)
2355                                 goto release;
2356                 } while (resid && space > 0);
2357         } while (resid);
2358
2359 release:
2360         if (sblocked)
2361                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2362         else
2363                 socket_unlock(so, 1);
2364 out:
2365         if (top != NULL)
2366                 m_freem(top);
2367         if (control != NULL)
2368                 m_freem(control);
2369         if (freelist != NULL)
2370                 m_freem_list(freelist);
2371         if (control_copy != NULL)
2372                 m_freem(control_copy);
2373
2374         /*
2375          * One write has been done. This was enough. Get back to "normal"
2376          * behavior.
2377          */
2378         if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2379                 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2380
2381         if (en_tracing) {
2382                 /* resid passed here is the bytes left in uio */
2383                 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2384                     VM_KERNEL_ADDRPERM(so),
2385                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2386                     (int64_t)(orig_resid - resid));
2387         }
2388         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2389             so->so_snd.sb_cc, space, error);
2390
2391         return (error);
2392 }
2393
2394 /*
2395  * Supported only connected sockets (no address) without ancillary data
2396  * (control mbuf) for atomic protocols
2397  */
2398 int
2399 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2400 {
2401         struct mbuf *m, *freelist = NULL;
2402         user_ssize_t len, resid;
2403         int error, dontroute, mlen;
2404         int atomic = sosendallatonce(so);
2405         int sblocked = 0;
2406         struct proc *p = current_proc();
2407         u_int uiofirst = 0;
2408         u_int uiolast = 0;
2409         struct mbuf *top = NULL;
2410         uint16_t headroom = 0;
2411         boolean_t bigcl;
2412
2413         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2414             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2415
2416         if (so->so_type != SOCK_DGRAM) {
2417                 error = EINVAL;
2418                 goto out;
2419         }
2420         if (atomic == 0) {
2421                 error = EINVAL;
2422                 goto out;
2423         }
2424         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2425                 error = EPROTONOSUPPORT;
2426                 goto out;
2427         }
2428         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2429                 error = EINVAL;
2430                 goto out;
2431         }
2432         resid = uio_array_resid(uioarray, uiocnt);
2433
2434         /*
2435          * In theory resid should be unsigned.
2436          * However, space must be signed, as it might be less than 0
2437          * if we over-committed, and we must use a signed comparison
2438          * of space and resid.  On the other hand, a negative resid
2439          * causes us to loop sending 0-length segments to the protocol.
2440          *
2441          * Note: We limit resid to be a positive int value as we use
2442          * imin() to set bytes_to_copy -- radr://14558484
2443          */
2444         if (resid < 0 || resid > INT_MAX) {
2445                 error = EINVAL;
2446                 goto out;
2447         }
2448
2449         socket_lock(so, 1);
2450         so_update_last_owner_locked(so, p);
2451         so_update_policy(so);
2452
2453 #if NECP
2454         so_update_necp_policy(so, NULL, NULL);
2455 #endif /* NECP */
2456
2457         dontroute = (flags & MSG_DONTROUTE) &&
2458             (so->so_options & SO_DONTROUTE) == 0 &&
2459             (so->so_proto->pr_flags & PR_ATOMIC);
2460         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2461
2462         error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2463             &sblocked, NULL);
2464         if (error)
2465                 goto release;
2466
2467         /*
2468          * Use big 4 KB clusters when the outgoing interface does not prefer
2469          * 2 KB clusters
2470          */
2471         bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2472
2473         if (soreserveheadroom != 0)
2474                 headroom = so->so_pktheadroom;
2475
2476         do {
2477                 int i;
2478                 int num_needed = 0;
2479                 int chainlength;
2480                 size_t maxpktlen = 0;
2481                 int bytes_to_alloc;
2482
2483                 if (sosendminchain > 0)
2484                         chainlength = 0;
2485                 else
2486                         chainlength = sosendmaxchain;
2487
2488                 socket_unlock(so, 0);
2489
2490                 /*
2491                  * Find a set of uio that fit in a reasonable number
2492                  * of mbuf packets
2493                  */
2494                 for (i = uiofirst; i < uiocnt; i++) {
2495                         struct uio *auio = uioarray[i];
2496
2497                         len = uio_resid(auio);
2498
2499                         /* Do nothing for empty messages */
2500                         if (len == 0)
2501                                 continue;
2502
2503                         num_needed += 1;
2504                         uiolast += 1;
2505
2506                         if (len > maxpktlen)
2507                                 maxpktlen = len;
2508
2509                         chainlength += len;
2510                         if (chainlength > sosendmaxchain)
2511                                 break;
2512                 }
2513                 /*
2514                  * Nothing left to send
2515                  */
2516                 if (num_needed == 0) {
2517                         socket_lock(so, 0);
2518                         break;
2519                 }
2520                 /*
2521                  * Allocate buffer large enough to include headroom space for
2522                  * network and link header
2523                  *
2524                  */
2525                 bytes_to_alloc = maxpktlen + headroom;
2526
2527                 /*
2528                  * Allocate a single contiguous buffer of the smallest available
2529                  * size when possible
2530                  */
2531                 if (bytes_to_alloc > MCLBYTES &&
2532                     bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2533                         freelist = m_getpackets_internal(
2534                             (unsigned int *)&num_needed,
2535                             num_needed, M_WAIT, 1,
2536                             MBIGCLBYTES);
2537                 } else if (bytes_to_alloc > _MHLEN &&
2538                     bytes_to_alloc <= MCLBYTES) {
2539                         freelist = m_getpackets_internal(
2540                             (unsigned int *)&num_needed,
2541                             num_needed, M_WAIT, 1,
2542                             MCLBYTES);
2543                 } else {
2544                         freelist = m_allocpacket_internal(
2545                             (unsigned int *)&num_needed,
2546                             bytes_to_alloc, NULL, M_WAIT, 1, 0);
2547                 }
2548
2549                 if (freelist == NULL) {
2550                         socket_lock(so, 0);
2551                         error = ENOMEM;
2552                         goto release;
2553                 }
2554                 /*
2555                  * Copy each uio of the set into its own mbuf packet
2556                  */
2557                 for (i = uiofirst, m = freelist;
2558                     i < uiolast && m != NULL;
2559                     i++) {
2560                         int bytes_to_copy;
2561                         struct mbuf *n;
2562                         struct uio *auio = uioarray[i];
2563
2564                         bytes_to_copy = uio_resid(auio);
2565
2566                         /* Do nothing for empty messages */
2567                         if (bytes_to_copy == 0)
2568                                 continue;
2569                         /*
2570                          * Leave headroom for protocol headers
2571                          * in the first mbuf of the chain
2572                          */
2573                         m->m_data += headroom;
2574
2575                         for (n = m; n != NULL; n = n->m_next) {
2576                                 if ((m->m_flags & M_EXT))
2577                                         mlen = m->m_ext.ext_size -
2578                                             m_leadingspace(m);
2579                                 else if ((m->m_flags & M_PKTHDR))
2580                                         mlen =
2581                                             MHLEN - m_leadingspace(m);
2582                                 else
2583                                         mlen = MLEN - m_leadingspace(m);
2584                                 len = imin(mlen, bytes_to_copy);
2585
2586                                 /*
2587                                  * Note: uiomove() decrements the iovec
2588                                  * length
2589                                  */
2590                                 error = uiomove(mtod(n, caddr_t),
2591                                     len, auio);
2592                                 if (error != 0)
2593                                         break;
2594                                 n->m_len = len;
2595                                 m->m_pkthdr.len += len;
2596
2597                                 VERIFY(m->m_pkthdr.len <= maxpktlen);
2598
2599                                 bytes_to_copy -= len;
2600                                 resid -= len;
2601                         }
2602                         if (m->m_pkthdr.len == 0) {
2603                                 printf(
2604                                     "%s:%d so %llx pkt %llx type %u len null\n",
2605                                     __func__, __LINE__,
2606                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2607                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2608                                     m->m_type);
2609                         }
2610                         if (error != 0)
2611                                 break;
2612                         m = m->m_nextpkt;
2613                 }
2614
2615                 socket_lock(so, 0);
2616
2617                 if (error)
2618                         goto release;
2619                 top = freelist;
2620                 freelist = NULL;
2621
2622                 if (dontroute)
2623                         so->so_options |= SO_DONTROUTE;
2624
2625                 if ((flags & MSG_SKIPCFIL) == 0) {
2626                         struct mbuf **prevnextp = NULL;
2627
2628                         for (i = uiofirst, m = top;
2629                             i < uiolast && m != NULL;
2630                             i++) {
2631                                 struct mbuf *nextpkt = m->m_nextpkt;
2632
2633                                 /*
2634                                  * Socket filter processing
2635                                  */
2636                                 error = sflt_data_out(so, NULL, &m,
2637                                     NULL, 0);
2638                                 if (error != 0 && error != EJUSTRETURN)
2639                                         goto release;
2640
2641 #if CONTENT_FILTER
2642                                 if (error == 0) {
2643                                         /*
2644                                          * Content filter processing
2645                                          */
2646                                         error = cfil_sock_data_out(so, NULL, m,
2647                                             NULL, 0);
2648                                         if (error != 0 && error != EJUSTRETURN)
2649                                                 goto release;
2650                                 }
2651 #endif /* CONTENT_FILTER */
2652                                 /*
2653                                  * Remove packet from the list when
2654                                  * swallowed by a filter
2655                                  */
2656                                 if (error == EJUSTRETURN) {
2657                                         error = 0;
2658                                         if (prevnextp != NULL)
2659                                                 *prevnextp = nextpkt;
2660                                         else
2661                                                 top = nextpkt;
2662                                 }
2663
2664                                 m = nextpkt;
2665                                 if (m != NULL)
2666                                         prevnextp = &m->m_nextpkt;
2667                         }
2668                 }
2669                 if (top != NULL)
2670                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2671                             (so, 0, top, NULL, NULL, p);
2672
2673                 if (dontroute)
2674                         so->so_options &= ~SO_DONTROUTE;
2675
2676                 top = NULL;
2677                 uiofirst = uiolast;
2678         } while (resid > 0 && error == 0);
2679 release:
2680         if (sblocked)
2681                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2682         else
2683                 socket_unlock(so, 1);
2684 out:
2685         if (top != NULL)
2686                 m_freem(top);
2687         if (freelist != NULL)
2688                 m_freem_list(freelist);
2689
2690         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2691             so->so_snd.sb_cc, 0, error);
2692
2693         return (error);
2694 }
2695
2696 /*
2697  * May return ERESTART when packet is dropped by MAC policy check
2698  */
2699 static int
2700 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2701     int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2702 {
2703         int error = 0;
2704         struct mbuf *m = *mp;
2705         struct mbuf *nextrecord = *nextrecordp;
2706
2707         KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2708 #if CONFIG_MACF_SOCKET_SUBSET
2709         /*
2710          * Call the MAC framework for policy checking if we're in
2711          * the user process context and the socket isn't connected.
2712          */
2713         if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2714                 struct mbuf *m0 = m;
2715                 /*
2716                  * Dequeue this record (temporarily) from the receive
2717                  * list since we're about to drop the socket's lock
2718                  * where a new record may arrive and be appended to
2719                  * the list.  Upon MAC policy failure, the record
2720                  * will be freed.  Otherwise, we'll add it back to
2721                  * the head of the list.  We cannot rely on SB_LOCK
2722                  * because append operation uses the socket's lock.
2723                  */
2724                 do {
2725                         m->m_nextpkt = NULL;
2726                         sbfree(&so->so_rcv, m);
2727                         m = m->m_next;
2728                 } while (m != NULL);
2729                 m = m0;
2730                 so->so_rcv.sb_mb = nextrecord;
2731                 SB_EMPTY_FIXUP(&so->so_rcv);
2732                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2733                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2734                 socket_unlock(so, 0);
2735
2736                 if (mac_socket_check_received(proc_ucred(p), so,
2737                     mtod(m, struct sockaddr *)) != 0) {
2738                         /*
2739                          * MAC policy failure; free this record and
2740                          * process the next record (or block until
2741                          * one is available).  We have adjusted sb_cc
2742                          * and sb_mbcnt above so there is no need to
2743                          * call sbfree() again.
2744                          */
2745                         m_freem(m);
2746                         /*
2747                          * Clear SB_LOCK but don't unlock the socket.
2748                          * Process the next record or wait for one.
2749                          */
2750                         socket_lock(so, 0);
2751                         sbunlock(&so->so_rcv, TRUE); /* stay locked */
2752                         error = ERESTART;
2753                         goto done;
2754                 }
2755                 socket_lock(so, 0);
2756                 /*
2757                  * If the socket has been defunct'd, drop it.
2758                  */
2759                 if (so->so_flags & SOF_DEFUNCT) {
2760                         m_freem(m);
2761                         error = ENOTCONN;
2762                         goto done;
2763                 }
2764                 /*
2765                  * Re-adjust the socket receive list and re-enqueue
2766                  * the record in front of any packets which may have
2767                  * been appended while we dropped the lock.
2768                  */
2769                 for (m = m0; m->m_next != NULL; m = m->m_next)
2770                         sballoc(&so->so_rcv, m);
2771                 sballoc(&so->so_rcv, m);
2772                 if (so->so_rcv.sb_mb == NULL) {
2773                         so->so_rcv.sb_lastrecord = m0;
2774                         so->so_rcv.sb_mbtail = m;
2775                 }
2776                 m = m0;
2777                 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2778                 so->so_rcv.sb_mb = m;
2779                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2780                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2781         }
2782 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2783         if (psa != NULL) {
2784                 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2785                 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2786                         error = EWOULDBLOCK;
2787                         goto done;
2788                 }
2789         }
2790         if (flags & MSG_PEEK) {
2791                 m = m->m_next;
2792         } else {
2793                 sbfree(&so->so_rcv, m);
2794                 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2795                         panic("%s: about to create invalid socketbuf",
2796                             __func__);
2797                         /* NOTREACHED */
2798                 }
2799                 MFREE(m, so->so_rcv.sb_mb);
2800                 m = so->so_rcv.sb_mb;
2801                 if (m != NULL) {
2802                         m->m_nextpkt = nextrecord;
2803                 } else {
2804                         so->so_rcv.sb_mb = nextrecord;
2805                         SB_EMPTY_FIXUP(&so->so_rcv);
2806                 }
2807         }
2808 done:
2809         *mp = m;
2810         *nextrecordp = nextrecord;
2811
2812         return (error);
2813 }
2814
2815 /*
2816  * Process one or more MT_CONTROL mbufs present before any data mbufs
2817  * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2818  * just copy the data; if !MSG_PEEK, we call into the protocol to
2819  * perform externalization.
2820  */
2821 static int
2822 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2823     struct mbuf **mp, struct mbuf **nextrecordp)
2824 {
2825         int error = 0;
2826         struct mbuf *cm = NULL, *cmn;
2827         struct mbuf **cme = &cm;
2828         struct sockbuf *sb_rcv = &so->so_rcv;
2829         struct mbuf **msgpcm = NULL;
2830         struct mbuf *m = *mp;
2831         struct mbuf *nextrecord = *nextrecordp;
2832         struct protosw *pr = so->so_proto;
2833
2834         /*
2835          * Externalizing the control messages would require us to
2836          * drop the socket's lock below.  Once we re-acquire the
2837          * lock, the mbuf chain might change.  In order to preserve
2838          * consistency, we unlink all control messages from the
2839          * first mbuf chain in one shot and link them separately
2840          * onto a different chain.
2841          */
2842         do {
2843                 if (flags & MSG_PEEK) {
2844                         if (controlp != NULL) {
2845                                 if (*controlp == NULL) {
2846                                         msgpcm = controlp;
2847                                 }
2848                                 *controlp = m_copy(m, 0, m->m_len);
2849
2850                                 /*
2851                                  * If we failed to allocate an mbuf,
2852                                  * release any previously allocated
2853                                  * mbufs for control data. Return
2854                                  * an error. Keep the mbufs in the
2855                                  * socket as this is using
2856                                  * MSG_PEEK flag.
2857                                  */
2858                                 if (*controlp == NULL) {
2859                                         m_freem(*msgpcm);
2860                                         error = ENOBUFS;
2861                                         goto done;
2862                                 }
2863                                 controlp = &(*controlp)->m_next;
2864                         }
2865                         m = m->m_next;
2866                 } else {
2867                         m->m_nextpkt = NULL;
2868                         sbfree(sb_rcv, m);
2869                         sb_rcv->sb_mb = m->m_next;
2870                         m->m_next = NULL;
2871                         *cme = m;
2872                         cme = &(*cme)->m_next;
2873                         m = sb_rcv->sb_mb;
2874                 }
2875         } while (m != NULL && m->m_type == MT_CONTROL);
2876
2877         if (!(flags & MSG_PEEK)) {
2878                 if (sb_rcv->sb_mb != NULL) {
2879                         sb_rcv->sb_mb->m_nextpkt = nextrecord;
2880                 } else {
2881                         sb_rcv->sb_mb = nextrecord;
2882                         SB_EMPTY_FIXUP(sb_rcv);
2883                 }
2884                 if (nextrecord == NULL)
2885                         sb_rcv->sb_lastrecord = m;
2886         }
2887
2888         SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2889         SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2890
2891         while (cm != NULL) {
2892                 int cmsg_type;
2893
2894                 cmn = cm->m_next;
2895                 cm->m_next = NULL;
2896                 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2897
2898                 /*
2899                  * Call the protocol to externalize SCM_RIGHTS message
2900                  * and return the modified message to the caller upon
2901                  * success.  Otherwise, all other control messages are
2902                  * returned unmodified to the caller.  Note that we
2903                  * only get into this loop if MSG_PEEK is not set.
2904                  */
2905                 if (pr->pr_domain->dom_externalize != NULL &&
2906                     cmsg_type == SCM_RIGHTS) {
2907                         /*
2908                          * Release socket lock: see 3903171.  This
2909                          * would also allow more records to be appended
2910                          * to the socket buffer.  We still have SB_LOCK
2911                          * set on it, so we can be sure that the head
2912                          * of the mbuf chain won't change.
2913                          */
2914                         socket_unlock(so, 0);
2915                         error = (*pr->pr_domain->dom_externalize)(cm);
2916                         socket_lock(so, 0);
2917                 } else {
2918                         error = 0;
2919                 }
2920
2921                 if (controlp != NULL && error == 0) {
2922                         *controlp = cm;
2923                         controlp = &(*controlp)->m_next;
2924                 } else {
2925                         (void) m_free(cm);
2926                 }
2927                 cm = cmn;
2928         }
2929         /*
2930          * Update the value of nextrecord in case we received new
2931          * records when the socket was unlocked above for
2932          * externalizing SCM_RIGHTS.
2933          */
2934         if (m != NULL)
2935                 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2936         else
2937                 nextrecord = sb_rcv->sb_mb;
2938
2939 done:
2940         *mp = m;
2941         *nextrecordp = nextrecord;
2942
2943         return (error);
2944 }
2945
2946 /*
2947  * Implement receive operations on a socket.
2948  * We depend on the way that records are added to the sockbuf
2949  * by sbappend*.  In particular, each record (mbufs linked through m_next)
2950  * must begin with an address if the protocol so specifies,
2951  * followed by an optional mbuf or mbufs containing ancillary data,
2952  * and then zero or more mbufs of data.
2953  * In order to avoid blocking network interrupts for the entire time here,
2954  * we splx() while doing the actual copy to user space.
2955  * Although the sockbuf is locked, new data may still be appended,
2956  * and thus we must maintain consistency of the sockbuf during that time.
2957  *
2958  * The caller may receive the data as a single mbuf chain by supplying
2959  * an mbuf **mp0 for use in returning the chain.  The uio is then used
2960  * only for the count in uio_resid.
2961  *
2962  * Returns:     0                       Success
2963  *              ENOBUFS
2964  *              ENOTCONN
2965  *              EWOULDBLOCK
2966  *      uiomove:EFAULT
2967  *      sblock:EWOULDBLOCK
2968  *      sblock:EINTR
2969  *      sbwait:EBADF
2970  *      sbwait:EINTR
2971  *      sodelayed_copy:EFAULT
2972  *      <pru_rcvoob>:EINVAL[TCP]
2973  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
2974  *      <pru_rcvoob>:???
2975  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2976  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2977  *      <pr_domain->dom_externalize>:???
2978  *
2979  * Notes:       Additional return values from calls through <pru_rcvoob> and
2980  *              <pr_domain->dom_externalize> depend on protocols other than
2981  *              TCP or AF_UNIX, which are documented above.
2982  */
2983 int
2984 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2985     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2986 {
2987         struct mbuf *m, **mp, *ml = NULL;
2988         struct mbuf *nextrecord, *free_list;
2989         int flags, error, offset;
2990         user_ssize_t len;
2991         struct protosw *pr = so->so_proto;
2992         int moff, type = 0;
2993         user_ssize_t orig_resid = uio_resid(uio);
2994         user_ssize_t delayed_copy_len;
2995         int can_delay;
2996         int need_event;
2997         struct proc *p = current_proc();
2998         boolean_t en_tracing = FALSE;
2999
3000         /*
3001          * Sanity check on the length passed by caller as we are making 'int'
3002          * comparisons
3003          */
3004         if (orig_resid < 0 || orig_resid > INT_MAX)
3005                 return (EINVAL);
3006
3007         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3008             uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3009             so->so_rcv.sb_hiwat);
3010
3011         socket_lock(so, 1);
3012         so_update_last_owner_locked(so, p);
3013         so_update_policy(so);
3014
3015 #ifdef MORE_LOCKING_DEBUG
3016         if (so->so_usecount == 1) {
3017                 panic("%s: so=%x no other reference on socket\n", __func__, so);
3018                 /* NOTREACHED */
3019         }
3020 #endif
3021         mp = mp0;
3022         if (psa != NULL)
3023                 *psa = NULL;
3024         if (controlp != NULL)
3025                 *controlp = NULL;
3026         if (flagsp != NULL)
3027                 flags = *flagsp &~ MSG_EOR;
3028         else
3029                 flags = 0;
3030
3031         /*
3032          * If a recv attempt is made on a previously-accepted socket
3033          * that has been marked as inactive (disconnected), reject
3034          * the request.
3035          */
3036         if (so->so_flags & SOF_DEFUNCT) {
3037                 struct sockbuf *sb = &so->so_rcv;
3038
3039                 error = ENOTCONN;
3040                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3041                     __func__, proc_pid(p), proc_best_name(p),
3042                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3043                     SOCK_DOM(so), SOCK_TYPE(so), error);
3044                 /*
3045                  * This socket should have been disconnected and flushed
3046                  * prior to being returned from sodefunct(); there should
3047                  * be no data on its receive list, so panic otherwise.
3048                  */
3049                 if (so->so_state & SS_DEFUNCT)
3050                         sb_empty_assert(sb, __func__);
3051                 socket_unlock(so, 1);
3052                 return (error);
3053         }
3054
3055         if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3056             pr->pr_usrreqs->pru_preconnect) {
3057                 /*
3058                  * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3059                  * calling write() right after this. *If* the app calls a read
3060                  * we do not want to block this read indefinitely. Thus,
3061                  * we trigger a connect so that the session gets initiated.
3062                  */
3063                 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3064
3065                 if (error) {
3066                         socket_unlock(so, 1);
3067                         return (error);
3068                 }
3069         }
3070
3071         if (ENTR_SHOULDTRACE &&
3072             (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3073                 /*
3074                  * enable energy tracing for inet sockets that go over
3075                  * non-loopback interfaces only.
3076                  */
3077                 struct inpcb *inp = sotoinpcb(so);
3078                 if (inp->inp_last_outifp != NULL &&
3079                     !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3080                         en_tracing = TRUE;
3081                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3082                             VM_KERNEL_ADDRPERM(so),
3083                             ((so->so_state & SS_NBIO) ?
3084                             kEnTrFlagNonBlocking : 0),
3085                             (int64_t)orig_resid);
3086                 }
3087         }
3088
3089         /*
3090          * When SO_WANTOOBFLAG is set we try to get out-of-band data
3091          * regardless of the flags argument. Here is the case were
3092          * regardless of the flags argument. Here is the case where
3093          */
3094         if ((flags & MSG_OOB) ||
3095             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3096             (so->so_options & SO_OOBINLINE) == 0 &&
3097             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3098                 m = m_get(M_WAIT, MT_DATA);
3099                 if (m == NULL) {
3100                         socket_unlock(so, 1);
3101                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3102                             ENOBUFS, 0, 0, 0, 0);
3103                         return (ENOBUFS);
3104                 }
3105                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3106                 if (error)
3107                         goto bad;
3108                 socket_unlock(so, 0);
3109                 do {
3110                         error = uiomove(mtod(m, caddr_t),
3111                             imin(uio_resid(uio), m->m_len), uio);
3112                         m = m_free(m);
3113                 } while (uio_resid(uio) && error == 0 && m != NULL);
3114                 socket_lock(so, 0);
3115 bad:
3116                 if (m != NULL)
3117                         m_freem(m);
3118
3119                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3120                         if (error == EWOULDBLOCK || error == EINVAL) {
3121                                 /*
3122                                  * Let's try to get normal data:
3123                                  * EWOULDBLOCK: out-of-band data not
3124                                  * received yet. EINVAL: out-of-band data
3125                                  * already read.
3126                                  */
3127                                 error = 0;
3128                                 goto nooob;
3129                         } else if (error == 0 && flagsp != NULL) {
3130                                 *flagsp |= MSG_OOB;
3131                         }
3132                 }
3133                 socket_unlock(so, 1);
3134                 if (en_tracing) {
3135                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3136                             VM_KERNEL_ADDRPERM(so), 0,
3137                             (int64_t)(orig_resid - uio_resid(uio)));
3138                 }
3139                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3140                     0, 0, 0, 0);
3141
3142                 return (error);
3143         }
3144 nooob:
3145         if (mp != NULL)
3146                 *mp = NULL;
3147
3148         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3149                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3150         }
3151
3152         free_list = NULL;
3153         delayed_copy_len = 0;
3154 restart:
3155 #ifdef MORE_LOCKING_DEBUG
3156         if (so->so_usecount <= 1)
3157                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3158                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3159 #endif
3160         /*
3161          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3162          * and if so just return to the caller.  This could happen when
3163          * soreceive() is called by a socket upcall function during the
3164          * time the socket is freed.  The socket buffer would have been
3165          * locked across the upcall, therefore we cannot put this thread
3166          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3167          * we may livelock), because the lock on the socket buffer will
3168          * only be released when the upcall routine returns to its caller.
3169          * Because the socket has been officially closed, there can be
3170          * no further read on it.
3171          *
3172          * A multipath subflow socket would have its SS_NOFDREF set by
3173          * default, so check for SOF_MP_SUBFLOW socket flag; when the
3174          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3175          */
3176         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3177             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3178                 socket_unlock(so, 1);
3179                 return (0);
3180         }
3181
3182         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3183         if (error) {
3184                 socket_unlock(so, 1);
3185                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3186                     0, 0, 0, 0);
3187                 if (en_tracing) {
3188                         KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3189                             VM_KERNEL_ADDRPERM(so), 0,
3190                             (int64_t)(orig_resid - uio_resid(uio)));
3191                 }
3192                 return (error);
3193         }
3194
3195         m = so->so_rcv.sb_mb;
3196         /*
3197          * If we have less data than requested, block awaiting more
3198          * (subject to any timeout) if:
3199          *   1. the current count is less than the low water mark, or
3200          *   2. MSG_WAITALL is set, and it is possible to do the entire
3201          *      receive operation at once if we block (resid <= hiwat).
3202          *   3. MSG_DONTWAIT is not set
3203          * If MSG_WAITALL is set but resid is larger than the receive buffer,
3204          * we have to do the receive in sections, and thus risk returning
3205          * a short count if a timeout or signal occurs after we start.
3206          */
3207         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3208             so->so_rcv.sb_cc < uio_resid(uio)) &&
3209             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3210             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3211             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3212                 /*
3213                  * Panic if we notice inconsistencies in the socket's
3214                  * receive list; both sb_mb and sb_cc should correctly
3215                  * reflect the contents of the list, otherwise we may
3216                  * end up with false positives during select() or poll()
3217                  * which could put the application in a bad state.
3218                  */
3219                 SB_MB_CHECK(&so->so_rcv);
3220
3221                 if (so->so_error) {
3222                         if (m != NULL)
3223                                 goto dontblock;
3224                         error = so->so_error;
3225                         if ((flags & MSG_PEEK) == 0)
3226                                 so->so_error = 0;
3227                         goto release;
3228                 }
3229                 if (so->so_state & SS_CANTRCVMORE) {
3230 #if CONTENT_FILTER
3231                         /*
3232                          * Deal with half closed connections
3233                          */
3234                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3235                                 cfil_sock_data_pending(&so->so_rcv) != 0)
3236                                 CFIL_LOG(LOG_INFO,
3237                                         "so %llx ignore SS_CANTRCVMORE",
3238                                         (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3239                         else
3240 #endif /* CONTENT_FILTER */
3241                         if (m != NULL)
3242                                 goto dontblock;
3243                         else
3244                                 goto release;
3245                 }
3246                 for (; m != NULL; m = m->m_next)
3247                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3248                                 m = so->so_rcv.sb_mb;
3249                                 goto dontblock;
3250                         }
3251                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3252                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3253                         error = ENOTCONN;
3254                         goto release;
3255                 }
3256                 if (uio_resid(uio) == 0)
3257                         goto release;
3258
3259                 if ((so->so_state & SS_NBIO) ||
3260                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3261                         error = EWOULDBLOCK;
3262                         goto release;
3263                 }
3264                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3265                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3266                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3267 #if EVEN_MORE_LOCKING_DEBUG
3268                 if (socket_debug)
3269                         printf("Waiting for socket data\n");
3270 #endif
3271
3272                 error = sbwait(&so->so_rcv);
3273 #if EVEN_MORE_LOCKING_DEBUG
3274                 if (socket_debug)
3275                         printf("SORECEIVE - sbwait returned %d\n", error);
3276 #endif
3277                 if (so->so_usecount < 1) {
3278                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3279                             __func__, so, so->so_usecount);
3280                         /* NOTREACHED */
3281                 }
3282                 if (error) {
3283                         socket_unlock(so, 1);
3284                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3285                             0, 0, 0, 0);
3286                         if (en_tracing) {
3287                                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3288                                     VM_KERNEL_ADDRPERM(so), 0,
3289                                     (int64_t)(orig_resid - uio_resid(uio)));
3290                         }
3291                         return (error);
3292                 }
3293                 goto restart;
3294         }
3295 dontblock:
3296         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3297         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3298         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3299         nextrecord = m->m_nextpkt;
3300
3301         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3302                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3303                     mp0 == NULL);
3304                 if (error == ERESTART)
3305                         goto restart;
3306                 else if (error != 0)
3307                         goto release;
3308                 orig_resid = 0;
3309         }
3310
3311         /*
3312          * Process one or more MT_CONTROL mbufs present before any data mbufs
3313          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3314          * just copy the data; if !MSG_PEEK, we call into the protocol to
3315          * perform externalization.
3316          */
3317         if (m != NULL && m->m_type == MT_CONTROL) {
3318                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3319                 if (error != 0)
3320                         goto release;
3321                 orig_resid = 0;
3322         }
3323
3324         /*
3325          * If the socket is a TCP socket with message delivery
3326          * enabled, then create a control msg to deliver the
3327          * relative TCP sequence number for this data. Waiting
3328          * until this point will protect against failures to
3329          * allocate an mbuf for control msgs.
3330          */
3331         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3332             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3333                 struct mbuf *seq_cm;
3334
3335                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3336                     sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3337                 if (seq_cm == NULL) {
3338                         /* unable to allocate a control mbuf */
3339                         error = ENOBUFS;
3340                         goto release;
3341                 }
3342                 *controlp = seq_cm;
3343                 controlp = &seq_cm->m_next;
3344         }
3345
3346         if (m != NULL) {
3347                 if (!(flags & MSG_PEEK)) {
3348                         /*
3349                          * We get here because m points to an mbuf following
3350                          * any MT_SONAME or MT_CONTROL mbufs which have been
3351                          * processed above.  In any case, m should be pointing
3352                          * to the head of the mbuf chain, and the nextrecord
3353                          * should be either NULL or equal to m->m_nextpkt.
3354                          * See comments above about SB_LOCK.
3355                          */
3356                         if (m != so->so_rcv.sb_mb ||
3357                             m->m_nextpkt != nextrecord) {
3358                                 panic("%s: post-control !sync so=%p m=%p "
3359                                     "nextrecord=%p\n", __func__, so, m,
3360                                     nextrecord);
3361                                 /* NOTREACHED */
3362                         }
3363                         if (nextrecord == NULL)
3364                                 so->so_rcv.sb_lastrecord = m;
3365                 }
3366                 type = m->m_type;
3367                 if (type == MT_OOBDATA)
3368                         flags |= MSG_OOB;
3369         } else {
3370                 if (!(flags & MSG_PEEK)) {
3371                         SB_EMPTY_FIXUP(&so->so_rcv);
3372                 }
3373         }
3374         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3375         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3376
3377         moff = 0;
3378         offset = 0;
3379
3380         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3381                 can_delay = 1;
3382         else
3383                 can_delay = 0;
3384
3385         need_event = 0;
3386
3387         while (m != NULL &&
3388             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3389                 if (m->m_type == MT_OOBDATA) {
3390                         if (type != MT_OOBDATA)
3391                                 break;
3392                 } else if (type == MT_OOBDATA) {
3393                         break;
3394                 }
3395                 /*
3396                  * Make sure to always set MSG_OOB event when getting
3397                  * out of band data inline.
3398                  */
3399                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3400                     (so->so_options & SO_OOBINLINE) != 0 &&
3401                     (so->so_state & SS_RCVATMARK) != 0) {
3402                         flags |= MSG_OOB;
3403                 }
3404                 so->so_state &= ~SS_RCVATMARK;
3405                 len = uio_resid(uio) - delayed_copy_len;
3406                 if (so->so_oobmark && len > so->so_oobmark - offset)
3407                         len = so->so_oobmark - offset;
3408                 if (len > m->m_len - moff)
3409                         len = m->m_len - moff;
3410                 /*
3411                  * If mp is set, just pass back the mbufs.
3412                  * Otherwise copy them out via the uio, then free.
3413                  * Sockbuf must be consistent here (points to current mbuf,
3414                  * it points to next record) when we drop priority;
3415                  * we must note any additions to the sockbuf when we
3416                  * block interrupts again.
3417                  */
3418                 if (mp == NULL) {
3419                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3420                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3421                         if (can_delay && len == m->m_len) {
3422                                 /*
3423                                  * only delay the copy if we're consuming the
3424                                  * mbuf and we're NOT in MSG_PEEK mode
3425                                  * and we have enough data to make it worthwhile
3426                                  * to drop and retake the lock... can_delay
3427                                  * reflects the state of the 2 latter
3428                                  * constraints moff should always be zero
3429                                  * in these cases
3430                                  */
3431                                 delayed_copy_len += len;
3432                         } else {
3433                                 if (delayed_copy_len) {
3434                                         error = sodelayed_copy(so, uio,
3435                                             &free_list, &delayed_copy_len);
3436
3437                                         if (error) {
3438                                                 goto release;
3439                                         }
3440                                         /*
3441                                          * can only get here if MSG_PEEK is not
3442                                          * set therefore, m should point at the
3443                                          * head of the rcv queue; if it doesn't,
3444                                          * it means something drastically
3445                                          * changed while we were out from behind
3446                                          * the lock in sodelayed_copy. perhaps
3447                                          * a RST on the stream. in any event,
3448                                          * the stream has been interrupted. it's
3449                                          * probably best just to return whatever
3450                                          * data we've moved and let the caller
3451                                          * sort it out...
3452                                          */
3453                                         if (m != so->so_rcv.sb_mb) {
3454                                                 break;
3455                                         }
3456                                 }
3457                                 socket_unlock(so, 0);
3458                                 error = uiomove(mtod(m, caddr_t) + moff,
3459                                     (int)len, uio);
3460                                 socket_lock(so, 0);
3461
3462                                 if (error)
3463                                         goto release;
3464                         }
3465                 } else {
3466                         uio_setresid(uio, (uio_resid(uio) - len));
3467                 }
3468                 if (len == m->m_len - moff) {
3469                         if (m->m_flags & M_EOR)
3470                                 flags |= MSG_EOR;
3471                         if (flags & MSG_PEEK) {
3472                                 m = m->m_next;
3473                                 moff = 0;
3474                         } else {
3475                                 nextrecord = m->m_nextpkt;
3476                                 sbfree(&so->so_rcv, m);
3477                                 m->m_nextpkt = NULL;
3478
3479                                 /*
3480                                  * If this packet is an unordered packet
3481                                  * (indicated by M_UNORDERED_DATA flag), remove
3482                                  * the additional bytes added to the
3483                                  * receive socket buffer size.
3484                                  */
3485                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3486                                     m->m_len &&
3487                                     (m->m_flags & M_UNORDERED_DATA) &&
3488                                     sbreserve(&so->so_rcv,
3489                                     so->so_rcv.sb_hiwat - m->m_len)) {
3490                                         if (so->so_msg_state->msg_uno_bytes >
3491                                             m->m_len) {
3492                                                 so->so_msg_state->
3493                                                     msg_uno_bytes -= m->m_len;
3494                                         } else {
3495                                                 so->so_msg_state->
3496                                                     msg_uno_bytes = 0;
3497                                         }
3498                                         m->m_flags &= ~M_UNORDERED_DATA;
3499                                 }
3500
3501                                 if (mp != NULL) {
3502                                         *mp = m;
3503                                         mp = &m->m_next;
3504                                         so->so_rcv.sb_mb = m = m->m_next;
3505                                         *mp = NULL;
3506                                 } else {
3507                                         if (free_list == NULL)
3508                                                 free_list = m;
3509                                         else
3510                                                 ml->m_next = m;
3511                                         ml = m;
3512                                         so->so_rcv.sb_mb = m = m->m_next;
3513                                         ml->m_next = NULL;
3514                                 }
3515                                 if (m != NULL) {
3516                                         m->m_nextpkt = nextrecord;
3517                                         if (nextrecord == NULL)
3518                                                 so->so_rcv.sb_lastrecord = m;
3519                                 } else {
3520                                         so->so_rcv.sb_mb = nextrecord;
3521                                         SB_EMPTY_FIXUP(&so->so_rcv);
3522                                 }
3523                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3524                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3525                         }
3526                 } else {
3527                         if (flags & MSG_PEEK) {
3528                                 moff += len;
3529                         } else {
3530                                 if (mp != NULL) {
3531                                         int copy_flag;
3532
3533                                         if (flags & MSG_DONTWAIT)
3534                                                 copy_flag = M_DONTWAIT;
3535                                         else
3536                                                 copy_flag = M_WAIT;
3537                                         *mp = m_copym(m, 0, len, copy_flag);
3538                                         /*
3539                                          * Failed to allocate an mbuf?
3540                                          * Adjust uio_resid back, it was
3541                                          * adjusted down by len bytes which
3542                                          * we didn't copy over.
3543                                          */
3544                                         if (*mp == NULL) {
3545                                                 uio_setresid(uio,
3546                                                     (uio_resid(uio) + len));
3547                                                 break;
3548                                         }
3549                                 }
3550                                 m->m_data += len;
3551                                 m->m_len -= len;
3552                                 so->so_rcv.sb_cc -= len;
3553                         }
3554                 }
3555                 if (so->so_oobmark) {
3556                         if ((flags & MSG_PEEK) == 0) {
3557                                 so->so_oobmark -= len;
3558                                 if (so->so_oobmark == 0) {
3559                                         so->so_state |= SS_RCVATMARK;
3560                                         /*
3561                                          * delay posting the actual event until
3562                                          * after any delayed copy processing
3563                                          * has finished
3564                                          */
3565                                         need_event = 1;
3566                                         break;
3567                                 }
3568                         } else {
3569                                 offset += len;
3570                                 if (offset == so->so_oobmark)
3571                                         break;
3572                         }
3573                 }
3574                 if (flags & MSG_EOR)
3575                         break;
3576                 /*
3577                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3578                  * (for non-atomic socket), we must not quit until
3579                  * "uio->uio_resid == 0" or an error termination.
3580                  * If a signal/timeout occurs, return with a short
3581                  * count but without error.  Keep sockbuf locked
3582                  * against other readers.
3583                  */
3584                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3585                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3586                     !sosendallatonce(so) && !nextrecord) {
3587                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3588 #if CONTENT_FILTER
3589                             && cfil_sock_data_pending(&so->so_rcv) == 0
3590 #endif /* CONTENT_FILTER */
3591                             ))
3592                                 goto release;
3593
3594                         /*
3595                          * Depending on the protocol (e.g. TCP), the following
3596                          * might cause the socket lock to be dropped and later
3597                          * be reacquired, and more data could have arrived and
3598                          * have been appended to the receive socket buffer by
3599                          * the time it returns.  Therefore, we only sleep in
3600                          * sbwait() below if and only if the socket buffer is
3601                          * empty, in order to avoid a false sleep.
3602                          */
3603                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3604                             (((struct inpcb *)so->so_pcb)->inp_state !=
3605                             INPCB_STATE_DEAD))
3606                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3607
3608                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3609                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3610
3611                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3612                                 error = 0;
3613                                 goto release;
3614                         }
3615                         /*
3616                          * have to wait until after we get back from the sbwait
3617                          * to do the copy because we will drop the lock if we
3618                          * have enough data that has been delayed... by dropping
3619                          * the lock we open up a window allowing the netisr
3620                          * thread to process the incoming packets and to change
3621                          * the state of this socket... we're issuing the sbwait
3622                          * because the socket is empty and we're expecting the
3623                          * netisr thread to wake us up when more packets arrive;
3624                          * if we allow that processing to happen and then sbwait
3625                          * we could stall forever with packets sitting in the
3626                          * socket if no further packets arrive from the remote
3627                          * side.
3628                          *
3629                          * we want to copy before we've collected all the data
3630                          * to satisfy this request to allow the copy to overlap
3631                          * the incoming packet processing on an MP system
3632                          */
3633                         if (delayed_copy_len > sorecvmincopy &&
3634                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3635                                 error = sodelayed_copy(so, uio,
3636                                     &free_list, &delayed_copy_len);
3637
3638                                 if (error)
3639                                         goto release;
3640                         }
3641                         m = so->so_rcv.sb_mb;
3642                         if (m != NULL) {
3643                                 nextrecord = m->m_nextpkt;
3644                         }
3645                         SB_MB_CHECK(&so->so_rcv);
3646                 }
3647         }
3648 #ifdef MORE_LOCKING_DEBUG
3649         if (so->so_usecount <= 1) {
3650                 panic("%s: after big while so=%p ref=%d on socket\n",
3651                     __func__, so, so->so_usecount);
3652                 /* NOTREACHED */
3653         }
3654 #endif
3655
3656         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3657                 if (so->so_options & SO_DONTTRUNC) {
3658                         flags |= MSG_RCVMORE;
3659                 } else {
3660                         flags |= MSG_TRUNC;
3661                         if ((flags & MSG_PEEK) == 0)
3662                                 (void) sbdroprecord(&so->so_rcv);
3663                 }
3664         }
3665
3666         /*
3667          * pru_rcvd below (for TCP) may cause more data to be received
3668          * if the socket lock is dropped prior to sending the ACK; some
3669          * legacy OpenTransport applications don't handle this well
3670          * (if it receives less data than requested while MSG_HAVEMORE
3671          * is set), and so we set the flag now based on what we know
3672          * prior to calling pru_rcvd.
3673          */
3674         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3675                 flags |= MSG_HAVEMORE;
3676
3677         if ((flags & MSG_PEEK) == 0) {
3678                 if (m == NULL) {
3679                         so->so_rcv.sb_mb = nextrecord;
3680                         /*
3681                          * First part is an inline SB_EMPTY_FIXUP().  Second
3682                          * part makes sure sb_lastrecord is up-to-date if
3683                          * there is still data in the socket buffer.
3684                          */
3685                         if (so->so_rcv.sb_mb == NULL) {
3686                                 so->so_rcv.sb_mbtail = NULL;
3687                                 so->so_rcv.sb_lastrecord = NULL;
3688                         } else if (nextrecord->m_nextpkt == NULL) {
3689                                 so->so_rcv.sb_lastrecord = nextrecord;
3690                         }
3691                         SB_MB_CHECK(&so->so_rcv);
3692                 }
3693                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3694                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3695                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3696                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3697         }
3698
3699         if (delayed_copy_len) {
3700                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3701                 if (error)
3702                         goto release;
3703         }
3704         if (free_list != NULL) {
3705                 m_freem_list(free_list);
3706                 free_list = NULL;
3707         }
3708         if (need_event)
3709                 postevent(so, 0, EV_OOB);
3710
3711         if (orig_resid == uio_resid(uio) && orig_resid &&
3712             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3713                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3714                 goto restart;
3715         }
3716
3717         if (flagsp != NULL)
3718                 *flagsp |= flags;
3719 release:
3720 #ifdef MORE_LOCKING_DEBUG
3721         if (so->so_usecount <= 1) {
3722                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3723                     so, so->so_usecount);
3724                 /* NOTREACHED */
3725         }
3726 #endif
3727         if (delayed_copy_len)
3728                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3729
3730         if (free_list != NULL)
3731                 m_freem_list(free_list);
3732
3733         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
3734
3735         if (en_tracing) {
3736                 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3737                     VM_KERNEL_ADDRPERM(so),
3738                     ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3739                     (int64_t)(orig_resid - uio_resid(uio)));
3740         }
3741         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3742             so->so_rcv.sb_cc, 0, error);
3743
3744         return (error);
3745 }
3746
3747 /*
3748  * Returns:     0                       Success
3749  *      uiomove:EFAULT
3750  */
3751 static int
3752 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3753     user_ssize_t *resid)
3754 {
3755         int error = 0;
3756         struct mbuf *m;
3757
3758         m = *free_list;
3759
3760         socket_unlock(so, 0);
3761
3762         while (m != NULL && error == 0) {
3763                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3764                 m = m->m_next;
3765         }
3766         m_freem_list(*free_list);
3767
3768         *free_list = NULL;
3769         *resid = 0;
3770
3771         socket_lock(so, 0);
3772
3773         return (error);
3774 }
3775
3776 static int
3777 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3778     u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3779 {
3780 #pragma unused(so)
3781         int error = 0;
3782         struct mbuf *ml, *m;
3783         int i = 0;
3784         struct uio *auio;
3785
3786         for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3787             ml = ml->m_nextpkt, i++) {
3788                 auio = msgarray[i].uio;
3789                 for (m = ml; m != NULL; m = m->m_next) {
3790                         error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3791                         if (error != 0)
3792                                 goto out;
3793                 }
3794         }
3795 out:
3796         m_freem_list(*free_list);
3797
3798         *free_list = NULL;
3799         *resid = 0;
3800
3801         return (error);
3802 }
3803
3804 int
3805 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3806     int *flagsp)
3807 {
             /*
              * Receive up to uiocnt datagrams from so->so_rcv in one call.  The
              * n-th datagram taken off the receive buffer is copied through
              * msgarray[n].uio; the element's psa / controlp members are handed
              * to soreceive_addr() / soreceive_ctl() when its `which' bits
              * include SOCK_MSG_SA / SOCK_MSG_CONTROL.
              *
              * flagsp may be NULL.  On entry *flagsp supplies the MSG_* flags;
              * anything other than MSG_PEEK, MSG_WAITALL, MSG_DONTWAIT,
              * MSG_NEEDSA and MSG_NBIO is rejected.  *flagsp is OR-ed with the
              * resulting flags (MSG_RCVMORE or MSG_TRUNC may have been added)
              * only when the main path falls through to the `release' label;
              * the "goto release" and "goto out" exits leave it untouched.
              *
              * Returns:     0               Success
              *              EINVAL          Unsupported flags, socket is not
              *                              SOCK_DGRAM, sosendallatonce() is
              *                              false, msgarray is NULL, uiocnt is
              *                              0, or the summed residual is
              *                              negative or above INT_MAX
              *              EPROTONOSUPPORT Protocol has no pru_send_list
              *              ENOTCONN        Socket is defunct, or protocol is
              *                              PR_CONNREQUIRED and the socket is
              *                              neither connected nor connecting
              *              EWOULDBLOCK     Would have to wait on a
              *                              non-blocking request
              *              so->so_error    Pending socket error
              *      sblock, sbwait, soreceive_addr, soreceive_ctl, uiomove:
              *                              value returned by the callee
              *
              * When a delayed copy is still pending at `release', the value
              * returned is the result of sodelayed_copy_list(), which replaces
              * any error recorded earlier.
              */
3808         struct mbuf *m;                 /* mbuf currently being examined */
3809         struct mbuf *nextrecord;        /* record following the current one */
             /*
              * free_list: consumed packets, linked through m_nextpkt.
              * free_tail: first mbuf of the last packet on free_list.
              * ml:        last mbuf of the current packet moved to free_list.
              */
3810         struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3811         int error;
             /*
              * pktlen accumulates the bytes accepted for the current packet;
              * delayed_copy_len accumulates bytes whose copyout was deferred.
              */
3812         user_ssize_t len, pktlen, delayed_copy_len = 0;
3813         struct protosw *pr = so->so_proto;
3814         user_ssize_t resid;
3815         struct proc *p = current_proc();
3816         struct uio *auio = NULL;
3817         int npkts = 0;                  /* msgarray[] entries consumed so far */
3818         int sblocked = 0;               /* set while we hold sblock() on so_rcv */
3819         struct sockaddr **psa = NULL;
3820         struct mbuf **controlp = NULL;
3821         int can_delay;                  /* defer copyout to sodelayed_copy_list() */
3822         int flags;
3823         struct mbuf *free_others = NULL; /* never assigned below; freed at out */
3824
3825         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3826             so, uiocnt,
3827             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3828
3829         /*
3830          * Sanity checks:
3831          * - Only supports don't wait flags
3832          * - Only support datagram sockets (could be extended to raw)
3833          * - Must be atomic
3834          * - Protocol must support packet chains
3835          * - The uio array is NULL (should we panic?)
3836          */
3837         if (flagsp != NULL)
3838                 flags = *flagsp;
3839         else
3840                 flags = 0;
3841         if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3842             MSG_NBIO)) {
3843                 printf("%s invalid flags 0x%x\n", __func__, flags);
3844                 error = EINVAL;
3845                 goto out;
3846         }
3847         if (so->so_type != SOCK_DGRAM) {
3848                 error = EINVAL;
3849                 goto out;
3850         }
3851         if (sosendallatonce(so) == 0) {
3852                 error = EINVAL;
3853                 goto out;
3854         }
             /*
              * NOTE(review): this receive path is gated on pru_send_list,
              * presumably as a proxy for "protocol supports packet chains"
              * (see the list above) -- confirm that is the intent.
              */
3855         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3856                 error = EPROTONOSUPPORT;
3857                 goto out;
3858         }
3859         if (msgarray == NULL) {
3860                 printf("%s uioarray is NULL\n", __func__);
3861                 error = EINVAL;
3862                 goto out;
3863         }
3864         if (uiocnt == 0) {
3865                 printf("%s uiocnt is 0\n", __func__);
3866                 error = EINVAL;
3867                 goto out;
3868         }
3869         /*
3870          * Sanity check on the length passed by caller as we are making 'int'
3871          * comparisons
3872          */
3873         resid = recv_msg_array_resid(msgarray, uiocnt);
3874         if (resid < 0 || resid > INT_MAX) {
3875                 error = EINVAL;
3876                 goto out;
3877         }
3878
             /*
              * Copyout is deferred only for consuming (non-MSG_PEEK) reads and
              * only while sorecvmincopy is positive.
              */
3879         if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3880                 can_delay = 1;
3881         else
3882                 can_delay = 0;
3883
             /*
              * The socket lock is held from here on; every later exit goes
              * through `release', which drops it.
              */
3884         socket_lock(so, 1);
3885         so_update_last_owner_locked(so, p);
3886         so_update_policy(so);
3887
3888 #if NECP
3889         so_update_necp_policy(so, NULL, NULL);
3890 #endif /* NECP */
3891
3892         /*
3893          * If a recv attempt is made on a previously-accepted socket
3894          * that has been marked as inactive (disconnected), reject
3895          * the request.
3896          */
3897         if (so->so_flags & SOF_DEFUNCT) {
3898                 struct sockbuf *sb = &so->so_rcv;
3899
3900                 error = ENOTCONN;
3901                 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3902                     __func__, proc_pid(p), proc_best_name(p),
3903                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3904                     SOCK_DOM(so), SOCK_TYPE(so), error);
3905                 /*
3906                  * This socket should have been disconnected and flushed
3907                  * prior to being returned from sodefunct(); there should
3908                  * be no data on its receive list, so panic otherwise.
3909                  */
3910                 if (so->so_state & SS_DEFUNCT)
3911                         sb_empty_assert(sb, __func__);
3912                 goto release;
3913         }
3914
             /* Entry point for each additional datagram after the first */
3915 next:
3916         /*
3917          * The uio may be empty
3918          */
3919         if (npkts >= uiocnt) {
3920                 error = 0;
3921                 goto release;
3922         }
3923 restart:
3924         /*
3925          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3926          * and if so just return to the caller.  This could happen when
3927          * soreceive() is called by a socket upcall function during the
3928          * time the socket is freed.  The socket buffer would have been
3929          * locked across the upcall, therefore we cannot put this thread
3930          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3931          * we may livelock), because the lock on the socket buffer will
3932          * only be released when the upcall routine returns to its caller.
3933          * Because the socket has been officially closed, there can be
3934          * no further read on it.
3935          */
3936         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3937             (SS_NOFDREF | SS_CANTRCVMORE)) {
3938                 error = 0;
3939                 goto release;
3940         }
3941
3942         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3943         if (error) {
3944                 goto release;
3945         }
3946         sblocked = 1;
3947
3948         m = so->so_rcv.sb_mb;
3949         /*
3950          * Block awaiting more datagram if needed
3951          */
             /*
              * This branch is taken when the buffer is empty, or when
              * MSG_DONTWAIT is clear and either sb_cc is below sb_lowat or
              * MSG_WAITALL is set with msgarray slots still unfilled.
              */
3952         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3953             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3954             ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3955                 /*
3956                  * Panic if we notice inconsistencies in the socket's
3957                  * receive list; both sb_mb and sb_cc should correctly
3958                  * reflect the contents of the list, otherwise we may
3959                  * end up with false positives during select() or poll()
3960                  * which could put the application in a bad state.
3961                  */
3962                 SB_MB_CHECK(&so->so_rcv);
3963
3964                 if (so->so_error) {
3965                         error = so->so_error;
                             /* A peek reports the error without clearing it */
3966                         if ((flags & MSG_PEEK) == 0)
3967                                 so->so_error = 0;
3968                         goto release;
3969                 }
                     /* error is still 0 here (sblock() succeeded above) */
3970                 if (so->so_state & SS_CANTRCVMORE) {
3971                         goto release;
3972                 }
3973                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3974                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3975                         error = ENOTCONN;
3976                         goto release;
3977                 }
3978                 if ((so->so_state & SS_NBIO) ||
3979                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3980                         error = EWOULDBLOCK;
3981                         goto release;
3982                 }
3983                 /*
3984                  * Do not block if we got some data
3985                  */
3986                 if (free_list != NULL) {
3987                         error = 0;
3988                         goto release;
3989                 }
3990
3991                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3992                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3993
3994                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3995                 sblocked = 0;
3996
3997                 error = sbwait(&so->so_rcv);
3998                 if (error) {
3999                         goto release;
4000                 }
4001                 goto restart;
4002         }
4003
4004         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4005         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4006         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4007
4008         /*
4009          * Consume the current uio index as we have a datagram
4010          */
4011         auio = msgarray[npkts].uio;
4012         resid = uio_resid(auio);
4013         msgarray[npkts].which |= SOCK_MSG_DATA;
4014         psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4015             &msgarray[npkts].psa : NULL;
4016         controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4017             &msgarray[npkts].controlp : NULL;
4018         npkts += 1;
4019         nextrecord = m->m_nextpkt;
4020
             /*
              * NOTE(review): npkts has already been advanced, so the ERESTART
              * path below resumes with the following msgarray slot and leaves
              * this one marked SOCK_MSG_DATA without data -- confirm.
              */
4021         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4022                 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4023                 if (error == ERESTART)
4024                         goto restart;
4025                 else if (error != 0)
4026                         goto release;
4027         }
4028
4029         if (m != NULL && m->m_type == MT_CONTROL) {
4030                 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4031                 if (error != 0)
4032                         goto release;
4033         }
4034
             /*
              * NOTE(review): m is dereferenced here without a NULL check even
              * though the test just above allows for m == NULL -- confirm a
              * record can never consist solely of address/control mbufs.
              */
4035         if (m->m_pkthdr.len == 0) {
4036                 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4037                     __func__, __LINE__,
4038                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4039                     (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4040                     m->m_type);
4041         }
4042
4043         /*
4044          * Loop to copy the mbufs of the current record
4045          * Support zero length packets
4046          */
4047         ml = NULL;
4048         pktlen = 0;
             /*
              * On each pass len starts as the room left in this slot's uio
              * (resid - pktlen) and is then clipped to the mbuf's length.
              */
4049         while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4050                 if (m->m_len == 0)
4051                         panic("%p m_len zero", m);
4052                 if (m->m_type == 0)
4053                         panic("%p m_type zero", m);
4054                 /*
4055                  * Clip to the residual length
4056                  */
4057                 if (len > m->m_len)
4058                         len = m->m_len;
4059                 pktlen += len;
4060                 /*
4061                  * Copy the mbufs via the uio or delay the copy
4062                  * Sockbuf must be consistent here (points to current mbuf,
4063                  * it points to next record) when we drop priority;
4064                  * we must note any additions to the sockbuf when we
4065                  * block interrupts again.
4066                  */
4067                 if (len > 0 && can_delay == 0) {
                             /* Immediate copy: socket lock dropped around uiomove() */
4068                         socket_unlock(so, 0);
4069                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
4070                         socket_lock(so, 0);
4071                         if (error)
4072                                 goto release;
4073                 } else {
4074                         delayed_copy_len += len;
4075                 }
4076
4077                 if (len == m->m_len) {
4078                         /*
4079                          * m was entirely copied
4080                          */
4081                         sbfree(&so->so_rcv, m);
4082                         nextrecord = m->m_nextpkt;
4083                         m->m_nextpkt = NULL;
4084
4085                         /*
4086                          * Set the first packet to the head of the free list
4087                          */
4088                         if (free_list == NULL)
4089                                 free_list = m;
4090                         /*
4091                          * Link current packet to tail of free list
4092                          */
                             /* ml == NULL means m is the first mbuf of this packet */
4093                         if (ml == NULL) {
4094                                 if (free_tail != NULL)
4095                                         free_tail->m_nextpkt = m;
4096                                 free_tail = m;
4097                         }
4098                         /*
4099                          * Link current mbuf to last mbuf of current packet
4100                          */
4101                         if (ml != NULL)
4102                                 ml->m_next = m;
4103                         ml = m;
4104
4105                         /*
4106                          * Move next buf to head of socket buffer
4107                          */
4108                         so->so_rcv.sb_mb = m = ml->m_next;
4109                         ml->m_next = NULL;
4110
4111                         if (m != NULL) {
4112                                 m->m_nextpkt = nextrecord;
4113                                 if (nextrecord == NULL)
4114                                         so->so_rcv.sb_lastrecord = m;
4115                         } else {
4116                                 so->so_rcv.sb_mb = nextrecord;
4117                                 SB_EMPTY_FIXUP(&so->so_rcv);
4118                         }
4119                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4120                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4121                 } else {
4122                         /*
4123                          * Stop the loop on partial copy
4124                          */
4125                         break;
4126                 }
4127         }
4128 #ifdef MORE_LOCKING_DEBUG
4129         if (so->so_usecount <= 1) {
4130                 panic("%s: after big while so=%llx ref=%d on socket\n",
4131                     __func__,
4132                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4133                 /* NOTREACHED */
4134         }
4135 #endif
4136         /*
4137          * Tell the caller we made a partial copy
4138          */
             /*
              * m != NULL here means the loop stopped on a partially consumed
              * mbuf; len still holds the clipped byte count for that mbuf.
              */
4139         if (m != NULL) {
4140                 if (so->so_options & SO_DONTTRUNC) {
4141                         /*
4142                          * Copyout first the freelist then the partial mbuf
4143                          */
4144                         socket_unlock(so, 0);
4145                         if (delayed_copy_len)
4146                                 error = sodelayed_copy_list(so, msgarray,
4147                                     uiocnt, &free_list, &delayed_copy_len);
4148
4149                         if (error == 0) {
4150                                 error = uiomove(mtod(m, caddr_t), (int)len,
4151                                     auio);
4152                         }
4153                         socket_lock(so, 0);
4154                         if (error)
4155                                 goto release;
4156
                             /* Trim the bytes just copied off the front of m */
4157                         m->m_data += len;
4158                         m->m_len -= len;
4159                         so->so_rcv.sb_cc -= len;
4160                         flags |= MSG_RCVMORE;
4161                 } else {
                             /*
                              * NOTE(review): when can_delay is set, the len bytes
                              * of the partial mbuf were only counted in
                              * delayed_copy_len; m is not on free_list, so they
                              * appear to be discarded with the record rather
                              * than copied out -- confirm this is intended.
                              */
4162                         (void) sbdroprecord(&so->so_rcv);
4163                         nextrecord = so->so_rcv.sb_mb;
4164                         m = NULL;
4165                         flags |= MSG_TRUNC;
4166                 }
4167         }
4168
4169         if (m == NULL) {
4170                 so->so_rcv.sb_mb = nextrecord;
4171                 /*
4172                  * First part is an inline SB_EMPTY_FIXUP().  Second
4173                  * part makes sure sb_lastrecord is up-to-date if
4174                  * there is still data in the socket buffer.
4175                  */
4176                 if (so->so_rcv.sb_mb == NULL) {
4177                         so->so_rcv.sb_mbtail = NULL;
4178                         so->so_rcv.sb_lastrecord = NULL;
4179                 } else if (nextrecord->m_nextpkt == NULL) {
4180                         so->so_rcv.sb_lastrecord = nextrecord;
4181                 }
4182                 SB_MB_CHECK(&so->so_rcv);
4183         }
4184         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4185         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4186
4187         /*
4188          * We can continue to the next packet as long as:
4189          * - We haven't exhausted the uio array
4190          * - There was no error
4191          * - A packet was not truncated
4192          * - We can still receive more data
4193          */
4194         if (npkts < uiocnt && error == 0 &&
4195             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4196             (so->so_state & SS_CANTRCVMORE) == 0) {
4197                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
4198                 sblocked = 0;
4199
4200                 goto next;
4201         }
             /* Only this fall-through path reports flags back to the caller */
4202         if (flagsp != NULL)
4203                 *flagsp |= flags;
4204
4205 release:
4206         /*
4207          * pru_rcvd may cause more data to be received if the socket lock
4208          * is dropped so we set MSG_HAVEMORE now based on what we know.
4209          * That way the caller won't be surprised if it receives less data
4210          * than requested.
4211          */
             /*
              * *flagsp was already updated above, so MSG_HAVEMORE set here is
              * seen only by the pru_rcvd call below, not by the caller.
              */
4212         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4213                 flags |= MSG_HAVEMORE;
4214
4215         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4216                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4217
4218         if (sblocked)
4219                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
4220         else
4221                 socket_unlock(so, 1);
4222
             /*
              * Deferred copyout happens after the socket has been unlocked.
              * Its result overwrites whatever error was set on the way here.
              */
4223         if (delayed_copy_len)
4224                 error = sodelayed_copy_list(so, msgarray, uiocnt,
4225                     &free_list, &delayed_copy_len);
4226 out:
4227         /*
4228          * Amortize the cost of freeing the mbufs
4229          */
4230         if (free_list != NULL)
4231                 m_freem_list(free_list);
4232         if (free_others != NULL)
4233                 m_freem_list(free_others);
4234
4235         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4236             0, 0, 0, 0);
4237         return (error);
4238 }
4239
4240 /*
4241  * Returns:     0                       Success
4242  *              EINVAL
4243  *              ENOTCONN
4244  *      <pru_shutdown>:EINVAL
4245  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
4246  *      <pru_shutdown>:ENOBUFS[TCP]
4247  *      <pru_shutdown>:EMSGSIZE[TCP]
4248  *      <pru_shutdown>:EHOSTUNREACH[TCP]
4249  *      <pru_shutdown>:ENETUNREACH[TCP]
4250  *      <pru_shutdown>:ENETDOWN[TCP]
4251  *      <pru_shutdown>:ENOMEM[TCP]
4252  *      <pru_shutdown>:EACCES[TCP]
4253  *      <pru_shutdown>:EMSGSIZE[TCP]
4254  *      <pru_shutdown>:ENOBUFS[TCP]
4255  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
4256  *      <pru_shutdown>:???              [other protocol families]
4257  */
4258 int
4259 soshutdown(struct socket *so, int how)
4260 {
4261         int error;
4262
4263         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4264
4265         switch (how) {
4266         case SHUT_RD:
4267         case SHUT_WR:
4268         case SHUT_RDWR:
4269                 socket_lock(so, 1);
4270                 if ((so->so_state &
4271                     (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4272                         error = ENOTCONN;
4273                 } else {
4274                         error = soshutdownlock(so, how);
4275                 }
4276                 socket_unlock(so, 1);
4277                 break;
4278         default:
4279                 error = EINVAL;
4280                 break;
4281         }
4282
4283         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4284
4285         return (error);
4286 }
4287
4288 int
4289 soshutdownlock_final(struct socket *so, int how)
4290 {
4291         struct protosw *pr = so->so_proto;
4292         int error = 0;
4293
4294         sflt_notify(so, sock_evt_shutdown, &how);
4295
4296         if (how != SHUT_WR) {
4297                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4298                         /* read already shut down */
4299                         error = ENOTCONN;
4300                         goto done;
4301                 }
4302                 sorflush(so);
4303                 postevent(so, 0, EV_RCLOSED);
4304         }
4305         if (how != SHUT_RD) {
4306                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4307                         /* write already shut down */
4308                         error = ENOTCONN;
4309                         goto done;
4310                 }
4311                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4312                 postevent(so, 0, EV_WCLOSED);
4313         }
4314 done:
4315         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4316         return (error);
4317 }
4318
/*
 * soshutdownlock
 *
 * Shut down a socket, giving an attached content filter the first say.
 * When CONTENT_FILTER is built in and the socket has SOF_CONTENT_FILTER
 * set, cfil_sock_shutdown() is consulted: EJUSTRETURN from it is reported
 * to the caller as success without going further, and any other non-zero
 * value is returned as is.  In every other case the result is that of
 * soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * The filter may hold back the real shutdown until it is done
         * with the data it still has pending
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_rc = cfil_sock_shutdown(so, &how);

                if (cfil_rc == EJUSTRETURN)
                        return (0);
                if (cfil_rc != 0)
                        return (cfil_rc);
        }
#endif /* CONTENT_FILTER */

        return (soshutdownlock_final(so, how));
}
4345
4346 void
4347 sowflush(struct socket *so)
4348 {
4349         struct sockbuf *sb = &so->so_snd;
4350
4351         /*
4352          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4353          * to prevent the socket buffer from being unexpectedly altered
4354          * while it is used by another thread in socket send/receive.
4355          *
4356          * sblock() must not fail here, hence the assertion.
4357          */
4358         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4359         VERIFY(sb->sb_flags & SB_LOCK);
4360
4361         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4362         sb->sb_flags            |= SB_DROP;
4363         sb->sb_upcall           = NULL;
4364         sb->sb_upcallarg        = NULL;
4365
4366         sbunlock(sb, TRUE);     /* keep socket locked */
4367
4368         selthreadclear(&sb->sb_sel);
4369         sbrelease(sb);
4370 }
4371
4372 void
4373 sorflush(struct socket *so)
4374 {
4375         struct sockbuf *sb = &so->so_rcv;
4376         struct protosw *pr = so->so_proto;
4377         struct sockbuf asb;
4378 #ifdef notyet
4379         lck_mtx_t *mutex_held;
4380         /*
4381          * XXX: This code is currently commented out, because we may get here
4382          * as part of sofreelastref(), and at that time, pr_getlock() may no
4383          * longer be able to return us the lock; this will be fixed in future.
4384          */
4385         if (so->so_proto->pr_getlock != NULL)
4386                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4387         else
4388                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4389
4390         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4391 #endif /* notyet */
4392
4393         sflt_notify(so, sock_evt_flush_read, NULL);
4394
4395         socantrcvmore(so);
4396
4397         /*
4398          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4399          * to prevent the socket buffer from being unexpectedly altered
4400          * while it is used by another thread in socket send/receive.
4401          *
4402          * sblock() must not fail here, hence the assertion.
4403          */
4404         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4405         VERIFY(sb->sb_flags & SB_LOCK);
4406
4407         /*
4408          * Copy only the relevant fields from "sb" to "asb" which we
4409          * need for sbrelease() to function.  In particular, skip
4410          * sb_sel as it contains the wait queue linkage, which would
4411          * wreak havoc if we were to issue selthreadclear() on "asb".
4412          * Make sure to not carry over SB_LOCK in "asb", as we need
4413          * to acquire it later as part of sbrelease().
4414          */
4415         bzero(&asb, sizeof (asb));
4416         asb.sb_cc               = sb->sb_cc;
4417         asb.sb_hiwat            = sb->sb_hiwat;
4418         asb.sb_mbcnt            = sb->sb_mbcnt;
4419         asb.sb_mbmax            = sb->sb_mbmax;
4420         asb.sb_ctl              = sb->sb_ctl;
4421         asb.sb_lowat            = sb->sb_lowat;
4422         asb.sb_mb               = sb->sb_mb;
4423         asb.sb_mbtail           = sb->sb_mbtail;
4424         asb.sb_lastrecord       = sb->sb_lastrecord;
4425         asb.sb_so               = sb->sb_so;
4426         asb.sb_flags            = sb->sb_flags;
4427         asb.sb_flags            &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4428         asb.sb_flags            |= SB_DROP;
4429
4430         /*
4431          * Ideally we'd bzero() these and preserve the ones we need;
4432          * but to do that we'd need to shuffle things around in the
4433          * sockbuf, and we can't do it now because there are KEXTS
4434          * that are directly referring to the socket structure.
4435          *
4436          * Setting SB_DROP acts as a barrier to prevent further appends.
4437          * Clearing SB_SEL is done for selthreadclear() below.
4438          */
4439         sb->sb_cc               = 0;
4440         sb->sb_hiwat            = 0;
4441         sb->sb_mbcnt            = 0;
4442         sb->sb_mbmax            = 0;
4443         sb->sb_ctl              = 0;
4444         sb->sb_lowat            = 0;
4445         sb->sb_mb               = NULL;
4446         sb->sb_mbtail           = NULL;
4447         sb->sb_lastrecord       = NULL;
4448         sb->sb_timeo.tv_sec     = 0;
4449         sb->sb_timeo.tv_usec    = 0;
4450         sb->sb_upcall           = NULL;
4451         sb->sb_upcallarg        = NULL;
4452         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4453         sb->sb_flags            |= SB_DROP;
4454
4455         sbunlock(sb, TRUE);     /* keep socket locked */
4456
4457         /*
4458          * Note that selthreadclear() is called on the original "sb" and
4459          * not the local "asb" because of the way wait queue linkage is
4460          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4461          * should no longer be set (cleared above.)
4462          */
4463         selthreadclear(&sb->sb_sel);
4464
4465         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4466                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4467
4468         sbrelease(&asb);
4469 }
4470
4471 /*
4472  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4473  * an additional variant to handle the case where the option value needs
4474  * to be some kind of integer, but not a specific size.
4475  * In addition to their use here, these functions are also called by the
4476  * protocol-level pr_ctloutput() routines.
4477  *
4478  * Returns:     0                       Success
4479  *              EINVAL
4480  *      copyin:EFAULT
4481  */
4482 int
4483 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4484 {
4485         size_t  valsize;
4486
4487         /*
4488          * If the user gives us more than we wanted, we ignore it,
4489          * but if we don't get the minimum length the caller
4490          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4491          * is set to however much we actually retrieved.
4492          */
4493         if ((valsize = sopt->sopt_valsize) < minlen)
4494                 return (EINVAL);
4495         if (valsize > len)
4496                 sopt->sopt_valsize = valsize = len;
4497
4498         if (sopt->sopt_p != kernproc)
4499                 return (copyin(sopt->sopt_val, buf, valsize));
4500
4501         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4502         return (0);
4503 }
4504
4505 /*
4506  * sooptcopyin_timeval
4507  *   Copy in a timeval value into tv_p, and take into account whether the
4508  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4509  *   code here so that we can verify the 64-bit tv_sec value before we lose
4510  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4511  */
4512 static int
4513 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4514 {
4515         int                     error;
4516
4517         if (proc_is64bit(sopt->sopt_p)) {
4518                 struct user64_timeval   tv64;
4519
4520                 if (sopt->sopt_valsize < sizeof (tv64))
4521                         return (EINVAL);
4522
4523                 sopt->sopt_valsize = sizeof (tv64);
4524                 if (sopt->sopt_p != kernproc) {
4525                         error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4526                         if (error != 0)
4527                                 return (error);
4528                 } else {
4529                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4530                             sizeof (tv64));
4531                 }
4532                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4533                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4534                         return (EDOM);
4535
4536                 tv_p->tv_sec = tv64.tv_sec;
4537                 tv_p->tv_usec = tv64.tv_usec;
4538         } else {
4539                 struct user32_timeval   tv32;
4540
4541                 if (sopt->sopt_valsize < sizeof (tv32))
4542                         return (EINVAL);
4543
4544                 sopt->sopt_valsize = sizeof (tv32);
4545                 if (sopt->sopt_p != kernproc) {
4546                         error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4547                         if (error != 0) {
4548                                 return (error);
4549                         }
4550                 } else {
4551                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4552                             sizeof (tv32));
4553                 }
4554 #ifndef __LP64__
4555                 /*
4556                  * K64todo "comparison is always false due to
4557                  * limited range of data type"
4558                  */
4559                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4560                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4561                         return (EDOM);
4562 #endif
4563                 tv_p->tv_sec = tv32.tv_sec;
4564                 tv_p->tv_usec = tv32.tv_usec;
4565         }
4566         return (0);
4567 }
4568
4569 static int
4570 soopt_cred_check(struct socket *so, int priv)
4571 {
4572         kauth_cred_t cred =  NULL;
4573         proc_t ep = PROC_NULL;
4574         int error;
4575
4576         if (so->so_flags & SOF_DELEGATED) {
4577                 ep = proc_find(so->e_pid);
4578                 if (ep)
4579                         cred = kauth_cred_proc_ref(ep);
4580         }
4581         error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4582         if (cred)
4583                 kauth_cred_unref(&cred);
4584         if (ep != PROC_NULL)
4585                 proc_rele(ep);
4586
4587         return (error);
4588 }
4589
4590 /*
4591  * Returns:     0                       Success
4592  *              EINVAL
4593  *              ENOPROTOOPT
4594  *              ENOBUFS
4595  *              EDOM
4596  *      sooptcopyin:EINVAL
4597  *      sooptcopyin:EFAULT
4598  *      sooptcopyin_timeval:EINVAL
4599  *      sooptcopyin_timeval:EFAULT
4600  *      sooptcopyin_timeval:EDOM
4601  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4602  *      <pr_ctloutput>:???
4603  *      sflt_attach_internal:???        [whatever a filter author chooses]
4604  *      <sf_setoption>:???              [whatever a filter author chooses]
4605  *
4606  * Notes:       Other <pr_ctloutput> returns depend on the protocol family;
4607  *              all <sf_setoption> returns depend on what the filter author
4608  *              causes their filter to return.
4609  */
4610 int
4611 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4612 {
4613         int     error, optval;
4614         struct  linger l;
4615         struct  timeval tv;
4616 #if CONFIG_MACF_SOCKET
4617         struct mac extmac;
4618 #endif /* MAC_SOCKET */
4619
4620         if (sopt->sopt_dir != SOPT_SET)
4621                 sopt->sopt_dir = SOPT_SET;
4622
4623         if (dolock)
4624                 socket_lock(so, 1);
4625
4626         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4627             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4628             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4629                 /* the socket has been shutdown, no more sockopt's */
4630                 error = EINVAL;
4631                 goto out;
4632         }
4633
4634         error = sflt_setsockopt(so, sopt);
4635         if (error != 0) {
4636                 if (error == EJUSTRETURN)
4637                         error = 0;
4638                 goto out;
4639         }
4640
4641         if (sopt->sopt_level != SOL_SOCKET) {
4642                 if (so->so_proto != NULL &&
4643                     so->so_proto->pr_ctloutput != NULL) {
4644                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4645                         goto out;
4646                 }
4647                 error = ENOPROTOOPT;
4648         } else {
4649                 /*
4650                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4651                  * the protocol layer, if needed.  A zero value returned from
4652                  * the handler means use default socket-level processing as
4653                  * done by the rest of this routine.  Otherwise, any other
4654                  * return value indicates that the option is unsupported.
4655                  */
4656                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4657                     pru_socheckopt(so, sopt)) != 0)
4658                         goto out;
4659
4660                 error = 0;
4661                 switch (sopt->sopt_name) {
4662                 case SO_LINGER:
4663                 case SO_LINGER_SEC:
4664                         error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4665                         if (error != 0)
4666                                 goto out;
4667
4668                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4669                             l.l_linger : l.l_linger * hz;
4670                         if (l.l_onoff != 0)
4671                                 so->so_options |= SO_LINGER;
4672                         else
4673                                 so->so_options &= ~SO_LINGER;
4674                         break;
4675
4676                 case SO_DEBUG:
4677                 case SO_KEEPALIVE:
4678                 case SO_DONTROUTE:
4679                 case SO_USELOOPBACK:
4680                 case SO_BROADCAST:
4681                 case SO_REUSEADDR:
4682                 case SO_REUSEPORT:
4683                 case SO_OOBINLINE:
4684                 case SO_TIMESTAMP:
4685                 case SO_TIMESTAMP_MONOTONIC:
4686                 case SO_DONTTRUNC:
4687                 case SO_WANTMORE:
4688                 case SO_WANTOOBFLAG:
4689                 case SO_NOWAKEFROMSLEEP:
4690                 case SO_NOAPNFALLBK:
4691                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4692                             sizeof (optval));
4693                         if (error != 0)
4694                                 goto out;
4695                         if (optval)
4696                                 so->so_options |= sopt->sopt_name;
4697                         else
4698                                 so->so_options &= ~sopt->sopt_name;
4699                         break;
4700
4701                 case SO_SNDBUF:
4702                 case SO_RCVBUF:
4703                 case SO_SNDLOWAT:
4704                 case SO_RCVLOWAT:
4705                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4706                             sizeof (optval));
4707                         if (error != 0)
4708                                 goto out;
4709
4710                         /*
4711                          * Values < 1 make no sense for any of these
4712                          * options, so disallow them.
4713                          */
4714                         if (optval < 1) {
4715                                 error = EINVAL;
4716                                 goto out;
4717                         }
4718
4719                         switch (sopt->sopt_name) {
4720                         case SO_SNDBUF:
4721                         case SO_RCVBUF: {
4722                                 struct sockbuf *sb =
4723                                     (sopt->sopt_name == SO_SNDBUF) ?
4724                                     &so->so_snd : &so->so_rcv;
4725                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4726                                         error = ENOBUFS;
4727                                         goto out;
4728                                 }
4729                                 sb->sb_flags |= SB_USRSIZE;
4730                                 sb->sb_flags &= ~SB_AUTOSIZE;
4731                                 sb->sb_idealsize = (u_int32_t)optval;
4732                                 break;
4733                         }
4734                         /*
4735                          * Make sure the low-water is never greater than
4736                          * the high-water.
4737                          */
4738                         case SO_SNDLOWAT: {
4739                                 int space = sbspace(&so->so_snd);
4740                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
4741
4742                                 if (so->so_snd.sb_flags & SB_UNIX) {
4743                                         struct unpcb *unp =
4744                                             (struct unpcb *)(so->so_pcb);
4745                                         if (unp != NULL &&
4746                                             unp->unp_conn != NULL) {
4747                                                 hiwat += unp->unp_conn->unp_cc;
4748                                         }
4749                                 }
4750
4751                                 so->so_snd.sb_lowat =
4752                                     (optval > hiwat) ?
4753                                     hiwat : optval;
4754
4755                                 if (space >= so->so_snd.sb_lowat) {
4756                                         sowwakeup(so);
4757                                 }
4758                                 break;
4759                         }
4760                         case SO_RCVLOWAT: {
4761                                 int64_t data_len;
4762                                 so->so_rcv.sb_lowat =
4763                                     (optval > so->so_rcv.sb_hiwat) ?
4764                                     so->so_rcv.sb_hiwat : optval;
4765                                 data_len = so->so_rcv.sb_cc
4766                                     - so->so_rcv.sb_ctl;
4767                                 if (data_len >= so->so_rcv.sb_lowat)
4768                                     sorwakeup(so);
4769                                 break;
4770                         }
4771                         }
4772                         break;
4773
4774                 case SO_SNDTIMEO:
4775                 case SO_RCVTIMEO:
4776                         error = sooptcopyin_timeval(sopt, &tv);
4777                         if (error != 0)
4778                                 goto out;
4779
4780                         switch (sopt->sopt_name) {
4781                         case SO_SNDTIMEO:
4782                                 so->so_snd.sb_timeo = tv;
4783                                 break;
4784                         case SO_RCVTIMEO:
4785                                 so->so_rcv.sb_timeo = tv;
4786                                 break;
4787                         }
4788                         break;
4789
4790                 case SO_NKE: {
4791                         struct so_nke nke;
4792
4793                         error = sooptcopyin(sopt, &nke, sizeof (nke),
4794                             sizeof (nke));
4795                         if (error != 0)
4796                                 goto out;
4797
4798                         error = sflt_attach_internal(so, nke.nke_handle);
4799                         break;
4800                 }
4801
4802                 case SO_NOSIGPIPE:
4803                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4804                             sizeof (optval));
4805                         if (error != 0)
4806                                 goto out;
4807                         if (optval != 0)
4808                                 so->so_flags |= SOF_NOSIGPIPE;
4809                         else
4810                                 so->so_flags &= ~SOF_NOSIGPIPE;
4811                         break;
4812
4813                 case SO_NOADDRERR:
4814                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4815                             sizeof (optval));
4816                         if (error != 0)
4817                                 goto out;
4818                         if (optval != 0)
4819                                 so->so_flags |= SOF_NOADDRAVAIL;
4820                         else
4821                                 so->so_flags &= ~SOF_NOADDRAVAIL;
4822                         break;
4823
4824                 case SO_REUSESHAREUID:
4825                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4826                             sizeof (optval));
4827                         if (error != 0)
4828                                 goto out;
4829                         if (optval != 0)
4830                                 so->so_flags |= SOF_REUSESHAREUID;
4831                         else
4832                                 so->so_flags &= ~SOF_REUSESHAREUID;
4833                         break;
4834
4835                 case SO_NOTIFYCONFLICT:
4836                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4837                                 error = EPERM;
4838                                 goto out;
4839                         }
4840                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4841                             sizeof (optval));
4842                         if (error != 0)
4843                                 goto out;
4844                         if (optval != 0)
4845                                 so->so_flags |= SOF_NOTIFYCONFLICT;
4846                         else
4847                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4848                         break;
4849
4850                 case SO_RESTRICTIONS:
4851                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4852                             sizeof (optval));
4853                         if (error != 0)
4854                                 goto out;
4855
4856                         error = so_set_restrictions(so, optval);
4857                         break;
4858
4859                 case SO_AWDL_UNRESTRICTED:
4860                         if (SOCK_DOM(so) != PF_INET &&
4861                             SOCK_DOM(so) != PF_INET6) {
4862                                 error = EOPNOTSUPP;
4863                                 goto out;
4864                         }
4865                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4866                             sizeof(optval));
4867                         if (error != 0)
4868                                 goto out;
4869                         if (optval != 0) {
4870                                 error = soopt_cred_check(so,
4871                                     PRIV_NET_RESTRICTED_AWDL);
4872                                 if (error == 0)
4873                                         inp_set_awdl_unrestricted(
4874                                             sotoinpcb(so));
4875                         } else
4876                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
4877                         break;
4878                 case SO_INTCOPROC_ALLOW:
4879                         if (SOCK_DOM(so) != PF_INET6) {
4880                                 error = EOPNOTSUPP;
4881                                 goto out;
4882                         }
4883                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4884                             sizeof(optval));
4885                         if (error != 0)
4886                                 goto out;
4887                         if (optval != 0) {
4888                                 error = soopt_cred_check(so,
4889                                     PRIV_NET_RESTRICTED_INTCOPROC);
4890                                 if (error == 0)
4891                                         inp_set_intcoproc_allowed(
4892                                             sotoinpcb(so));
4893                         } else
4894                                 inp_clear_intcoproc_allowed(sotoinpcb(so));
4895                         break;
4896
4897                 case SO_LABEL:
4898 #if CONFIG_MACF_SOCKET
4899                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4900                             sizeof (extmac))) != 0)
4901                                 goto out;
4902
4903                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4904                             so, &extmac);
4905 #else
4906                         error = EOPNOTSUPP;
4907 #endif /* MAC_SOCKET */
4908                         break;
4909
4910                 case SO_UPCALLCLOSEWAIT:
4911                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4912                             sizeof (optval));
4913                         if (error != 0)
4914                                 goto out;
4915                         if (optval != 0)
4916                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4917                         else
4918                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4919                         break;
4920
4921                 case SO_RANDOMPORT:
4922                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4923                             sizeof (optval));
4924                         if (error != 0)
4925                                 goto out;
4926                         if (optval != 0)
4927                                 so->so_flags |= SOF_BINDRANDOMPORT;
4928                         else
4929                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
4930                         break;
4931
4932                 case SO_NP_EXTENSIONS: {
4933                         struct so_np_extensions sonpx;
4934
4935                         error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4936                             sizeof (sonpx));
4937                         if (error != 0)
4938                                 goto out;
4939                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4940                                 error = EINVAL;
4941                                 goto out;
4942                         }
4943                         /*
4944                          * Only one bit defined for now
4945                          */
4946                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4947                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4948                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
4949                                 else
4950                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4951                         }
4952                         break;
4953                 }
4954
4955                 case SO_TRAFFIC_CLASS: {
4956                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4957                             sizeof (optval));
4958                         if (error != 0)
4959                                 goto out;
4960                         if (optval >= SO_TC_NET_SERVICE_OFFSET) {
4961                                 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
4962                                 error = so_set_net_service_type(so, netsvc);
4963                                 goto out;
4964                         }
4965                         error = so_set_traffic_class(so, optval);
4966                         if (error != 0)
4967                                 goto out;
4968                         so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
4969                         so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
4970                         break;
4971                 }
4972
4973                 case SO_RECV_TRAFFIC_CLASS: {
4974                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4975                             sizeof (optval));
4976                         if (error != 0)
4977                                 goto out;
4978                         if (optval == 0)
4979                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4980                         else
4981                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4982                         break;
4983                 }
4984
4985 #if (DEVELOPMENT || DEBUG)
4986                 case SO_TRAFFIC_CLASS_DBG: {
4987                         struct so_tcdbg so_tcdbg;
4988
4989                         error = sooptcopyin(sopt, &so_tcdbg,
4990                             sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4991                         if (error != 0)
4992                                 goto out;
4993                         error = so_set_tcdbg(so, &so_tcdbg);
4994                         if (error != 0)
4995                                 goto out;
4996                         break;
4997                 }
4998 #endif /* (DEVELOPMENT || DEBUG) */
4999
5000                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5001                         error = priv_check_cred(kauth_cred_get(),
5002                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5003                         if (error != 0)
5004                                 goto out;
5005                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5006                             sizeof (optval));
5007                         if (error != 0)
5008                                 goto out;
5009                         if (optval == 0)
5010                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5011                         else
5012                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5013                         break;
5014
5015                 case SO_DEFUNCTOK:
5016                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5017                             sizeof (optval));
5018                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5019                                 if (error == 0)
5020                                         error = EBADF;
5021                                 goto out;
5022                         }
5023                         /*
5024                          * Any process can set SO_DEFUNCTOK (clear
5025                          * SOF_NODEFUNCT), but only root can clear
5026                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5027                          */
5028                         if (optval == 0 &&
5029                             kauth_cred_issuser(kauth_cred_get()) == 0) {
5030                                 error = EPERM;
5031                                 goto out;
5032                         }
5033                         if (optval)
5034                                 so->so_flags &= ~SOF_NODEFUNCT;
5035                         else
5036                                 so->so_flags |= SOF_NODEFUNCT;
5037
5038                         if (SOCK_DOM(so) == PF_INET ||
5039                             SOCK_DOM(so) == PF_INET6) {
5040                                 char s[MAX_IPv6_STR_LEN];
5041                                 char d[MAX_IPv6_STR_LEN];
5042                                 struct inpcb *inp = sotoinpcb(so);
5043
5044                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5045                                     "[%s %s:%d -> %s:%d] is now marked "
5046                                     "as %seligible for "
5047                                     "defunct\n", __func__, proc_selfpid(),
5048                                     proc_best_name(current_proc()),
5049                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5050                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
5051                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5052                                     ((SOCK_DOM(so) == PF_INET) ?
5053                                     (void *)&inp->inp_laddr.s_addr :
5054                                     (void *)&inp->in6p_laddr), s, sizeof (s)),
5055                                     ntohs(inp->in6p_lport),
5056                                     inet_ntop(SOCK_DOM(so),
5057                                     (SOCK_DOM(so) == PF_INET) ?
5058                                     (void *)&inp->inp_faddr.s_addr :
5059                                     (void *)&inp->in6p_faddr, d, sizeof (d)),
5060                                     ntohs(inp->in6p_fport),
5061                                     (so->so_flags & SOF_NODEFUNCT) ?
5062                                     "not " : "");
5063                         } else {
5064                                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5065                                     "is now marked as %seligible for "
5066                                     "defunct\n",
5067                                     __func__, proc_selfpid(),
5068                                     proc_best_name(current_proc()),
5069                                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5070                                     SOCK_DOM(so), SOCK_TYPE(so),
5071                                     (so->so_flags & SOF_NODEFUNCT) ?
5072                                     "not " : "");
5073                         }
5074                         break;
5075
5076                 case SO_ISDEFUNCT:
5077                         /* This option is not settable */
5078                         error = EINVAL;
5079                         break;
5080
5081                 case SO_OPPORTUNISTIC:
5082                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5083                             sizeof (optval));
5084                         if (error == 0)
5085                                 error = so_set_opportunistic(so, optval);
5086                         break;
5087
5088                 case SO_FLUSH:
5089                         /* This option is handled by lower layer(s) */
5090                         error = 0;
5091                         break;
5092
5093                 case SO_RECV_ANYIF:
5094                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5095                             sizeof (optval));
5096                         if (error == 0)
5097                                 error = so_set_recv_anyif(so, optval);
5098                         break;
5099
5100                 case SO_TRAFFIC_MGT_BACKGROUND: {
5101                         /* This option is handled by lower layer(s) */
5102                         error = 0;
5103                         break;
5104                 }
5105
5106 #if FLOW_DIVERT
5107                 case SO_FLOW_DIVERT_TOKEN:
5108                         error = flow_divert_token_set(so, sopt);
5109                         break;
5110 #endif  /* FLOW_DIVERT */
5111
5112
5113                 case SO_DELEGATED:
5114                         if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5115                             sizeof (optval))) != 0)
5116                                 break;
5117
5118                         error = so_set_effective_pid(so, optval, sopt->sopt_p);
5119                         break;
5120
5121                 case SO_DELEGATED_UUID: {
5122                         uuid_t euuid;
5123
5124                         if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5125                             sizeof (euuid))) != 0)
5126                                 break;
5127
5128                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5129                         break;
5130                 }
5131
5132 #if NECP
5133                 case SO_NECP_ATTRIBUTES:
5134                         error = necp_set_socket_attributes(so, sopt);
5135                         break;
5136 #endif /* NECP */
5137
5138 #if MPTCP
5139                 case SO_MPTCP_FASTJOIN:
5140                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5141                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5142                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5143                                 error = ENOPROTOOPT;
5144                                 break;
5145                         }
5146
5147                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5148                             sizeof (optval));
5149                         if (error != 0)
5150                                 goto out;
5151                         if (optval == 0)
5152                                 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5153                         else
5154                                 so->so_flags |= SOF_MPTCP_FASTJOIN;
5155                         break;
5156 #endif /* MPTCP */
5157
5158                 case SO_EXTENDED_BK_IDLE:
5159                         error = sooptcopyin(sopt, &optval, sizeof (optval),
5160                             sizeof (optval));
5161                         if (error == 0)
5162                                 error = so_set_extended_bk_idle(so, optval);
5163                         break;
5164
5165                 case SO_MARK_CELLFALLBACK:
5166                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5167                             sizeof(optval));
5168                         if (error != 0)
5169                                 goto out;
5170                         if (optval < 0) {
5171                                 error = EINVAL;
5172                                 goto out;
5173                         }
5174                         if (optval == 0)
5175                                 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5176                         else
5177                                 so->so_flags1 |= SOF1_CELLFALLBACK;
5178                         break;
5179
5180                 case SO_NET_SERVICE_TYPE: {
5181                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5182                             sizeof(optval));
5183                         if (error != 0)
5184                                 goto out;
5185                         error = so_set_net_service_type(so, optval);
5186                         break;
5187                 }
5188
5189                 case SO_QOSMARKING_POLICY_OVERRIDE:
5190                         error = priv_check_cred(kauth_cred_get(),
5191                             PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5192                         if (error != 0)
5193                                 goto out;
5194                         error = sooptcopyin(sopt, &optval, sizeof(optval),
5195                             sizeof(optval));
5196                         if (error != 0)
5197                                 goto out;
5198                         if (optval == 0)
5199                                 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5200                         else
5201                                 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5202                         break;
5203
5204                 default:
5205                         error = ENOPROTOOPT;
5206                         break;
5207                 }
5208                 if (error == 0 && so->so_proto != NULL &&
5209                     so->so_proto->pr_ctloutput != NULL) {
5210                         (void) so->so_proto->pr_ctloutput(so, sopt);
5211                 }
5212         }
5213 out:
5214         if (dolock)
5215                 socket_unlock(so, 1);
5216         return (error);
5217 }
5218
5219 /* Helper routines for getsockopt */
5220 int
5221 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5222 {
5223         int     error;
5224         size_t  valsize;
5225
5226         error = 0;
5227
5228         /*
5229          * Documented get behavior is that we always return a value,
5230          * possibly truncated to fit in the user's buffer.
5231          * Traditional behavior is that we always tell the user
5232          * precisely how much we copied, rather than something useful
5233          * like the total amount we had available for her.
5234          * Note that this interface is not idempotent; the entire answer must
5235          * generated ahead of time.
5236          */
5237         valsize = min(len, sopt->sopt_valsize);
5238         sopt->sopt_valsize = valsize;
5239         if (sopt->sopt_val != USER_ADDR_NULL) {
5240                 if (sopt->sopt_p != kernproc)
5241                         error = copyout(buf, sopt->sopt_val, valsize);
5242                 else
5243                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5244         }
5245         return (error);
5246 }
5247
5248 static int
5249 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5250 {
5251         int                     error;
5252         size_t                  len;
5253         struct user64_timeval   tv64;
5254         struct user32_timeval   tv32;
5255         const void *            val;
5256         size_t                  valsize;
5257
5258         error = 0;
5259         if (proc_is64bit(sopt->sopt_p)) {
5260                 len = sizeof (tv64);
5261                 tv64.tv_sec = tv_p->tv_sec;
5262                 tv64.tv_usec = tv_p->tv_usec;
5263                 val = &tv64;
5264         } else {
5265                 len = sizeof (tv32);
5266                 tv32.tv_sec = tv_p->tv_sec;
5267                 tv32.tv_usec = tv_p->tv_usec;
5268                 val = &tv32;
5269         }
5270         valsize = min(len, sopt->sopt_valsize);
5271         sopt->sopt_valsize = valsize;
5272         if (sopt->sopt_val != USER_ADDR_NULL) {
5273                 if (sopt->sopt_p != kernproc)
5274                         error = copyout(val, sopt->sopt_val, valsize);
5275                 else
5276                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5277         }
5278         return (error);
5279 }
5280
5281 /*
5282  * Return:      0                       Success
5283  *              ENOPROTOOPT
5284  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5285  *      <pr_ctloutput>:???
5286  *      <sf_getoption>:???
5287  */
5288 int
5289 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5290 {
5291         int     error, optval;
5292         struct  linger l;
5293         struct  timeval tv;
5294 #if CONFIG_MACF_SOCKET
5295         struct mac extmac;
5296 #endif /* MAC_SOCKET */
5297
5298         if (sopt->sopt_dir != SOPT_GET)
5299                 sopt->sopt_dir = SOPT_GET;
5300
5301         if (dolock)
5302                 socket_lock(so, 1);
5303
5304         error = sflt_getsockopt(so, sopt);
5305         if (error != 0) {
5306                 if (error == EJUSTRETURN)
5307                         error = 0;
5308                 goto out;
5309         }
5310
5311         if (sopt->sopt_level != SOL_SOCKET) {
5312                 if (so->so_proto != NULL &&
5313                     so->so_proto->pr_ctloutput != NULL) {
5314                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
5315                         goto out;
5316                 }
5317                 error = ENOPROTOOPT;
5318         } else {
5319                 /*
5320                  * Allow socket-level (SOL_SOCKET) options to be filtered by
5321                  * the protocol layer, if needed.  A zero value returned from
5322                  * the handler means use default socket-level processing as
5323                  * done by the rest of this routine.  Otherwise, any other
5324                  * return value indicates that the option is unsupported.
5325                  */
5326                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5327                     pru_socheckopt(so, sopt)) != 0)
5328                         goto out;
5329
5330                 error = 0;
5331                 switch (sopt->sopt_name) {
5332                 case SO_LINGER:
5333                 case SO_LINGER_SEC:
5334                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5335                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5336                             so->so_linger : so->so_linger / hz;
5337                         error = sooptcopyout(sopt, &l, sizeof (l));
5338                         break;
5339
5340                 case SO_USELOOPBACK:
5341                 case SO_DONTROUTE:
5342                 case SO_DEBUG:
5343                 case SO_KEEPALIVE:
5344                 case SO_REUSEADDR:
5345                 case SO_REUSEPORT:
5346                 case SO_BROADCAST:
5347                 case SO_OOBINLINE:
5348                 case SO_TIMESTAMP:
5349                 case SO_TIMESTAMP_MONOTONIC:
5350                 case SO_DONTTRUNC:
5351                 case SO_WANTMORE:
5352                 case SO_WANTOOBFLAG:
5353                 case SO_NOWAKEFROMSLEEP:
5354                 case SO_NOAPNFALLBK:
5355                         optval = so->so_options & sopt->sopt_name;
5356 integer:
5357                         error = sooptcopyout(sopt, &optval, sizeof (optval));
5358                         break;
5359
5360                 case SO_TYPE:
5361                         optval = so->so_type;
5362                         goto integer;
5363
5364                 case SO_NREAD:
5365                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5366                                 int pkt_total;
5367                                 struct mbuf *m1;
5368
5369                                 pkt_total = 0;
5370                                 m1 = so->so_rcv.sb_mb;
5371                                 while (m1 != NULL) {
5372                                         if (m1->m_type == MT_DATA ||
5373                                             m1->m_type == MT_HEADER ||
5374                                             m1->m_type == MT_OOBDATA)
5375                                                 pkt_total += m1->m_len;
5376                                         m1 = m1->m_next;
5377                                 }
5378                                 optval = pkt_total;
5379                         } else {
5380                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5381                         }
5382                         goto integer;
5383
5384                 case SO_NUMRCVPKT:
5385                         if (so->so_proto->pr_flags & PR_ATOMIC) {
5386                                 int cnt = 0;
5387                                 struct mbuf *m1;
5388
5389                                 m1 = so->so_rcv.sb_mb;
5390                                 while (m1 != NULL) {
5391                                         if (m1->m_type == MT_DATA ||
5392                                             m1->m_type == MT_HEADER ||
5393                                             m1->m_type == MT_OOBDATA)
5394                                                 cnt += 1;
5395                                         m1 = m1->m_nextpkt;
5396                                 }
5397                                 optval = cnt;
5398                                 goto integer;
5399                         } else {
5400                                 error = EINVAL;
5401                                 break;
5402                         }
5403
5404                 case SO_NWRITE:
5405                         optval = so->so_snd.sb_cc;
5406                         goto integer;
5407
5408                 case SO_ERROR:
5409                         optval = so->so_error;
5410                         so->so_error = 0;
5411                         goto integer;
5412
5413                 case SO_SNDBUF: {
5414                         u_int32_t hiwat = so->so_snd.sb_hiwat;
5415
5416                         if (so->so_snd.sb_flags & SB_UNIX) {
5417                                 struct unpcb *unp =
5418                                     (struct unpcb *)(so->so_pcb);
5419                                 if (unp != NULL && unp->unp_conn != NULL) {
5420                                         hiwat += unp->unp_conn->unp_cc;
5421                                 }
5422                         }
5423
5424                         optval = hiwat;
5425                         goto integer;
5426                 }
5427                 case SO_RCVBUF:
5428                         optval = so->so_rcv.sb_hiwat;
5429                         goto integer;
5430
5431                 case SO_SNDLOWAT:
5432                         optval = so->so_snd.sb_lowat;
5433                         goto integer;
5434
5435                 case SO_RCVLOWAT:
5436                         optval = so->so_rcv.sb_lowat;
5437                         goto integer;
5438
5439                 case SO_SNDTIMEO:
5440                 case SO_RCVTIMEO:
5441                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5442                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5443
5444                         error = sooptcopyout_timeval(sopt, &tv);
5445                         break;
5446
5447                 case SO_NOSIGPIPE:
5448                         optval = (so->so_flags & SOF_NOSIGPIPE);
5449                         goto integer;
5450
5451                 case SO_NOADDRERR:
5452                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5453                         goto integer;
5454
5455                 case SO_REUSESHAREUID:
5456                         optval = (so->so_flags & SOF_REUSESHAREUID);
5457                         goto integer;
5458
5459
5460                 case SO_NOTIFYCONFLICT:
5461                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5462                         goto integer;
5463
5464                 case SO_RESTRICTIONS:
5465                         optval = so_get_restrictions(so);
5466                         goto integer;
5467
5468                 case SO_AWDL_UNRESTRICTED:
5469                         if (SOCK_DOM(so) == PF_INET ||
5470                             SOCK_DOM(so) == PF_INET6) {
5471                                 optval = inp_get_awdl_unrestricted(
5472                                     sotoinpcb(so));
5473                                 goto integer;
5474                         } else
5475                                 error = EOPNOTSUPP;
5476                         break;
5477
5478                 case SO_INTCOPROC_ALLOW:
5479                         if (SOCK_DOM(so) == PF_INET6) {
5480                                 optval = inp_get_intcoproc_allowed(
5481                                     sotoinpcb(so));
5482                                 goto integer;
5483                         } else
5484                                 error = EOPNOTSUPP;
5485                         break;
5486
5487                 case SO_LABEL:
5488 #if CONFIG_MACF_SOCKET
5489                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5490                             sizeof (extmac))) != 0 ||
5491                             (error = mac_socket_label_get(proc_ucred(
5492                             sopt->sopt_p), so, &extmac)) != 0)
5493                                 break;
5494
5495                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5496 #else
5497                         error = EOPNOTSUPP;
5498 #endif /* MAC_SOCKET */
5499                         break;
5500
5501                 case SO_PEERLABEL:
5502 #if CONFIG_MACF_SOCKET
5503                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5504                             sizeof (extmac))) != 0 ||
5505                             (error = mac_socketpeer_label_get(proc_ucred(
5506                             sopt->sopt_p), so, &extmac)) != 0)
5507                                 break;
5508
5509                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5510 #else
5511                         error = EOPNOTSUPP;
5512 #endif /* MAC_SOCKET */
5513                         break;
5514
5515 #ifdef __APPLE_API_PRIVATE
5516                 case SO_UPCALLCLOSEWAIT:
5517                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5518                         goto integer;
5519 #endif
5520                 case SO_RANDOMPORT:
5521                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
5522                         goto integer;
5523
5524                 case SO_NP_EXTENSIONS: {
5525                         struct so_np_extensions sonpx;
5526
5527                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5528                             SONPX_SETOPTSHUT : 0;
5529                         sonpx.npx_mask = SONPX_MASK_VALID;
5530
5531                         error = sooptcopyout(sopt, &sonpx,
5532                             sizeof (struct so_np_extensions));
5533                         break;
5534                 }
5535
5536                 case SO_TRAFFIC_CLASS:
5537                         optval = so->so_traffic_class;
5538                         goto integer;
5539
5540                 case SO_RECV_TRAFFIC_CLASS:
5541                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5542                         goto integer;
5543
5544                 case SO_TRAFFIC_CLASS_STATS:
5545                         error = sooptcopyout(sopt, &so->so_tc_stats,
5546                             sizeof (so->so_tc_stats));
5547                         break;
5548
5549 #if (DEVELOPMENT || DEBUG)
5550                 case SO_TRAFFIC_CLASS_DBG:
5551                         error = sogetopt_tcdbg(so, sopt);
5552                         break;
5553 #endif /* (DEVELOPMENT || DEBUG) */
5554
5555                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5556                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5557                         goto integer;
5558
5559                 case SO_DEFUNCTOK:
5560                         optval = !(so->so_flags & SOF_NODEFUNCT);
5561                         goto integer;
5562
5563                 case SO_ISDEFUNCT:
5564                         optval = (so->so_flags & SOF_DEFUNCT);
5565                         goto integer;
5566
5567                 case SO_OPPORTUNISTIC:
5568                         optval = so_get_opportunistic(so);
5569                         goto integer;
5570
5571                 case SO_FLUSH:
5572                         /* This option is not gettable */
5573                         error = EINVAL;
5574                         break;
5575
5576                 case SO_RECV_ANYIF:
5577                         optval = so_get_recv_anyif(so);
5578                         goto integer;
5579
5580                 case SO_TRAFFIC_MGT_BACKGROUND:
5581                         /* This option is handled by lower layer(s) */
5582                         if (so->so_proto != NULL &&
5583                             so->so_proto->pr_ctloutput != NULL) {
5584                                 (void) so->so_proto->pr_ctloutput(so, sopt);
5585                         }
5586                         break;
5587
5588 #if FLOW_DIVERT
5589                 case SO_FLOW_DIVERT_TOKEN:
5590                         error = flow_divert_token_get(so, sopt);
5591                         break;
5592 #endif  /* FLOW_DIVERT */
5593
5594 #if NECP
5595                 case SO_NECP_ATTRIBUTES:
5596                         error = necp_get_socket_attributes(so, sopt);
5597                         break;
5598 #endif /* NECP */
5599
5600 #if CONTENT_FILTER
5601                 case SO_CFIL_SOCK_ID: {
5602                         cfil_sock_id_t sock_id;
5603
5604                         sock_id = cfil_sock_id_from_socket(so);
5605
5606                         error = sooptcopyout(sopt, &sock_id,
5607                                 sizeof(cfil_sock_id_t));
5608                         break;
5609                 }
5610 #endif  /* CONTENT_FILTER */
5611
5612 #if MPTCP
5613                 case SO_MPTCP_FASTJOIN:
5614                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5615                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5616                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5617                                 error = ENOPROTOOPT;
5618                                 break;
5619                         }
5620                         optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5621                         /* Fixed along with rdar://19391339 */
5622                         goto integer;
5623 #endif /* MPTCP */
5624
5625                 case SO_EXTENDED_BK_IDLE:
5626                         optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5627                         goto integer;
5628                 case SO_MARK_CELLFALLBACK:
5629                         optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5630                             ? 1 : 0;
5631                         goto integer;
5632                 case SO_NET_SERVICE_TYPE: {
5633                         if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5634                                 optval = so->so_netsvctype;
5635                         else
5636                                 optval = NET_SERVICE_TYPE_BE;
5637                         goto integer;
5638                 }
5639                 case SO_NETSVC_MARKING_LEVEL:
5640                         optval = so_get_netsvc_marking_level(so);
5641                         goto integer;
5642
5643                 default:
5644                         error = ENOPROTOOPT;
5645                         break;
5646                 }
5647         }
5648 out:
5649         if (dolock)
5650                 socket_unlock(so, 1);
5651         return (error);
5652 }
5653
5654 /*
5655  * The size limits on our soopt_getm is different from that on FreeBSD.
5656  * We limit the size of options to MCLBYTES. This will have to change
5657  * if we need to define options that need more space than MCLBYTES.
5658  */
5659 int
5660 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5661 {
5662         struct mbuf *m, *m_prev;
5663         int sopt_size = sopt->sopt_valsize;
5664         int how;
5665
5666         if (sopt_size <= 0 || sopt_size > MCLBYTES)
5667                 return (EMSGSIZE);
5668
5669         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5670         MGET(m, how, MT_DATA);
5671         if (m == NULL)
5672                 return (ENOBUFS);
5673         if (sopt_size > MLEN) {
5674                 MCLGET(m, how);
5675                 if ((m->m_flags & M_EXT) == 0) {
5676                         m_free(m);
5677                         return (ENOBUFS);
5678                 }
5679                 m->m_len = min(MCLBYTES, sopt_size);
5680         } else {
5681                 m->m_len = min(MLEN, sopt_size);
5682         }
5683         sopt_size -= m->m_len;
5684         *mp = m;
5685         m_prev = m;
5686
5687         while (sopt_size > 0) {
5688                 MGET(m, how, MT_DATA);
5689                 if (m == NULL) {
5690                         m_freem(*mp);
5691                         return (ENOBUFS);
5692                 }
5693                 if (sopt_size > MLEN) {
5694                         MCLGET(m, how);
5695                         if ((m->m_flags & M_EXT) == 0) {
5696                                 m_freem(*mp);
5697                                 m_freem(m);
5698                                 return (ENOBUFS);
5699                         }
5700                         m->m_len = min(MCLBYTES, sopt_size);
5701                 } else {
5702                         m->m_len = min(MLEN, sopt_size);
5703                 }
5704                 sopt_size -= m->m_len;
5705                 m_prev->m_next = m;
5706                 m_prev = m;
5707         }
5708         return (0);
5709 }
5710
5711 /* copyin sopt data into mbuf chain */
5712 int
5713 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5714 {
5715         struct mbuf *m0 = m;
5716
5717         if (sopt->sopt_val == USER_ADDR_NULL)
5718                 return (0);
5719         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5720                 if (sopt->sopt_p != kernproc) {
5721                         int error;
5722
5723                         error = copyin(sopt->sopt_val, mtod(m, char *),
5724                             m->m_len);
5725                         if (error != 0) {
5726                                 m_freem(m0);
5727                                 return (error);
5728                         }
5729                 } else {
5730                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5731                             mtod(m, char *), m->m_len);
5732                 }
5733                 sopt->sopt_valsize -= m->m_len;
5734                 sopt->sopt_val += m->m_len;
5735                 m = m->m_next;
5736         }
5737         /* should be allocated enoughly at ip6_sooptmcopyin() */
5738         if (m != NULL) {
5739                 panic("soopt_mcopyin");
5740                 /* NOTREACHED */
5741         }
5742         return (0);
5743 }
5744
5745 /* copyout mbuf chain data into soopt */
5746 int
5747 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5748 {
5749         struct mbuf *m0 = m;
5750         size_t valsize = 0;
5751
5752         if (sopt->sopt_val == USER_ADDR_NULL)
5753                 return (0);
5754         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5755                 if (sopt->sopt_p != kernproc) {
5756                         int error;
5757
5758                         error = copyout(mtod(m, char *), sopt->sopt_val,
5759                             m->m_len);
5760                         if (error != 0) {
5761                                 m_freem(m0);
5762                                 return (error);
5763                         }
5764                 } else {
5765                         bcopy(mtod(m, char *),
5766                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5767                 }
5768                 sopt->sopt_valsize -= m->m_len;
5769                 sopt->sopt_val += m->m_len;
5770                 valsize += m->m_len;
5771                 m = m->m_next;
5772         }
5773         if (m != NULL) {
5774                 /* enough soopt buffer should be given from user-land */
5775                 m_freem(m0);
5776                 return (EINVAL);
5777         }
5778         sopt->sopt_valsize = valsize;
5779         return (0);
5780 }
5781
5782 void
5783 sohasoutofband(struct socket *so)
5784 {
5785         if (so->so_pgid < 0)
5786                 gsignal(-so->so_pgid, SIGURG);
5787         else if (so->so_pgid > 0)
5788                 proc_signal(so->so_pgid, SIGURG);
5789         selwakeup(&so->so_rcv.sb_sel);
5790         if (so->so_rcv.sb_flags & SB_KNOTE) {
5791                 KNOTE(&so->so_rcv.sb_sel.si_note,
5792                     (NOTE_OOB | SO_FILT_HINT_LOCKED));
5793         }
5794 }
5795
5796 int
5797 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5798 {
5799 #pragma unused(cred)
5800         struct proc *p = current_proc();
5801         int revents = 0;
5802
5803         socket_lock(so, 1);
5804         so_update_last_owner_locked(so, PROC_NULL);
5805         so_update_policy(so);
5806
5807         if (events & (POLLIN | POLLRDNORM))
5808                 if (soreadable(so))
5809                         revents |= events & (POLLIN | POLLRDNORM);
5810
5811         if (events & (POLLOUT | POLLWRNORM))
5812                 if (sowriteable(so))
5813                         revents |= events & (POLLOUT | POLLWRNORM);
5814
5815         if (events & (POLLPRI | POLLRDBAND))
5816                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5817                         revents |= events & (POLLPRI | POLLRDBAND);
5818
5819         if (revents == 0) {
5820                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5821                         /*
5822                          * Darwin sets the flag first,
5823                          * BSD calls selrecord first
5824                          */
5825                         so->so_rcv.sb_flags |= SB_SEL;
5826                         selrecord(p, &so->so_rcv.sb_sel, wql);
5827                 }
5828
5829                 if (events & (POLLOUT | POLLWRNORM)) {
5830                         /*
5831                          * Darwin sets the flag first,
5832                          * BSD calls selrecord first
5833                          */
5834                         so->so_snd.sb_flags |= SB_SEL;
5835                         selrecord(p, &so->so_snd.sb_sel, wql);
5836                 }
5837         }
5838
5839         socket_unlock(so, 1);
5840         return (revents);
5841 }
5842
5843 int
5844 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5845 {
5846 #pragma unused(fp)
5847 #if !CONFIG_MACF_SOCKET
5848 #pragma unused(ctx)
5849 #endif /* MAC_SOCKET */
5850         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5851         int result;
5852
5853         socket_lock(so, 1);
5854         so_update_last_owner_locked(so, PROC_NULL);
5855         so_update_policy(so);
5856
5857 #if CONFIG_MACF_SOCKET
5858         if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5859             kn, so) != 0) {
5860                 socket_unlock(so, 1);
5861                 kn->kn_flags = EV_ERROR;
5862                 kn->kn_data = EPERM;
5863                 return 0;
5864         }
5865 #endif /* MAC_SOCKET */
5866
5867         switch (kn->kn_filter) {
5868         case EVFILT_READ:
5869                 kn->kn_filtid = EVFILTID_SOREAD;
5870                 break;
5871         case EVFILT_WRITE:
5872                 kn->kn_filtid = EVFILTID_SOWRITE;
5873                 break;
5874         case EVFILT_SOCK:
5875                 kn->kn_filtid = EVFILTID_SCK;
5876                 break;
5877         case EVFILT_EXCEPT:
5878                 kn->kn_filtid = EVFILTID_SOEXCEPT;
5879                 break;
5880         default:
5881                 socket_unlock(so, 1);
5882                 kn->kn_flags = EV_ERROR;
5883                 kn->kn_data = EINVAL;
5884                 return 0;
5885         }
5886
5887         /*
5888          * call the appropriate sub-filter attach
5889          * with the socket still locked
5890          */
5891         result = knote_fops(kn)->f_attach(kn);
5892
5893         socket_unlock(so, 1);
5894
5895         return result;
5896 }
5897
5898 static int
5899 filt_soread_common(struct knote *kn, struct socket *so)
5900 {
5901         if (so->so_options & SO_ACCEPTCONN) {
5902                 int is_not_empty;
5903
5904                 /*
5905                  * Radar 6615193 handle the listen case dynamically
5906                  * for kqueue read filter. This allows to call listen()
5907                  * after registering the kqueue EVFILT_READ.
5908                  */
5909
5910                 kn->kn_data = so->so_qlen;
5911                 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
5912
5913                 return (is_not_empty);
5914         }
5915
5916         /* socket isn't a listener */
5917         /*
5918          * NOTE_LOWAT specifies new low water mark in data, i.e.
5919          * the bytes of protocol data. We therefore exclude any
5920          * control bytes.
5921          */
5922         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5923
5924         if (kn->kn_sfflags & NOTE_OOB) {
5925                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
5926                         kn->kn_fflags |= NOTE_OOB;
5927                         kn->kn_data -= so->so_oobmark;
5928                         return (1);
5929                 }
5930         }
5931
5932         if ((so->so_state & SS_CANTRCVMORE)
5933 #if CONTENT_FILTER
5934             && cfil_sock_data_pending(&so->so_rcv) == 0
5935 #endif /* CONTENT_FILTER */
5936            ) {
5937                 kn->kn_flags |= EV_EOF;
5938                 kn->kn_fflags = so->so_error;
5939                 return (1);
5940         }
5941
5942         if (so->so_error) {     /* temporary udp error */
5943                 return (1);
5944         }
5945
5946         int64_t lowwat = so->so_rcv.sb_lowat;
5947         /*
5948          * Ensure that when NOTE_LOWAT is used, the derived
5949          * low water mark is bounded by socket's rcv buf's
5950          * high and low water mark values.
5951          */
5952         if (kn->kn_sfflags & NOTE_LOWAT) {
5953                 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5954                         lowwat = so->so_rcv.sb_hiwat;
5955                 else if (kn->kn_sdata > lowwat)
5956                         lowwat = kn->kn_sdata;
5957         }
5958
5959         /*
5960          * The order below is important. Since NOTE_LOWAT
5961          * overrides sb_lowat, check for NOTE_LOWAT case
5962          * first.
5963          */
5964         if (kn->kn_sfflags & NOTE_LOWAT)
5965                 return (kn->kn_data >= lowwat);
5966
5967         return (so->so_rcv.sb_cc >= lowwat);
5968 }
5969
5970 static int
5971 filt_sorattach(struct knote *kn)
5972 {
5973         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5974
5975         /* socket locked */
5976
5977         /*
5978          * If the caller explicitly asked for OOB results (e.g. poll())
5979          * from EVFILT_READ, then save that off in the hookid field
5980          * and reserve the kn_flags EV_OOBAND bit for output only.
5981          */
5982         if (kn->kn_filter == EVFILT_READ &&
5983             kn->kn_flags & EV_OOBAND) {
5984                 kn->kn_flags &= ~EV_OOBAND;
5985                 kn->kn_hookid = EV_OOBAND;
5986         } else {
5987                 kn->kn_hookid = 0;
5988         }
5989         if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
5990                 so->so_rcv.sb_flags |= SB_KNOTE;
5991
5992         /* indicate if event is already fired */
5993         return filt_soread_common(kn, so);
5994 }
5995
5996 static void
5997 filt_sordetach(struct knote *kn)
5998 {
5999         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6000
6001         socket_lock(so, 1);
6002         if (so->so_rcv.sb_flags & SB_KNOTE)
6003                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6004                         so->so_rcv.sb_flags &= ~SB_KNOTE;
6005         socket_unlock(so, 1);
6006 }
6007
6008 /*ARGSUSED*/
6009 static int
6010 filt_soread(struct knote *kn, long hint)
6011 {
6012         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6013         int retval;
6014
6015         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6016                 socket_lock(so, 1);
6017
6018         retval = filt_soread_common(kn, so);
6019
6020         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6021                 socket_unlock(so, 1);
6022
6023         return retval;
6024 }
6025
6026 static int
6027 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6028 {
6029         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6030         int retval;
6031
6032         socket_lock(so, 1);
6033
6034         /* save off the new input fflags and data */
6035         kn->kn_sfflags = kev->fflags;
6036         kn->kn_sdata = kev->data;
6037         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6038                 kn->kn_udata = kev->udata;
6039
6040         /* determine if changes result in fired events */
6041         retval = filt_soread_common(kn, so);
6042
6043         socket_unlock(so, 1);
6044
6045         return retval;
6046 }
6047
6048 static int
6049 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6050 {
6051 #pragma unused(data)
6052         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6053         int retval;
6054
6055         socket_lock(so, 1);
6056         retval = filt_soread_common(kn, so);
6057         if (retval) {
6058                 *kev = kn->kn_kevent;
6059                 if (kn->kn_flags & EV_CLEAR) {
6060                         kn->kn_fflags = 0;
6061                         kn->kn_data = 0;
6062                 }
6063         }
6064         socket_unlock(so, 1);
6065
6066         return retval;
6067 }
6068
6069 int
6070 so_wait_for_if_feedback(struct socket *so)
6071 {
6072         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6073             (so->so_state & SS_ISCONNECTED)) {
6074                 struct inpcb *inp = sotoinpcb(so);
6075                 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6076                         return (1);
6077         }
6078         return (0);
6079 }
6080
6081 static int
6082 filt_sowrite_common(struct knote *kn, struct socket *so)
6083 {
6084         int ret = 0;
6085
6086         kn->kn_data = sbspace(&so->so_snd);
6087         if (so->so_state & SS_CANTSENDMORE) {
6088                 kn->kn_flags |= EV_EOF;
6089                 kn->kn_fflags = so->so_error;
6090                 return 1;
6091         }
6092         if (so->so_error) {     /* temporary udp error */
6093                 return 1;
6094         }
6095         if (!socanwrite(so)) {
6096                 return 0;
6097         }
6098         if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6099                 return 1;
6100         }
6101         int64_t lowwat = so->so_snd.sb_lowat;
6102         if (kn->kn_sfflags & NOTE_LOWAT) {
6103                 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6104                         lowwat = so->so_snd.sb_hiwat;
6105                 else if (kn->kn_sdata > lowwat)
6106                         lowwat = kn->kn_sdata;
6107         }
6108         if (kn->kn_data >= lowwat) {
6109                 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6110 #if (DEBUG || DEVELOPMENT)
6111                     && so_notsent_lowat_check == 1
6112 #endif /* DEBUG || DEVELOPMENT */
6113                     ) {
6114                         if ((SOCK_DOM(so) == PF_INET ||
6115                             SOCK_DOM(so) == PF_INET6) &&
6116                             so->so_type == SOCK_STREAM) {
6117                                 ret = tcp_notsent_lowat_check(so);
6118                         }
6119 #if MPTCP
6120                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6121                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
6122                                 ret = mptcp_notsent_lowat_check(so);
6123                         }
6124 #endif
6125                         else {
6126                                 return 1;
6127                         }
6128                 } else {
6129                         ret = 1;
6130                 }
6131         }
6132         if (so_wait_for_if_feedback(so))
6133                 ret = 0;
6134         return (ret);
6135 }
6136
6137 static int
6138 filt_sowattach(struct knote *kn)
6139 {
6140         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6141
6142         /* socket locked */
6143         if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6144                 so->so_snd.sb_flags |= SB_KNOTE;
6145
6146         /* determine if its already fired */
6147         return filt_sowrite_common(kn, so);
6148 }
6149
6150 static void
6151 filt_sowdetach(struct knote *kn)
6152 {
6153         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6154         socket_lock(so, 1);
6155
6156         if (so->so_snd.sb_flags & SB_KNOTE)
6157                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6158                         so->so_snd.sb_flags &= ~SB_KNOTE;
6159         socket_unlock(so, 1);
6160 }
6161
6162 /*ARGSUSED*/
6163 static int
6164 filt_sowrite(struct knote *kn, long hint)
6165 {
6166         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6167         int ret;
6168
6169         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6170                 socket_lock(so, 1);
6171
6172         ret = filt_sowrite_common(kn, so);
6173
6174         if ((hint & SO_FILT_HINT_LOCKED) == 0)
6175                 socket_unlock(so, 1);
6176
6177         return ret;
6178 }
6179
6180 static int
6181 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6182 {
6183         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6184         int ret;
6185
6186         socket_lock(so, 1);
6187
6188         /*save off the new input fflags and data */
6189         kn->kn_sfflags = kev->fflags;
6190         kn->kn_sdata = kev->data;
6191         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6192                 kn->kn_udata = kev->udata;
6193
6194         /* determine if these changes result in a triggered event */
6195         ret = filt_sowrite_common(kn, so);
6196
6197         socket_unlock(so, 1);
6198
6199         return ret;
6200 }
6201
6202 static int
6203 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6204 {
6205 #pragma unused(data)
6206         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6207         int ret;
6208
6209         socket_lock(so, 1);
6210         ret = filt_sowrite_common(kn, so);
6211         if (ret) {
6212                 *kev = kn->kn_kevent;
6213                 if (kn->kn_flags & EV_CLEAR) {
6214                         kn->kn_fflags = 0;
6215                         kn->kn_data = 0;
6216                 }
6217         }
6218         socket_unlock(so, 1);
6219         return ret;
6220 }
6221
6222 static int
6223 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6224 {
6225         int ret = 0;
6226         uint32_t level_trigger = 0;
6227
6228         if (ev_hint & SO_FILT_HINT_CONNRESET) {
6229                 kn->kn_fflags |= NOTE_CONNRESET;
6230         }
6231         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6232                 kn->kn_fflags |= NOTE_TIMEOUT;
6233         }
6234         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6235                 kn->kn_fflags |= NOTE_NOSRCADDR;
6236         }
6237         if (ev_hint & SO_FILT_HINT_IFDENIED) {
6238                 kn->kn_fflags |= NOTE_IFDENIED;
6239         }
6240         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6241                 kn->kn_fflags |= NOTE_KEEPALIVE;
6242         }
6243         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6244                 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6245         }
6246         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6247                 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6248         }
6249         if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6250             (so->so_state & SS_ISCONNECTED)) {
6251                 kn->kn_fflags |= NOTE_CONNECTED;
6252                 level_trigger |= NOTE_CONNECTED;
6253         }
6254         if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6255             (so->so_state & SS_ISDISCONNECTED)) {
6256                 kn->kn_fflags |= NOTE_DISCONNECTED;
6257                 level_trigger |= NOTE_DISCONNECTED;
6258         }
6259         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6260                 if (so->so_proto != NULL &&
6261                     (so->so_proto->pr_flags & PR_EVCONNINFO))
6262                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6263         }
6264
6265         if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6266             tcp_notify_ack_active(so)) {
6267                 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6268         }
6269
6270         if ((so->so_state & SS_CANTRCVMORE)
6271 #if CONTENT_FILTER
6272             && cfil_sock_data_pending(&so->so_rcv) == 0
6273 #endif /* CONTENT_FILTER */
6274             ) {
6275                 kn->kn_fflags |= NOTE_READCLOSED;
6276                 level_trigger |= NOTE_READCLOSED;
6277         }
6278
6279         if (so->so_state & SS_CANTSENDMORE) {
6280                 kn->kn_fflags |= NOTE_WRITECLOSED;
6281                 level_trigger |= NOTE_WRITECLOSED;
6282         }
6283
6284         if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6285             (so->so_flags & SOF_SUSPENDED)) {
6286                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6287
6288                 /* If resume event was delivered before, reset it */
6289                 kn->kn_hookid &= ~NOTE_RESUME;
6290
6291                 kn->kn_fflags |= NOTE_SUSPEND;
6292                 level_trigger |= NOTE_SUSPEND;
6293         }
6294
6295         if ((ev_hint & SO_FILT_HINT_RESUME) ||
6296             (so->so_flags & SOF_SUSPENDED) == 0) {
6297                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6298
6299                 /* If suspend event was delivered before, reset it */
6300                 kn->kn_hookid &= ~NOTE_SUSPEND;
6301
6302                 kn->kn_fflags |= NOTE_RESUME;
6303                 level_trigger |= NOTE_RESUME;
6304         }
6305
6306         if (so->so_error != 0) {
6307                 ret = 1;
6308                 kn->kn_data = so->so_error;
6309                 kn->kn_flags |= EV_EOF;
6310         } else {
6311                 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6312         }
6313
6314         /* Reset any events that are not requested on this knote */
6315         kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6316         level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6317
6318         /* Find the level triggerred events that are already delivered */
6319         level_trigger &= kn->kn_hookid;
6320         level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6321
6322         /* Do not deliver level triggerred events more than once */
6323         if ((kn->kn_fflags & ~level_trigger) != 0)
6324                 ret = 1;
6325
6326         return (ret);
6327 }
6328
6329 static int
6330 filt_sockattach(struct knote *kn)
6331 {
6332         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6333
6334         /* socket locked */
6335         kn->kn_hookid = 0;
6336         if (KNOTE_ATTACH(&so->so_klist, kn))
6337                 so->so_flags |= SOF_KNOTE;
6338
6339         /* determine if event already fired */
6340         return filt_sockev_common(kn, so, 0);
6341 }
6342
6343 static void
6344 filt_sockdetach(struct knote *kn)
6345 {
6346         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6347         socket_lock(so, 1);
6348
6349         if ((so->so_flags & SOF_KNOTE) != 0)
6350                 if (KNOTE_DETACH(&so->so_klist, kn))
6351                         so->so_flags &= ~SOF_KNOTE;
6352         socket_unlock(so, 1);
6353 }
6354
6355 static int
6356 filt_sockev(struct knote *kn, long hint)
6357 {
6358         int ret = 0, locked = 0;
6359         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6360         long ev_hint = (hint & SO_FILT_HINT_EV);
6361
6362         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6363                 socket_lock(so, 1);
6364                 locked = 1;
6365         }
6366
6367         ret = filt_sockev_common(kn, so, ev_hint);
6368
6369         if (locked)
6370                 socket_unlock(so, 1);
6371
6372         return ret;
6373 }
6374
6375
6376
6377 /*
6378  *      filt_socktouch - update event state
6379  */
6380 static int
6381 filt_socktouch(
6382         struct knote *kn,
6383         struct kevent_internal_s *kev)
6384 {
6385         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6386         uint32_t changed_flags;
6387         int ret;
6388
6389         socket_lock(so, 1);
6390
6391         /* save off the [result] data and fflags */
6392         changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6393
6394         /* save off the new input fflags and data */
6395         kn->kn_sfflags = kev->fflags;
6396         kn->kn_sdata = kev->data;
6397         if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6398                 kn->kn_udata = kev->udata;
6399
6400         /* restrict the current results to the (smaller?) set of new interest */
6401         /*
6402          * For compatibility with previous implementations, we leave kn_fflags
6403          * as they were before.
6404          */
6405         //kn->kn_fflags &= kev->fflags;
6406
6407         /*
6408          * Since we keep track of events that are already
6409          * delivered, if any of those events are not requested
6410          * anymore the state related to them can be reset
6411          */
6412         kn->kn_hookid &=
6413             ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6414
6415         /* determine if we have events to deliver */
6416         ret = filt_sockev_common(kn, so, 0);
6417
6418         socket_unlock(so, 1);
6419
6420         return ret;
6421 }
6422
6423 /*
6424  *      filt_sockprocess - query event fired state and return data
6425  */
6426 static int
6427 filt_sockprocess(
6428         struct knote *kn,
6429         struct filt_process_s *data,
6430         struct kevent_internal_s *kev)
6431 {
6432 #pragma unused(data)
6433
6434         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6435         int ret = 0;
6436
6437         socket_lock(so, 1);
6438
6439         ret = filt_sockev_common(kn, so, 0);
6440         if (ret) {
6441                 *kev = kn->kn_kevent;
6442
6443                 /*
6444                  * Store the state of the events being delivered. This
6445                  * state can be used to deliver level triggered events
6446                  * ateast once and still avoid waking up the application
6447                  * multiple times as long as the event is active.
6448                  */
6449                 if (kn->kn_fflags != 0)
6450                         kn->kn_hookid |= (kn->kn_fflags &
6451                                           EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6452
6453                 /*
6454                  * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6455                  * only one of them and remember the last one that was
6456                  * delivered last
6457                  */
6458                 if (kn->kn_fflags & NOTE_SUSPEND)
6459                         kn->kn_hookid &= ~NOTE_RESUME;
6460                 if (kn->kn_fflags & NOTE_RESUME)
6461                         kn->kn_hookid &= ~NOTE_SUSPEND;
6462
6463                 if (kn->kn_flags & EV_CLEAR) {
6464                         kn->kn_data = 0;
6465                         kn->kn_fflags = 0;
6466                 }
6467         }
6468
6469         socket_unlock(so, 1);
6470
6471         return ret;
6472 }
6473
6474 void
6475 get_sockev_state(struct socket *so, u_int32_t *statep)
6476 {
6477         u_int32_t state = *(statep);
6478
6479         /*
6480          * If the state variable is already used by a previous event,
6481          * reset it.
6482          */
6483         if (state != 0)
6484                 return;
6485
6486         if (so->so_state & SS_ISCONNECTED)
6487                 state |= SOCKEV_CONNECTED;
6488         else
6489                 state &= ~(SOCKEV_CONNECTED);
6490         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6491         *(statep) = state;
6492 }
6493
6494 #define SO_LOCK_HISTORY_STR_LEN \
6495         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6496
6497 __private_extern__ const char *
6498 solockhistory_nr(struct socket *so)
6499 {
6500         size_t n = 0;
6501         int i;
6502         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6503
6504         bzero(lock_history_str, sizeof (lock_history_str));
6505         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6506                 n += snprintf(lock_history_str + n,
6507                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6508                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6509                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6510         }
6511         return (lock_history_str);
6512 }
6513
6514 int
6515 socket_lock(struct socket *so, int refcount)
6516 {
6517         int error = 0;
6518         void *lr_saved;
6519
6520         lr_saved = __builtin_return_address(0);
6521
6522         if (so->so_proto->pr_lock) {
6523                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6524         } else {
6525 #ifdef MORE_LOCKING_DEBUG
6526                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6527                     LCK_MTX_ASSERT_NOTOWNED);
6528 #endif
6529                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6530                 if (refcount)
6531                         so->so_usecount++;
6532                 so->lock_lr[so->next_lock_lr] = lr_saved;
6533                 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6534         }
6535
6536         return (error);
6537 }
6538
6539 int
6540 socket_unlock(struct socket *so, int refcount)
6541 {
6542         int error = 0;
6543         void *lr_saved;
6544         lck_mtx_t *mutex_held;
6545
6546         lr_saved = __builtin_return_address(0);
6547
6548         if (so->so_proto == NULL) {
6549                 panic("%s: null so_proto so=%p\n", __func__, so);
6550                 /* NOTREACHED */
6551         }
6552
6553         if (so && so->so_proto->pr_unlock) {
6554                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6555         } else {
6556                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6557 #ifdef MORE_LOCKING_DEBUG
6558                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6559 #endif
6560                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6561                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6562
6563                 if (refcount) {
6564                         if (so->so_usecount <= 0) {
6565                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6566                                     "lrh=%s", __func__, so->so_usecount, so,
6567                                     SOCK_DOM(so), so->so_type,
6568                                     SOCK_PROTO(so), solockhistory_nr(so));
6569                                 /* NOTREACHED */
6570                         }
6571
6572                         so->so_usecount--;
6573                         if (so->so_usecount == 0)
6574                                 sofreelastref(so, 1);
6575                 }
6576                 lck_mtx_unlock(mutex_held);
6577         }
6578
6579         return (error);
6580 }
6581
6582 /* Called with socket locked, will unlock socket */
6583 void
6584 sofree(struct socket *so)
6585 {
6586         lck_mtx_t *mutex_held;
6587
6588         if (so->so_proto->pr_getlock != NULL)
6589                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6590         else
6591                 mutex_held = so->so_proto->pr_domain->dom_mtx;
6592         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6593
6594         sofreelastref(so, 0);
6595 }
6596
/*
 * soreference - take one reference on a socket.
 *
 * Locks the socket with the refcount argument set, then unlocks it
 * with the refcount argument clear so the reference stays held.
 */
void
soreference(struct socket *so)
{
	/* lock and take one reference on the socket */
	socket_lock(so, 1);
	/* unlock only; the reference is kept */
	socket_unlock(so, 0);
}
6603
/*
 * sodereference - drop one reference on a socket.
 *
 * Locks the socket with the refcount argument clear, then unlocks it
 * with the refcount argument set so that one reference is released.
 */
void
sodereference(struct socket *so)
{
	/* lock only; no additional reference is taken */
	socket_lock(so, 0);
	/* unlock and release one reference */
	socket_unlock(so, 1);
}
6610
6611 /*
6612  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6613  * possibility of using jumbo clusters.  Caller must ensure to hold
6614  * the socket lock.
6615  */
6616 void
6617 somultipages(struct socket *so, boolean_t set)
6618 {
6619         if (set)
6620                 so->so_flags |= SOF_MULTIPAGES;
6621         else
6622                 so->so_flags &= ~SOF_MULTIPAGES;
6623 }
6624
6625 void
6626 soif2kcl(struct socket *so, boolean_t set)
6627 {
6628         if (set)
6629                 so->so_flags1 |= SOF1_IF_2KCL;
6630         else
6631                 so->so_flags1 &= ~SOF1_IF_2KCL;
6632 }
6633
6634 int
6635 so_isdstlocal(struct socket *so) {
6636
6637         struct inpcb *inp = (struct inpcb *)so->so_pcb;
6638
6639         if (SOCK_DOM(so) == PF_INET)
6640                 return (inaddr_local(inp->inp_faddr));
6641         else if (SOCK_DOM(so) == PF_INET6)
6642                 return (in6addr_local(&inp->in6p_faddr));
6643
6644         return (0);
6645 }
6646
6647 int
6648 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6649 {
6650         struct sockbuf *rcv, *snd;
6651         int err = 0, defunct;
6652
6653         rcv = &so->so_rcv;
6654         snd = &so->so_snd;
6655
6656         defunct = (so->so_flags & SOF_DEFUNCT);
6657         if (defunct) {
6658                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6659                         panic("%s: SB_DROP not set", __func__);
6660                         /* NOTREACHED */
6661                 }
6662                 goto done;
6663         }
6664
6665         if (so->so_flags & SOF_NODEFUNCT) {
6666                 if (noforce) {
6667                         err = EOPNOTSUPP;
6668                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6669                             "name %s level %d) so 0x%llx [%d,%d] "
6670                             "is not eligible for defunct "
6671                             "(%d)\n", __func__, proc_selfpid(),
6672                             proc_best_name(current_proc()), proc_pid(p),
6673                             proc_best_name(p), level,
6674                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6675                             SOCK_DOM(so), SOCK_TYPE(so), err);
6676                         return (err);
6677                 }
6678                 so->so_flags &= ~SOF_NODEFUNCT;
6679                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6680                     "so 0x%llx [%d,%d] defunct by force\n", __func__,
6681                     proc_selfpid(), proc_best_name(current_proc()),
6682                     proc_pid(p), proc_best_name(p), level,
6683                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6684                     SOCK_DOM(so), SOCK_TYPE(so));
6685         } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6686                 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6687                 struct ifnet *ifp = inp->inp_last_outifp;
6688
6689                 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6690                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6691                 } else if (so->so_flags & SOF_DELEGATED) {
6692                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6693                 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6694                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6695                 } else if (noforce) {
6696                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6697
6698                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6699                         so->so_extended_bk_start = net_uptime();
6700                         OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6701
6702                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6703
6704                         err = EOPNOTSUPP;
6705                         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6706                             "level %d) extend bk idle so 0x%llx rcv hw %d "
6707                             "cc %d\n",
6708                             __func__, proc_selfpid(),
6709                             proc_best_name(current_proc()), proc_pid(p),
6710                             proc_best_name(p), level,
6711                             (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6712                             so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6713                         return (err);
6714                 } else {
6715                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6716                 }
6717         }
6718
6719         so->so_flags |= SOF_DEFUNCT;
6720
6721         /* Prevent further data from being appended to the socket buffers */
6722         snd->sb_flags |= SB_DROP;
6723         rcv->sb_flags |= SB_DROP;
6724
6725         /* Flush any existing data in the socket buffers */
6726         if (rcv->sb_cc != 0) {
6727                 rcv->sb_flags &= ~SB_SEL;
6728                 selthreadclear(&rcv->sb_sel);
6729                 sbrelease(rcv);
6730         }
6731         if (snd->sb_cc != 0) {
6732                 snd->sb_flags &= ~SB_SEL;
6733                 selthreadclear(&snd->sb_sel);
6734                 sbrelease(snd);
6735         }
6736
6737 done:
6738         SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6739             "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6740             proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6741             level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6742             SOCK_TYPE(so), defunct ? "is already" : "marked as",
6743             (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6744
6745         return (err);
6746 }
6747
6748 int
6749 sodefunct(struct proc *p, struct socket *so, int level)
6750 {
6751         struct sockbuf *rcv, *snd;
6752
6753         if (!(so->so_flags & SOF_DEFUNCT)) {
6754                 panic("%s improperly called", __func__);
6755                 /* NOTREACHED */
6756         }
6757         if (so->so_state & SS_DEFUNCT)
6758                 goto done;
6759
6760         rcv = &so->so_rcv;
6761         snd = &so->so_snd;
6762
6763         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6764                 char s[MAX_IPv6_STR_LEN];
6765                 char d[MAX_IPv6_STR_LEN];
6766                 struct inpcb *inp = sotoinpcb(so);
6767
6768                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6769                     "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6770                     "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6771                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6772                     proc_pid(p), proc_best_name(p), level,
6773                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6774                     (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6775                     inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6776                     (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6777                     s, sizeof (s)), ntohs(inp->in6p_lport),
6778                     inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6779                     (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6780                     d, sizeof (d)), ntohs(inp->in6p_fport),
6781                     (uint32_t)rcv->sb_sel.si_flags,
6782                     (uint32_t)snd->sb_sel.si_flags,
6783                     rcv->sb_flags, snd->sb_flags);
6784         } else {
6785                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6786                     "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6787                     "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6788                     proc_selfpid(), proc_best_name(current_proc()),
6789                     proc_pid(p), proc_best_name(p), level,
6790                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6791                     SOCK_DOM(so), SOCK_TYPE(so),
6792                     (uint32_t)rcv->sb_sel.si_flags,
6793                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6794                     snd->sb_flags);
6795         }
6796
6797         /*
6798          * Unwedge threads blocked on sbwait() and sb_lock().
6799          */
6800         sbwakeup(rcv);
6801         sbwakeup(snd);
6802
6803         so->so_flags1 |= SOF1_DEFUNCTINPROG;
6804         if (rcv->sb_flags & SB_LOCK)
6805                 sbunlock(rcv, TRUE);    /* keep socket locked */
6806         if (snd->sb_flags & SB_LOCK)
6807                 sbunlock(snd, TRUE);    /* keep socket locked */
6808
6809         /*
6810          * Flush the buffers and disconnect.  We explicitly call shutdown
6811          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6812          * states are set for the socket.  This would also flush out data
6813          * hanging off the receive list of this socket.
6814          */
6815         (void) soshutdownlock_final(so, SHUT_RD);
6816         (void) soshutdownlock_final(so, SHUT_WR);
6817         (void) sodisconnectlocked(so);
6818
6819         /*
6820          * Explicitly handle connectionless-protocol disconnection
6821          * and release any remaining data in the socket buffers.
6822          */
6823         if (!(so->so_flags & SS_ISDISCONNECTED))
6824                 (void) soisdisconnected(so);
6825
6826         if (so->so_error == 0)
6827                 so->so_error = EBADF;
6828
6829         if (rcv->sb_cc != 0) {
6830                 rcv->sb_flags &= ~SB_SEL;
6831                 selthreadclear(&rcv->sb_sel);
6832                 sbrelease(rcv);
6833         }
6834         if (snd->sb_cc != 0) {
6835                 snd->sb_flags &= ~SB_SEL;
6836                 selthreadclear(&snd->sb_sel);
6837                 sbrelease(snd);
6838         }
6839         so->so_state |= SS_DEFUNCT;
6840         OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6841
6842 done:
6843         return (0);
6844 }
6845
6846 int
6847 soresume(struct proc *p, struct socket *so, int locked)
6848 {
6849         if (locked == 0)
6850                 socket_lock(so, 1);
6851
6852         if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6853                 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6854                     "[%d,%d] resumed from bk idle\n",
6855                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6856                     proc_pid(p), proc_best_name(p),
6857                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6858                     SOCK_DOM(so), SOCK_TYPE(so));
6859
6860                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6861                 so->so_extended_bk_start = 0;
6862                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6863
6864                 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6865                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6866                 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6867         }
6868         if (locked == 0)
6869                 socket_unlock(so, 1);
6870
6871         return (0);
6872 }
6873
6874 /*
6875  * Does not attempt to account for sockets that are delegated from
6876  * the current process
6877  */
6878 int
6879 so_set_extended_bk_idle(struct socket *so, int optval)
6880 {
6881         int error = 0;
6882
6883         if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6884             SOCK_PROTO(so) != IPPROTO_TCP) {
6885                 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6886                 error = EOPNOTSUPP;
6887         } else if (optval == 0) {
6888                 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6889
6890                 soresume(current_proc(), so, 1);
6891         } else {
6892                 struct proc *p = current_proc();
6893                 int i;
6894                 struct filedesc *fdp;
6895                 int count = 0;
6896
6897                 proc_fdlock(p);
6898
6899                 fdp = p->p_fd;
6900                 for (i = 0; i < fdp->fd_nfiles; i++) {
6901                         struct fileproc *fp = fdp->fd_ofiles[i];
6902                         struct socket *so2;
6903
6904                         if (fp == NULL ||
6905                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6906                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6907                                 continue;
6908
6909                         so2 = (struct socket *)fp->f_fglob->fg_data;
6910                         if (so != so2 &&
6911                             so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6912                                 count++;
6913                         if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6914                                 break;
6915                 }
6916                 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6917                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6918                         error = EBUSY;
6919                 } else if (so->so_flags & SOF_DELEGATED) {
6920                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6921                         error = EBUSY;
6922                 } else {
6923                         so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6924                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6925                 }
6926                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
6927                     "%s marked for extended bk idle\n",
6928                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6929                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6930                     SOCK_DOM(so), SOCK_TYPE(so),
6931                     (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6932                     "is" : "not");
6933
6934                 proc_fdunlock(p);
6935         }
6936
6937         return (error);
6938 }
6939
6940 static void
6941 so_stop_extended_bk_idle(struct socket *so)
6942 {
6943         so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6944         so->so_extended_bk_start = 0;
6945
6946         OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6947         VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6948         /*
6949          * Force defunct
6950          */
6951         sosetdefunct(current_proc(), so,
6952             SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6953         if (so->so_flags & SOF_DEFUNCT) {
6954                 sodefunct(current_proc(), so,
6955                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6956         }
6957 }
6958
6959 void
6960 so_drain_extended_bk_idle(struct socket *so)
6961 {
6962         if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6963                 /*
6964                  * Only penalize sockets that have outstanding data
6965                  */
6966                 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6967                         so_stop_extended_bk_idle(so);
6968
6969                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6970                 }
6971         }
6972 }
6973
6974 /*
6975  * Return values tells if socket is still in extended background idle
6976  */
6977 int
6978 so_check_extended_bk_idle_time(struct socket *so)
6979 {
6980         int ret = 1;
6981
6982         if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6983                 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
6984                     __func__, proc_selfpid(), proc_best_name(current_proc()),
6985                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6986                     SOCK_DOM(so), SOCK_TYPE(so));
6987                 if (net_uptime() - so->so_extended_bk_start >
6988                     soextbkidlestat.so_xbkidle_time) {
6989                         so_stop_extended_bk_idle(so);
6990
6991                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
6992
6993                         ret = 0;
6994                 } else {
6995                         struct inpcb *inp = (struct inpcb *)so->so_pcb;
6996
6997                         inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6998                         OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
6999                 }
7000         }
7001
7002         return (ret);
7003 }
7004
7005 void
7006 resume_proc_sockets(proc_t p)
7007 {
7008         if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7009                 struct filedesc *fdp;
7010                 int i;
7011
7012                 proc_fdlock(p);
7013                 fdp = p->p_fd;
7014                 for (i = 0; i < fdp->fd_nfiles; i++) {
7015                         struct fileproc *fp;
7016                         struct socket *so;
7017
7018                         fp = fdp->fd_ofiles[i];
7019                         if (fp == NULL ||
7020                             (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7021                             FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7022                                 continue;
7023
7024                         so = (struct socket *)fp->f_fglob->fg_data;
7025                         (void) soresume(p, so, 0);
7026                 }
7027                 proc_fdunlock(p);
7028
7029                 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7030         }
7031 }
7032
7033 __private_extern__ int
7034 so_set_recv_anyif(struct socket *so, int optval)
7035 {
7036         int ret = 0;
7037
7038 #if INET6
7039         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7040 #else
7041         if (SOCK_DOM(so) == PF_INET) {
7042 #endif /* !INET6 */
7043                 if (optval)
7044                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7045                 else
7046                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7047         }
7048
7049         return (ret);
7050 }
7051
7052 __private_extern__ int
7053 so_get_recv_anyif(struct socket *so)
7054 {
7055         int ret = 0;
7056
7057 #if INET6
7058         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7059 #else
7060         if (SOCK_DOM(so) == PF_INET) {
7061 #endif /* !INET6 */
7062                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7063         }
7064
7065         return (ret);
7066 }
7067
7068 int
7069 so_set_restrictions(struct socket *so, uint32_t vals)
7070 {
7071         int nocell_old, nocell_new;
7072         int noexpensive_old, noexpensive_new;
7073
7074         /*
7075          * Deny-type restrictions are trapdoors; once set they cannot be
7076          * unset for the lifetime of the socket.  This allows them to be
7077          * issued by a framework on behalf of the application without
7078          * having to worry that they can be undone.
7079          *
7080          * Note here that socket-level restrictions overrides any protocol
7081          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
7082          * socket restriction issued on the socket has a higher precendence
7083          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7084          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7085          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7086          */
7087         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7088         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7089         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7090             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7091             SO_RESTRICT_DENY_EXPENSIVE));
7092         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7093         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7094
7095         /* we can only set, not clear restrictions */
7096         if ((nocell_new - nocell_old) == 0 &&
7097             (noexpensive_new - noexpensive_old) == 0)
7098                 return (0);
7099 #if INET6
7100         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7101 #else
7102         if (SOCK_DOM(so) == PF_INET) {
7103 #endif /* !INET6 */
7104                 if (nocell_new - nocell_old != 0) {
7105                         /*
7106                          * if deny cellular is now set, do what's needed
7107                          * for INPCB
7108                          */
7109                         inp_set_nocellular(sotoinpcb(so));
7110                 }
7111                 if (noexpensive_new - noexpensive_old != 0) {
7112                         inp_set_noexpensive(sotoinpcb(so));
7113                 }
7114         }
7115
7116         return (0);
7117 }
7118
7119 uint32_t
7120 so_get_restrictions(struct socket *so)
7121 {
7122         return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7123             SO_RESTRICT_DENY_OUT |
7124             SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7125 }
7126
7127 struct sockaddr_entry *
7128 sockaddrentry_alloc(int how)
7129 {
7130         struct sockaddr_entry *se;
7131
7132         se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
7133         if (se != NULL)
7134                 bzero(se, se_zone_size);
7135
7136         return (se);
7137 }
7138
7139 void
7140 sockaddrentry_free(struct sockaddr_entry *se)
7141 {
7142         if (se->se_addr != NULL) {
7143                 FREE(se->se_addr, M_SONAME);
7144                 se->se_addr = NULL;
7145         }
7146         zfree(se_zone, se);
7147 }
7148
7149 struct sockaddr_entry *
7150 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
7151 {
7152         struct sockaddr_entry *dst_se;
7153
7154         dst_se = sockaddrentry_alloc(how);
7155         if (dst_se != NULL) {
7156                 int len = src_se->se_addr->sa_len;
7157
7158                 MALLOC(dst_se->se_addr, struct sockaddr *,
7159                     len, M_SONAME, how | M_ZERO);
7160                 if (dst_se->se_addr != NULL) {
7161                         bcopy(src_se->se_addr, dst_se->se_addr, len);
7162                 } else {
7163                         sockaddrentry_free(dst_se);
7164                         dst_se = NULL;
7165                 }
7166         }
7167
7168         return (dst_se);
7169 }
7170
7171 struct sockaddr_list *
7172 sockaddrlist_alloc(int how)
7173 {
7174         struct sockaddr_list *sl;
7175
7176         sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
7177         if (sl != NULL) {
7178                 bzero(sl, sl_zone_size);
7179                 TAILQ_INIT(&sl->sl_head);
7180         }
7181         return (sl);
7182 }
7183
7184 void
7185 sockaddrlist_free(struct sockaddr_list *sl)
7186 {
7187         struct sockaddr_entry *se, *tse;
7188
7189         TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
7190                 sockaddrlist_remove(sl, se);
7191                 sockaddrentry_free(se);
7192         }
7193         VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
7194         zfree(sl_zone, sl);
7195 }
7196
7197 void
7198 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
7199 {
7200         VERIFY(!(se->se_flags & SEF_ATTACHED));
7201         se->se_flags |= SEF_ATTACHED;
7202         TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
7203         sl->sl_cnt++;
7204         VERIFY(sl->sl_cnt != 0);
7205 }
7206
7207 void
7208 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
7209 {
7210         VERIFY(se->se_flags & SEF_ATTACHED);
7211         se->se_flags &= ~SEF_ATTACHED;
7212         VERIFY(sl->sl_cnt != 0);
7213         sl->sl_cnt--;
7214         TAILQ_REMOVE(&sl->sl_head, se, se_link);
7215 }
7216
7217 struct sockaddr_list *
7218 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
7219 {
7220         struct sockaddr_entry *src_se, *tse;
7221         struct sockaddr_list *dst_sl;
7222
7223         dst_sl = sockaddrlist_alloc(how);
7224         if (dst_sl == NULL)
7225                 return (NULL);
7226
7227         TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
7228                 struct sockaddr_entry *dst_se;
7229
7230                 if (src_se->se_addr == NULL)
7231                         continue;
7232
7233                 dst_se = sockaddrentry_dup(src_se, how);
7234                 if (dst_se == NULL) {
7235                         sockaddrlist_free(dst_sl);
7236                         return (NULL);
7237                 }
7238
7239                 sockaddrlist_insert(dst_sl, dst_se);
7240         }
7241         VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
7242
7243         return (dst_sl);
7244 }
7245
7246 int
7247 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7248 {
7249         struct proc *ep = PROC_NULL;
7250         int error = 0;
7251
7252         /* pid 0 is reserved for kernel */
7253         if (epid == 0) {
7254                 error = EINVAL;
7255                 goto done;
7256         }
7257
7258         /*
7259          * If this is an in-kernel socket, prevent its delegate
7260          * association from changing unless the socket option is
7261          * coming from within the kernel itself.
7262          */
7263         if (so->last_pid == 0 && p != kernproc) {
7264                 error = EACCES;
7265                 goto done;
7266         }
7267
7268         /*
7269          * If this is issued by a process that's recorded as the
7270          * real owner of the socket, or if the pid is the same as
7271          * the process's own pid, then proceed.  Otherwise ensure
7272          * that the issuing process has the necessary privileges.
7273          */
7274         if (epid != so->last_pid || epid != proc_pid(p)) {
7275                 if ((error = priv_check_cred(kauth_cred_get(),
7276                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7277                         error = EACCES;
7278                         goto done;
7279                 }
7280         }
7281
7282         /* Find the process that corresponds to the effective pid */
7283         if ((ep = proc_find(epid)) == PROC_NULL) {
7284                 error = ESRCH;
7285                 goto done;
7286         }
7287
7288         /*
7289          * If a process tries to delegate the socket to itself, then
7290          * there's really nothing to do; treat it as a way for the
7291          * delegate association to be cleared.  Note that we check
7292          * the passed-in proc rather than calling proc_selfpid(),
7293          * as we need to check the process issuing the socket option
7294          * which could be kernproc.  Given that we don't allow 0 for
7295          * effective pid, it means that a delegated in-kernel socket
7296          * stays delegated during its lifetime (which is probably OK.)
7297          */
7298         if (epid == proc_pid(p)) {
7299                 so->so_flags &= ~SOF_DELEGATED;
7300                 so->e_upid = 0;
7301                 so->e_pid = 0;
7302                 uuid_clear(so->e_uuid);
7303         } else {
7304                 so->so_flags |= SOF_DELEGATED;
7305                 so->e_upid = proc_uniqueid(ep);
7306                 so->e_pid = proc_pid(ep);
7307                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7308         }
7309 done:
7310         if (error == 0 && net_io_policy_log) {
7311                 uuid_string_t buf;
7312
7313                 uuid_unparse(so->e_uuid, buf);
7314                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7315                     "euuid %s%s\n", __func__, proc_name_address(p),
7316                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7317                     SOCK_DOM(so), SOCK_TYPE(so),
7318                     so->e_pid, proc_name_address(ep), buf,
7319                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7320         } else if (error != 0 && net_io_policy_log) {
7321                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7322                     "ERROR (%d)\n", __func__, proc_name_address(p),
7323                     proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7324                     SOCK_DOM(so), SOCK_TYPE(so),
7325                     epid, (ep == PROC_NULL) ? "PROC_NULL" :
7326                     proc_name_address(ep), error);
7327         }
7328
7329         /* Update this socket's policy upon success */
7330         if (error == 0) {
7331                 so->so_policy_gencnt *= -1;
7332                 so_update_policy(so);
7333 #if NECP
7334                 so_update_necp_policy(so, NULL, NULL);
7335 #endif /* NECP */
7336         }
7337
7338         if (ep != PROC_NULL)
7339                 proc_rele(ep);
7340
7341         return (error);
7342 }
7343
7344 int
7345 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7346 {
7347         uuid_string_t buf;
7348         uuid_t uuid;
7349         int error = 0;
7350
7351         /* UUID must not be all-zeroes (reserved for kernel) */
7352         if (uuid_is_null(euuid)) {
7353                 error = EINVAL;
7354                 goto done;
7355         }
7356
7357         /*
7358          * If this is an in-kernel socket, prevent its delegate
7359          * association from changing unless the socket option is
7360          * coming from within the kernel itself.
7361          */
7362         if (so->last_pid == 0 && p != kernproc) {
7363                 error = EACCES;
7364                 goto done;
7365         }
7366
7367         /* Get the UUID of the issuing process */
7368         proc_getexecutableuuid(p, uuid, sizeof (uuid));
7369
7370         /*
7371          * If this is issued by a process that's recorded as the
7372          * real owner of the socket, or if the uuid is the same as
7373          * the process's own uuid, then proceed.  Otherwise ensure
7374          * that the issuing process has the necessary privileges.
7375          */
7376         if (uuid_compare(euuid, so->last_uuid) != 0 ||
7377             uuid_compare(euuid, uuid) != 0) {
7378                 if ((error = priv_check_cred(kauth_cred_get(),
7379                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7380                         error = EACCES;
7381                         goto done;
7382                 }
7383         }
7384
7385         /*
7386          * If a process tries to delegate the socket to itself, then
7387          * there's really nothing to do; treat it as a way for the
7388          * delegate association to be cleared.  Note that we check
7389          * the uuid of the passed-in proc rather than that of the
7390          * current process, as we need to check the process issuing
7391          * the socket option which could be kernproc itself.  Given
7392          * that we don't allow 0 for effective uuid, it means that
7393          * a delegated in-kernel socket stays delegated during its
7394          * lifetime (which is okay.)
7395          */
7396         if (uuid_compare(euuid, uuid) == 0) {
7397                 so->so_flags &= ~SOF_DELEGATED;
7398                 so->e_upid = 0;
7399                 so->e_pid = 0;
7400                 uuid_clear(so->e_uuid);
7401         } else {
7402                 so->so_flags |= SOF_DELEGATED;
7403                 /*
7404                  * Unlike so_set_effective_pid(), we only have the UUID
7405                  * here and the process ID is not known.  Inherit the
7406                  * real {pid,upid} of the socket.
7407                  */
7408                 so->e_upid = so->last_upid;
7409                 so->e_pid = so->last_pid;
7410                 uuid_copy(so->e_uuid, euuid);
7411         }
7412
7413 done:
7414         if (error == 0 && net_io_policy_log) {
7415                 uuid_unparse(so->e_uuid, buf);
7416                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7417                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7418                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7419                     SOCK_TYPE(so), so->e_pid, buf,
7420                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7421         } else if (error != 0 && net_io_policy_log) {
7422                 uuid_unparse(euuid, buf);
7423                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7424                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7425                     (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7426                     SOCK_TYPE(so), buf, error);
7427         }
7428
7429         /* Update this socket's policy upon success */
7430         if (error == 0) {
7431                 so->so_policy_gencnt *= -1;
7432                 so_update_policy(so);
7433 #if NECP
7434                 so_update_necp_policy(so, NULL, NULL);
7435 #endif /* NECP */
7436         }
7437
7438         return (error);
7439 }
7440
7441 void
7442 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7443     uint32_t ev_datalen)
7444 {
7445         struct kev_msg ev_msg;
7446
7447         /*
7448          * A netpolicy event always starts with a netpolicy_event_data
7449          * structure, but the caller can provide for a longer event
7450          * structure to post, depending on the event code.
7451          */
7452         VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7453
7454         bzero(&ev_msg, sizeof (ev_msg));
7455         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
7456         ev_msg.kev_class        = KEV_NETWORK_CLASS;
7457         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
7458         ev_msg.event_code       = ev_code;
7459
7460         ev_msg.dv[0].data_ptr   = ev_data;
7461         ev_msg.dv[0].data_length = ev_datalen;
7462
7463         kev_post_msg(&ev_msg);
7464 }
7465
7466 void
7467 socket_post_kev_msg(uint32_t ev_code,
7468     struct kev_socket_event_data *ev_data,
7469     uint32_t ev_datalen)
7470 {
7471         struct kev_msg ev_msg;
7472
7473         bzero(&ev_msg, sizeof(ev_msg));
7474         ev_msg.vendor_code = KEV_VENDOR_APPLE;
7475         ev_msg.kev_class = KEV_NETWORK_CLASS;
7476         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7477         ev_msg.event_code = ev_code;
7478
7479         ev_msg.dv[0].data_ptr = ev_data;
7480         ev_msg.dv[0]. data_length = ev_datalen;
7481
7482         kev_post_msg(&ev_msg);
7483 }
7484
7485 void
7486 socket_post_kev_msg_closed(struct socket *so)
7487 {
7488         struct kev_socket_closed ev;
7489         struct sockaddr *socksa = NULL, *peersa = NULL;
7490         int err;
7491         bzero(&ev, sizeof(ev));
7492         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7493         if (err == 0) {
7494                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7495                     &peersa);
7496                 if (err == 0) {
7497                         memcpy(&ev.ev_data.kev_sockname, socksa,
7498                             min(socksa->sa_len,
7499                             sizeof (ev.ev_data.kev_sockname)));
7500                         memcpy(&ev.ev_data.kev_peername, peersa,
7501                             min(peersa->sa_len,
7502                             sizeof (ev.ev_data.kev_peername)));
7503                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
7504                             &ev.ev_data, sizeof (ev));
7505                 }
7506         }
7507         if (socksa != NULL)
7508                 FREE(socksa, M_SONAME);
7509         if (peersa != NULL)
7510                 FREE(peersa, M_SONAME);
7511 }