1 /*
2 * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/in_tclass.h>
106 #include <netinet/tcp_var.h>
107 #include <netinet/ip6.h>
108 #include <netinet6/ip6_var.h>
109 #include <netinet/flow_divert.h>
110 #include <kern/zalloc.h>
111 #include <kern/locks.h>
112 #include <machine/limits.h>
113 #include <libkern/OSAtomic.h>
114 #include <pexpert/pexpert.h>
115 #include <kern/assert.h>
116 #include <kern/task.h>
117 #include <kern/policy_internal.h>
118
119 #include <sys/kpi_mbuf.h>
120 #include <sys/mcache.h>
121 #include <sys/unpcb.h>
122
123 #if CONFIG_MACF
124 #include <security/mac.h>
125 #include <security/mac_framework.h>
126 #endif /* MAC */
127
128 #if MULTIPATH
129 #include <netinet/mp_pcb.h>
130 #include <netinet/mptcp_var.h>
131 #endif /* MULTIPATH */
132
133 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
134
135 #if DEBUG || DEVELOPMENT
136 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
137 #else
138 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
139 #endif
140
141 /* TODO: this should be in a header file somewhere */
142 extern char *proc_name_address(void *p);
143 extern char *proc_best_name(proc_t);
144
145 static u_int32_t so_cache_hw; /* High water mark for socache */
146 static u_int32_t so_cache_timeouts; /* number of timeouts */
147 static u_int32_t so_cache_max_freed; /* max freed per timeout */
148 static u_int32_t cached_sock_count = 0;
149 STAILQ_HEAD(, socket) so_cache_head;
150 int max_cached_sock_count = MAX_CACHED_SOCKETS;
151 static u_int32_t so_cache_time;
152 static int socketinit_done;
153 static struct zone *so_cache_zone;
154
155 static lck_grp_t *so_cache_mtx_grp;
156 static lck_attr_t *so_cache_mtx_attr;
157 static lck_grp_attr_t *so_cache_mtx_grp_attr;
158 static lck_mtx_t *so_cache_mtx;
159
160 #include <machine/limits.h>
161
162 static int filt_sorattach(struct knote *kn);
163 static void filt_sordetach(struct knote *kn);
164 static int filt_soread(struct knote *kn, long hint);
165 static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
166 static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
167
168 static int filt_sowattach(struct knote *kn);
169 static void filt_sowdetach(struct knote *kn);
170 static int filt_sowrite(struct knote *kn, long hint);
171 static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
172 static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
173
174 static int filt_sockattach(struct knote *kn);
175 static void filt_sockdetach(struct knote *kn);
176 static int filt_sockev(struct knote *kn, long hint);
177 static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
178 static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
179
180 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
181 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183 struct filterops soread_filtops = {
184 .f_isfd = 1,
185 .f_attach = filt_sorattach,
186 .f_detach = filt_sordetach,
187 .f_event = filt_soread,
188 .f_touch = filt_sortouch,
189 .f_process = filt_sorprocess,
190 };
191
192 struct filterops sowrite_filtops = {
193 .f_isfd = 1,
194 .f_attach = filt_sowattach,
195 .f_detach = filt_sowdetach,
196 .f_event = filt_sowrite,
197 .f_touch = filt_sowtouch,
198 .f_process = filt_sowprocess,
199 };
200
201 struct filterops sock_filtops = {
202 .f_isfd = 1,
203 .f_attach = filt_sockattach,
204 .f_detach = filt_sockdetach,
205 .f_event = filt_sockev,
206 .f_touch = filt_socktouch,
207 .f_process = filt_sockprocess,
208 };
209
210 struct filterops soexcept_filtops = {
211 .f_isfd = 1,
212 .f_attach = filt_sorattach,
213 .f_detach = filt_sordetach,
214 .f_event = filt_soread,
215 .f_touch = filt_sortouch,
216 .f_process = filt_sorprocess,
217 };
218
219 SYSCTL_DECL(_kern_ipc);
220
221 #define EVEN_MORE_LOCKING_DEBUG 0
222
223 int socket_debug = 0;
224 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
226
227 static unsigned long sodefunct_calls = 0;
228 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229 &sodefunct_calls, "");
230
231 static int socket_zone = M_SOCKET;
232 so_gen_t so_gencnt; /* generation count for sockets */
233
234 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249 int somaxconn = SOMAXCONN;
250 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253 /* Should we get a maximum also ??? */
254 static int sosendmaxchain = 65536;
255 static int sosendminchain = 16384;
256 static int sorecvmincopy = 16384;
257 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262 /*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266 int sosendjcl = 1;
267 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270 /*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281 int sosendjcl_ignore_capab = 0;
282 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285 /*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger than 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface.
292 * Set this to 1 for testing/debugging purposes only.
293 */
294 int sosendbigcl_ignore_capab = 0;
295 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298 int sodefunctlog = 0;
299 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302 int sothrottlelog = 0;
303 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306 int sorestrictrecv = 1;
307 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310 int sorestrictsend = 1;
311 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314 int soreserveheadroom = 1;
315 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318 #if (DEBUG || DEVELOPMENT)
319 int so_notsent_lowat_check = 1;
320 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
322 #endif /* DEBUG || DEVELOPMENT */
323
324 int so_accept_list_waits = 0;
325 #if (DEBUG || DEVELOPMENT)
326 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328 #endif /* DEBUG || DEVELOPMENT */
329
330 extern struct inpcbinfo tcbinfo;
331
332 /* TODO: these should be in header file */
333 extern int get_inpcb_str_size(void);
334 extern int get_tcp_str_size(void);
335
336 vm_size_t so_cache_zone_element_size;
337
338 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340 static void cached_sock_alloc(struct socket **, int);
341 static void cached_sock_free(struct socket *);
342
343 /*
344 * Maximum number of extended background idle sockets per process.
345 * Set to zero to disable further setting of the option.
346 */
347
348 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349 #define SO_IDLE_BK_IDLE_TIME 600
350 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352 struct soextbkidlestat soextbkidlestat;
353
354 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum number of extended background idle sockets per process");
357
358 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369 int so_set_extended_bk_idle(struct socket *, int);
370
371 /*
372 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
373 * setting the DSCP code on the packet based on the service class; see
374 * <rdar://problem/11277343> for details.
375 */
376 __private_extern__ u_int32_t sotcdb = 0;
377 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
378 &sotcdb, 0, "");
379
380 void
381 socketinit(void)
382 {
383 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
384 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
385
386 #ifdef __LP64__
387 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
388 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
391 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
393 #else
394 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
395 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
398 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
400 #endif
401
402 if (socketinit_done) {
403 printf("socketinit: already called...\n");
404 return;
405 }
406 socketinit_done = 1;
407
408 PE_parse_boot_argn("socket_debug", &socket_debug,
409 sizeof (socket_debug));
410
411 /*
412 * allocate lock group attribute and group for socket cache mutex
413 */
414 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
415 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
416 so_cache_mtx_grp_attr);
417
418 /*
419 * allocate the lock attribute for socket cache mutex
420 */
421 so_cache_mtx_attr = lck_attr_alloc_init();
422
423 /* cached sockets mutex */
424 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
425 if (so_cache_mtx == NULL) {
426 panic("%s: unable to allocate so_cache_mtx\n", __func__);
427 /* NOTREACHED */
428 }
429 STAILQ_INIT(&so_cache_head);
430
431 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
432 + get_inpcb_str_size() + 4 + get_tcp_str_size());
433
434 so_cache_zone = zinit(so_cache_zone_element_size,
435 (120000 * so_cache_zone_element_size), 8192, "socache zone");
436 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
437 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
438
439 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
440 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
441 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
442 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
443
444 in_pcbinit();
445 sflt_init();
446 socket_tclass_init();
447 #if MULTIPATH
448 mp_pcbinit();
449 #endif /* MULTIPATH */
450 }
451
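/*
 * cached_sock_alloc: hand out a socket from the cache if one is
 * available (reusing its saved PCB block), otherwise carve a new
 * socket plus inpcb/tcpcb storage out of a single so_cache_zone
 * element and record the saved-PCB offsets for later reuse.
 */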
452 static void
453 cached_sock_alloc(struct socket **so, int waitok)
454 {
455 caddr_t temp;
456 uintptr_t offset;
457
458 lck_mtx_lock(so_cache_mtx);
459
460 if (!STAILQ_EMPTY(&so_cache_head)) {
461 VERIFY(cached_sock_count > 0);
462
463 *so = STAILQ_FIRST(&so_cache_head);
464 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
465 STAILQ_NEXT((*so), so_cache_ent) = NULL;
466
467 cached_sock_count--;
468 lck_mtx_unlock(so_cache_mtx);
469
470 temp = (*so)->so_saved_pcb;
471 bzero((caddr_t)*so, sizeof (struct socket));
472
473 (*so)->so_saved_pcb = temp;
474 } else {
475
476 lck_mtx_unlock(so_cache_mtx);
477
478 if (waitok)
479 *so = (struct socket *)zalloc(so_cache_zone);
480 else
481 *so = (struct socket *)zalloc_noblock(so_cache_zone);
482
483 if (*so == NULL)
484 return;
485
486 bzero((caddr_t)*so, sizeof (struct socket));
487
488 /*
489 * Define offsets for extra structures into our
490 * single block of memory. Align extra structures
491 * on longword boundaries.
492 */
493
494 offset = (uintptr_t)*so;
495 offset += sizeof (struct socket);
496
497 offset = ALIGN(offset);
498
499 (*so)->so_saved_pcb = (caddr_t)offset;
500 offset += get_inpcb_str_size();
501
502 offset = ALIGN(offset);
503
504 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
505 (caddr_t)offset;
506 }
507
508 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
509 }
510
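/*
 * cached_sock_free: return a socket allocated by cached_sock_alloc()
 * to the cache list, or free it back to the zone right away when the
 * cache already holds max_cached_sock_count entries. The timestamp
 * recorded here is what so_cache_timer() uses to age out entries.
 */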
511 static void
512 cached_sock_free(struct socket *so)
513 {
514
515 lck_mtx_lock(so_cache_mtx);
516
517 so_cache_time = net_uptime();
518 if (++cached_sock_count > max_cached_sock_count) {
519 --cached_sock_count;
520 lck_mtx_unlock(so_cache_mtx);
521 zfree(so_cache_zone, so);
522 } else {
523 if (so_cache_hw < cached_sock_count)
524 so_cache_hw = cached_sock_count;
525
526 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
527
528 so->cache_timestamp = so_cache_time;
529 lck_mtx_unlock(so_cache_mtx);
530 }
531 }
532
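/*
 * so_update_last_owner_locked: record the pid, unique pid and
 * executable UUID of the most recent process to use this socket,
 * for accounting and diagnostics. In-kernel sockets created via
 * sock_socket() keep last_pid == 0 and are left untouched.
 */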
533 void
534 so_update_last_owner_locked(struct socket *so, proc_t self)
535 {
536 if (so->last_pid != 0) {
537 /*
538 * last_pid and last_upid should remain zero for sockets
539 * created using sock_socket. The check above achieves that
540 */
541 if (self == PROC_NULL)
542 self = current_proc();
543
544 if (so->last_upid != proc_uniqueid(self) ||
545 so->last_pid != proc_pid(self)) {
546 so->last_upid = proc_uniqueid(self);
547 so->last_pid = proc_pid(self);
548 proc_getexecutableuuid(self, so->last_uuid,
549 sizeof (so->last_uuid));
550 }
551 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
552 }
553 }
554
555 void
556 so_update_policy(struct socket *so)
557 {
558 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
559 (void) inp_update_policy(sotoinpcb(so));
560 }
561
562 #if NECP
563 static void
564 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
565 struct sockaddr *override_remote_addr)
566 {
567 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
568 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
569 override_remote_addr, 0);
570 }
571 #endif /* NECP */
572
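/*
 * so_cache_timer: periodically free cached sockets that have been
 * sitting in the cache longer than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation. Returns TRUE when entries
 * remain so that the caller reschedules the timer.
 */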
573 boolean_t
574 so_cache_timer(void)
575 {
576 struct socket *p;
577 int n_freed = 0;
578 boolean_t rc = FALSE;
579
580 lck_mtx_lock(so_cache_mtx);
581 so_cache_timeouts++;
582 so_cache_time = net_uptime();
583
584 while (!STAILQ_EMPTY(&so_cache_head)) {
585 VERIFY(cached_sock_count > 0);
586 p = STAILQ_FIRST(&so_cache_head);
587 if ((so_cache_time - p->cache_timestamp) <
588 SO_CACHE_TIME_LIMIT)
589 break;
590
591 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
592 --cached_sock_count;
593
594 zfree(so_cache_zone, p);
595
596 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
597 so_cache_max_freed++;
598 break;
599 }
600 }
601
602 /* Schedule again if there is more to clean up */
603 if (!STAILQ_EMPTY(&so_cache_head))
604 rc = TRUE;
605
606 lck_mtx_unlock(so_cache_mtx);
607 return (rc);
608 }
609
610 /*
611 * Get a socket structure from our zone, and initialize it.
612 * We don't implement `waitok' yet (see comments in uipc_domain.c).
613 * Note that it would probably be better to allocate socket
614 * and PCB at the same time, but I'm not convinced that all
615 * the protocols can be easily modified to do this.
616 */
617 struct socket *
618 soalloc(int waitok, int dom, int type)
619 {
620 struct socket *so;
621
622 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
623 cached_sock_alloc(&so, waitok);
624 } else {
625 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
626 M_WAITOK);
627 if (so != NULL)
628 bzero(so, sizeof (*so));
629 }
630 if (so != NULL) {
631 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
632 so->so_zone = socket_zone;
633 #if CONFIG_MACF_SOCKET
634 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
635 if (mac_socket_label_init(so, !waitok) != 0) {
636 sodealloc(so);
637 return (NULL);
638 }
639 #endif /* MAC_SOCKET */
640 }
641
642 return (so);
643 }
644
645 int
646 socreate_internal(int dom, struct socket **aso, int type, int proto,
647 struct proc *p, uint32_t flags, struct proc *ep)
648 {
649 struct protosw *prp;
650 struct socket *so;
651 int error = 0;
652
653 #if TCPDEBUG
654 extern int tcpconsdebug;
655 #endif
656
657 VERIFY(aso != NULL);
658 *aso = NULL;
659
660 if (proto != 0)
661 prp = pffindproto(dom, proto, type);
662 else
663 prp = pffindtype(dom, type);
664
665 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
666 if (pffinddomain(dom) == NULL)
667 return (EAFNOSUPPORT);
668 if (proto != 0) {
669 if (pffindprotonotype(dom, proto) != NULL)
670 return (EPROTOTYPE);
671 }
672 return (EPROTONOSUPPORT);
673 }
674 if (prp->pr_type != type)
675 return (EPROTOTYPE);
676 so = soalloc(1, dom, type);
677 if (so == NULL)
678 return (ENOBUFS);
679
680 if (flags & SOCF_ASYNC)
681 so->so_state |= SS_NBIO;
682 #if MULTIPATH
683 if (flags & SOCF_MP_SUBFLOW) {
684 /*
685 * A multipath subflow socket is used internally in the kernel,
686 * therefore it does not have a file descriptor associated by
687 * default.
688 */
689 so->so_state |= SS_NOFDREF;
690 so->so_flags |= SOF_MP_SUBFLOW;
691 }
692 #endif /* MULTIPATH */
693
694 TAILQ_INIT(&so->so_incomp);
695 TAILQ_INIT(&so->so_comp);
696 so->so_type = type;
697 so->last_upid = proc_uniqueid(p);
698 so->last_pid = proc_pid(p);
699 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
700 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
701
702 if (ep != PROC_NULL && ep != p) {
703 so->e_upid = proc_uniqueid(ep);
704 so->e_pid = proc_pid(ep);
705 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
706 so->so_flags |= SOF_DELEGATED;
707 }
708
709 so->so_cred = kauth_cred_proc_ref(p);
710 if (!suser(kauth_cred_get(), NULL))
711 so->so_state |= SS_PRIV;
712
713 so->so_proto = prp;
714 so->so_rcv.sb_flags |= SB_RECV;
715 so->so_rcv.sb_so = so->so_snd.sb_so = so;
716 so->next_lock_lr = 0;
717 so->next_unlock_lr = 0;
718
719 #if CONFIG_MACF_SOCKET
720 mac_socket_label_associate(kauth_cred_get(), so);
721 #endif /* MAC_SOCKET */
722
723 /*
724 * Attachment will create the per-pcb lock if necessary and
725 * increase the refcount for creation; make sure this is done
726 * before the socket is inserted in the lists.
727 */
728 so->so_usecount++;
729
730 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
731 if (error != 0) {
732 /*
733 * Warning:
734 * If so_pcb is not zero, the socket will be leaked,
735 * so the protocol attachment handler must be coded carefully
736 */
737 so->so_state |= SS_NOFDREF;
738 VERIFY(so->so_usecount > 0);
739 so->so_usecount--;
740 sofreelastref(so, 1); /* will deallocate the socket */
741 return (error);
742 }
743
744 atomic_add_32(&prp->pr_domain->dom_refs, 1);
745 TAILQ_INIT(&so->so_evlist);
746
747 /* Attach socket filters for this protocol */
748 sflt_initsock(so);
749 #if TCPDEBUG
750 if (tcpconsdebug == 2)
751 so->so_options |= SO_DEBUG;
752 #endif
753 so_set_default_traffic_class(so);
754
755 /*
756 * If this thread or task is marked to create backgrounded sockets,
757 * mark the socket as background.
758 */
759 if (proc_get_effective_thread_policy(current_thread(),
760 TASK_POLICY_NEW_SOCKETS_BG)) {
761 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
762 so->so_background_thread = current_thread();
763 }
764
765 switch (dom) {
766 /*
767 * Don't mark Unix domain, system or multipath sockets as
768 * eligible for defunct by default.
769 */
770 case PF_LOCAL:
771 case PF_SYSTEM:
772 case PF_MULTIPATH:
773 so->so_flags |= SOF_NODEFUNCT;
774 break;
775 default:
776 break;
777 }
778
779 /*
780 * Entitlements can't be checked at socket creation time except if the
781 * application requested a feature guarded by a privilege (c.f., socket
782 * delegation).
783 * The priv(9) and the Sandboxing APIs are designed with the idea that
784 * a privilege check should only be triggered by a userland request.
785 * A privilege check at socket creation time is time consuming and
786 * could trigger many authorisation error messages from the security
787 * APIs.
788 */
789
790 *aso = so;
791
792 return (0);
793 }
794
795 /*
796 * Returns: 0 Success
797 * EAFNOSUPPORT
798 * EPROTOTYPE
799 * EPROTONOSUPPORT
800 * ENOBUFS
801 * <pru_attach>:ENOBUFS[AF_UNIX]
802 * <pru_attach>:ENOBUFS[TCP]
803 * <pru_attach>:ENOMEM[TCP]
804 * <pru_attach>:??? [other protocol families, IPSEC]
805 */
806 int
807 socreate(int dom, struct socket **aso, int type, int proto)
808 {
809 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
810 PROC_NULL));
811 }
812
813 int
814 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
815 {
816 int error = 0;
817 struct proc *ep = PROC_NULL;
818
819 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
820 error = ESRCH;
821 goto done;
822 }
823
824 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
825
826 /*
827 * It might not be wise to hold the proc reference when calling
828 * socreate_internal since it calls soalloc with M_WAITOK
829 */
830 done:
831 if (ep != PROC_NULL)
832 proc_rele(ep);
833
834 return (error);
835 }
836
837 /*
838 * Returns: 0 Success
839 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
840 * <pru_bind>:EAFNOSUPPORT Address family not supported
841 * <pru_bind>:EADDRNOTAVAIL Address not available.
842 * <pru_bind>:EINVAL Invalid argument
843 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
844 * <pru_bind>:EACCES Permission denied
845 * <pru_bind>:EADDRINUSE Address in use
846 * <pru_bind>:EAGAIN Resource unavailable, try again
847 * <pru_bind>:EPERM Operation not permitted
848 * <pru_bind>:???
849 * <sf_bind>:???
850 *
851 * Notes: It's not possible to fully enumerate the return codes above,
852 * since socket filter authors and protocol family authors may
853 * not choose to limit their error returns to those listed, even
854 * though this may result in some software operating incorrectly.
855 *
856 * The error codes which are enumerated above are those known to
857 * be returned by the tcp_usr_bind function supplied.
858 */
859 int
860 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
861 {
862 struct proc *p = current_proc();
863 int error = 0;
864
865 if (dolock)
866 socket_lock(so, 1);
867 VERIFY(so->so_usecount > 1);
868
869 so_update_last_owner_locked(so, p);
870 so_update_policy(so);
871
872 #if NECP
873 so_update_necp_policy(so, nam, NULL);
874 #endif /* NECP */
875
876 /*
877 * If this is a bind request on a socket that has been marked
878 * as inactive, reject it now before we go any further.
879 */
880 if (so->so_flags & SOF_DEFUNCT) {
881 error = EINVAL;
882 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
883 __func__, proc_pid(p), proc_best_name(p),
884 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
885 SOCK_DOM(so), SOCK_TYPE(so), error);
886 goto out;
887 }
888
889 /* Socket filter */
890 error = sflt_bind(so, nam);
891
892 if (error == 0)
893 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
894 out:
895 if (dolock)
896 socket_unlock(so, 1);
897
898 if (error == EJUSTRETURN)
899 error = 0;
900
901 return (error);
902 }
903
904 void
905 sodealloc(struct socket *so)
906 {
907 kauth_cred_unref(&so->so_cred);
908
909 /* Remove any filters */
910 sflt_termsock(so);
911
912 #if CONTENT_FILTER
913 cfil_sock_detach(so);
914 #endif /* CONTENT_FILTER */
915
916 /* Delete the state allocated for msg queues on a socket */
917 if (so->so_flags & SOF_ENABLE_MSGS) {
918 FREE(so->so_msg_state, M_TEMP);
919 so->so_msg_state = NULL;
920 }
921 VERIFY(so->so_msg_state == NULL);
922
923 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
924
925 #if CONFIG_MACF_SOCKET
926 mac_socket_label_destroy(so);
927 #endif /* MAC_SOCKET */
928
929 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
930 cached_sock_free(so);
931 } else {
932 FREE_ZONE(so, sizeof (*so), so->so_zone);
933 }
934 }
935
936 /*
937 * Returns: 0 Success
938 * EINVAL
939 * EOPNOTSUPP
940 * <pru_listen>:EINVAL[AF_UNIX]
941 * <pru_listen>:EINVAL[TCP]
942 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
943 * <pru_listen>:EINVAL[TCP] Invalid argument
944 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
945 * <pru_listen>:EACCES[TCP] Permission denied
946 * <pru_listen>:EADDRINUSE[TCP] Address in use
947 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
948 * <pru_listen>:EPERM[TCP] Operation not permitted
949 * <sf_listen>:???
950 *
951 * Notes: Other <pru_listen> returns depend on the protocol family; all
952 * <sf_listen> returns depend on what the filter author causes
953 * their filter to return.
954 */
955 int
956 solisten(struct socket *so, int backlog)
957 {
958 struct proc *p = current_proc();
959 int error = 0;
960
961 socket_lock(so, 1);
962
963 so_update_last_owner_locked(so, p);
964 so_update_policy(so);
965
966 #if NECP
967 so_update_necp_policy(so, NULL, NULL);
968 #endif /* NECP */
969
970 if (so->so_proto == NULL) {
971 error = EINVAL;
972 goto out;
973 }
974 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
975 error = EOPNOTSUPP;
976 goto out;
977 }
978
979 /*
980 * If the listen request is made on a socket that is not fully
981 * disconnected, or on a socket that has been marked as inactive,
982 * reject the request now.
983 */
984 if ((so->so_state &
985 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
986 (so->so_flags & SOF_DEFUNCT)) {
987 error = EINVAL;
988 if (so->so_flags & SOF_DEFUNCT) {
989 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
990 "(%d)\n", __func__, proc_pid(p),
991 proc_best_name(p),
992 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
993 SOCK_DOM(so), SOCK_TYPE(so), error);
994 }
995 goto out;
996 }
997
998 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
999 error = EPERM;
1000 goto out;
1001 }
1002
1003 error = sflt_listen(so);
1004 if (error == 0)
1005 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1006
1007 if (error) {
1008 if (error == EJUSTRETURN)
1009 error = 0;
1010 goto out;
1011 }
1012
1013 if (TAILQ_EMPTY(&so->so_comp))
1014 so->so_options |= SO_ACCEPTCONN;
1015 /*
1016 * POSIX: The implementation may have an upper limit on the length of
1017 * the listen queue, either global or per accepting socket. If backlog
1018 * exceeds this limit, the length of the listen queue is set to the
1019 * limit.
1020 *
1021 * If listen() is called with a backlog argument value that is less
1022 * than 0, the function behaves as if it had been called with a backlog
1023 * argument value of 0.
1024 *
1025 * A backlog argument of 0 may allow the socket to accept connections,
1026 * in which case the length of the listen queue may be set to an
1027 * implementation-defined minimum value.
1028 */
1029 if (backlog <= 0 || backlog > somaxconn)
1030 backlog = somaxconn;
1031
1032 so->so_qlimit = backlog;
1033 out:
1034 socket_unlock(so, 1);
1035 return (error);
1036 }
1037
1038 /*
1039 * The "accept list lock" protects the fields related to the listener queues
1040 * because we can unlock a socket to respect the lock ordering between
1041 * the listener socket and its client sockets. The lock ordering requires
1042 * acquiring the client socket before the listener socket.
1043 *
1044 * The accept list lock serializes access to the following fields:
1045 * - of the listener socket:
1046 * - so_comp
1047 * - so_incomp
1048 * - so_qlen
1049 * - so_inqlen
1050 * - of client sockets that are in so_comp or so_incomp:
1051 * - so_head
1052 * - so_list
1053 *
1054 * As one can see, the accept list lock protects the consistency of the
1055 * linkage of the client sockets.
1056 *
1057 * Note that those fields may be read without holding the accept list lock
1058 * for a preflight provided the accept list lock is taken when committing
1059 * to take an action based on the result of the preflight. The preflight
1060 * saves the cost of doing the unlock/lock dance.
1061 */
1062 void
1063 so_acquire_accept_list(struct socket *head, struct socket *so)
1064 {
1065 lck_mtx_t *mutex_held;
1066
1067 if (head->so_proto->pr_getlock == NULL) {
1068 return;
1069 }
1070 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1071 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1072
1073 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1074 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1075 return;
1076 }
1077 if (so != NULL) {
1078 socket_unlock(so, 0);
1079 }
1080 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1081 so_accept_list_waits += 1;
1082 msleep((caddr_t)&head->so_incomp, mutex_held,
1083 PSOCK | PCATCH, __func__, NULL);
1084 }
1085 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1086 if (so != NULL) {
1087 socket_unlock(head, 0);
1088 socket_lock(so, 0);
1089 socket_lock(head, 0);
1090 }
1091 }
1092
1093 void
1094 so_release_accept_list(struct socket *head)
1095 {
1096 if (head->so_proto->pr_getlock != NULL) {
1097 lck_mtx_t *mutex_held;
1098
1099 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1100 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1101
1102 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1103 wakeup((caddr_t)&head->so_incomp);
1104 }
1105 }
1106
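/*
 * sofreelastref: drop the last reference on a socket. A socket still
 * holding its pcb or file reference only has its select threads and
 * upcalls cleared; otherwise it is unlinked from its listener's
 * incomplete queue (sockets on the completed queue are left alone so
 * accept(2) can still return them), its buffers are flushed, and it
 * is deallocated when 'dealloc' is set.
 */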
1107 void
1108 sofreelastref(struct socket *so, int dealloc)
1109 {
1110 struct socket *head = so->so_head;
1111
1112 /* Assume socket is locked */
1113
1114 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1115 selthreadclear(&so->so_snd.sb_sel);
1116 selthreadclear(&so->so_rcv.sb_sel);
1117 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1118 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1119 so->so_event = sonullevent;
1120 return;
1121 }
1122 if (head != NULL) {
1123 /*
1124 * Need to lock the listener when the protocol has
1125 * per socket locks
1126 */
1127 if (head->so_proto->pr_getlock != NULL) {
1128 socket_lock(head, 1);
1129 so_acquire_accept_list(head, so);
1130 }
1131 if (so->so_state & SS_INCOMP) {
1132 so->so_state &= ~SS_INCOMP;
1133 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1134 head->so_incqlen--;
1135 head->so_qlen--;
1136 so->so_head = NULL;
1137
1138 if (head->so_proto->pr_getlock != NULL) {
1139 so_release_accept_list(head);
1140 socket_unlock(head, 1);
1141 }
1142 } else if (so->so_state & SS_COMP) {
1143 if (head->so_proto->pr_getlock != NULL) {
1144 so_release_accept_list(head);
1145 socket_unlock(head, 1);
1146 }
1147 /*
1148 * We must not decommission a socket that's
1149 * on the accept(2) queue. If we do, then
1150 * accept(2) may hang after select(2) indicated
1151 * that the listening socket was ready.
1152 */
1153 selthreadclear(&so->so_snd.sb_sel);
1154 selthreadclear(&so->so_rcv.sb_sel);
1155 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1156 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1157 so->so_event = sonullevent;
1158 return;
1159 } else {
1160 if (head->so_proto->pr_getlock != NULL) {
1161 so_release_accept_list(head);
1162 socket_unlock(head, 1);
1163 }
1164 printf("sofree: not queued\n");
1165 }
1166 }
1167 sowflush(so);
1168 sorflush(so);
1169
1170 #if FLOW_DIVERT
1171 if (so->so_flags & SOF_FLOW_DIVERT) {
1172 flow_divert_detach(so);
1173 }
1174 #endif /* FLOW_DIVERT */
1175
1176 /* 3932268: disable upcall */
1177 so->so_rcv.sb_flags &= ~SB_UPCALL;
1178 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1179 so->so_event = sonullevent;
1180
1181 if (dealloc)
1182 sodealloc(so);
1183 }
1184
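/*
 * soclose_wait_locked: when SOF_UPCALLCLOSEWAIT is set and an upcall
 * is still outstanding, disable further upcalls and sleep until the
 * last upcall reference is dropped before the close proceeds.
 */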
1185 void
1186 soclose_wait_locked(struct socket *so)
1187 {
1188 lck_mtx_t *mutex_held;
1189
1190 if (so->so_proto->pr_getlock != NULL)
1191 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1192 else
1193 mutex_held = so->so_proto->pr_domain->dom_mtx;
1194 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1195
1196 /*
1197 * Double check here and return if there's no outstanding upcall;
1198 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1199 */
1200 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1201 return;
1202 so->so_rcv.sb_flags &= ~SB_UPCALL;
1203 so->so_snd.sb_flags &= ~SB_UPCALL;
1204 so->so_flags |= SOF_CLOSEWAIT;
1205 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1206 "soclose_wait_locked", NULL);
1207 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1208 so->so_flags &= ~SOF_CLOSEWAIT;
1209 }
1210
1211 /*
1212 * Close a socket on last file table reference removal.
1213 * Initiate disconnect if connected.
1214 * Free socket when disconnect complete.
1215 */
1216 int
1217 soclose_locked(struct socket *so)
1218 {
1219 int error = 0;
1220 struct timespec ts;
1221
1222 if (so->so_usecount == 0) {
1223 panic("soclose: so=%p refcount=0\n", so);
1224 /* NOTREACHED */
1225 }
1226
1227 sflt_notify(so, sock_evt_closing, NULL);
1228
1229 if (so->so_upcallusecount)
1230 soclose_wait_locked(so);
1231
1232 #if CONTENT_FILTER
1233 /*
1234 * We have to wait until the content filters are done
1235 */
1236 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1237 cfil_sock_close_wait(so);
1238 cfil_sock_is_closed(so);
1239 cfil_sock_detach(so);
1240 }
1241 #endif /* CONTENT_FILTER */
1242
1243 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1244 soresume(current_proc(), so, 1);
1245 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1246 }
1247
1248 if ((so->so_options & SO_ACCEPTCONN)) {
1249 struct socket *sp, *sonext;
1250 int persocklock = 0;
1251 int incomp_overflow_only;
1252
1253 /*
1254 * We do not want new connections to be added
1255 * to the connection queues
1256 */
1257 so->so_options &= ~SO_ACCEPTCONN;
1258
1259 /*
1260 * We can drop the lock on the listener once
1261 * we've acquired the incoming list
1262 */
1263 if (so->so_proto->pr_getlock != NULL) {
1264 persocklock = 1;
1265 so_acquire_accept_list(so, NULL);
1266 socket_unlock(so, 0);
1267 }
1268 again:
1269 incomp_overflow_only = 1;
1270
1271 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1272 /*
1273 * Radar 5350314
1274 * Skip sockets thrown away by tcp_dropdropablreq();
1275 * they will get cleaned up by the garbage collection.
1276 * Otherwise, remove the incomp socket from the queue
1277 * and let soabort() trigger the appropriate cleanup.
1278 */
1279 if (sp->so_flags & SOF_OVERFLOW)
1280 continue;
1281
1282 if (persocklock != 0)
1283 socket_lock(sp, 1);
1284
1285 /*
1286 * Radar 27945981
1287 * The extra reference for the list ensures the
1288 * validity of the socket pointer when we perform the
1289 * unlock of the head above.
1290 */
1291 if (sp->so_state & SS_INCOMP) {
1292 sp->so_state &= ~SS_INCOMP;
1293 sp->so_head = NULL;
1294 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1295 so->so_incqlen--;
1296 so->so_qlen--;
1297
1298 (void) soabort(sp);
1299 } else {
1300 panic("%s sp %p in so_incomp but !SS_INCOMP",
1301 __func__, sp);
1302 }
1303
1304 if (persocklock != 0)
1305 socket_unlock(sp, 1);
1306 }
1307
1308 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1309 /* Dequeue from so_comp since sofree() won't do it */
1310 if (persocklock != 0)
1311 socket_lock(sp, 1);
1312
1313 if (sp->so_state & SS_COMP) {
1314 sp->so_state &= ~SS_COMP;
1315 sp->so_head = NULL;
1316 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1317 so->so_qlen--;
1318
1319 (void) soabort(sp);
1320 } else {
1321 panic("%s sp %p in so_comp but !SS_COMP",
1322 __func__, sp);
1323 }
1324
1325 if (persocklock)
1326 socket_unlock(sp, 1);
1327 }
1328
1329 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1330 #if (DEBUG|DEVELOPMENT)
1331 panic("%s head %p so_incomp not empty\n", __func__, so);
1332 #endif /* (DEVELOPMENT || DEBUG) */
1333
1334 goto again;
1335 }
1336
1337 if (!TAILQ_EMPTY(&so->so_comp)) {
1338 #if (DEBUG|DEVELOPMENT)
1339 panic("%s head %p so_comp not empty\n", __func__, so);
1340 #endif /* (DEVELOPMENT || DEBUG) */
1341
1342 goto again;
1343 }
1344
1345 if (persocklock) {
1346 socket_lock(so, 0);
1347 so_release_accept_list(so);
1348 }
1349 }
1350 if (so->so_pcb == NULL) {
1351 /* 3915887: mark the socket as ready for dealloc */
1352 so->so_flags |= SOF_PCBCLEARING;
1353 goto discard;
1354 }
1355 if (so->so_state & SS_ISCONNECTED) {
1356 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1357 error = sodisconnectlocked(so);
1358 if (error)
1359 goto drop;
1360 }
1361 if (so->so_options & SO_LINGER) {
1362 lck_mtx_t *mutex_held;
1363
1364 if ((so->so_state & SS_ISDISCONNECTING) &&
1365 (so->so_state & SS_NBIO))
1366 goto drop;
1367 if (so->so_proto->pr_getlock != NULL)
1368 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1369 else
1370 mutex_held = so->so_proto->pr_domain->dom_mtx;
1371 while (so->so_state & SS_ISCONNECTED) {
1372 ts.tv_sec = (so->so_linger/100);
1373 ts.tv_nsec = (so->so_linger % 100) *
1374 NSEC_PER_USEC * 1000 * 10;
1375 error = msleep((caddr_t)&so->so_timeo,
1376 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1377 if (error) {
1378 /*
1379 * It's OK when the timer fires,
1380 * don't report an error
1381 */
1382 if (error == EWOULDBLOCK)
1383 error = 0;
1384 break;
1385 }
1386 }
1387 }
1388 }
1389 drop:
1390 if (so->so_usecount == 0) {
1391 panic("soclose: usecount is zero so=%p\n", so);
1392 /* NOTREACHED */
1393 }
1394 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1395 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1396 if (error == 0)
1397 error = error2;
1398 }
1399 if (so->so_usecount <= 0) {
1400 panic("soclose: usecount is zero so=%p\n", so);
1401 /* NOTREACHED */
1402 }
1403 discard:
1404 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1405 (so->so_state & SS_NOFDREF)) {
1406 panic("soclose: NOFDREF");
1407 /* NOTREACHED */
1408 }
1409 so->so_state |= SS_NOFDREF;
1410
1411 if (so->so_flags & SOF_MP_SUBFLOW)
1412 so->so_flags &= ~SOF_MP_SUBFLOW;
1413
1414 if ((so->so_flags & SOF_KNOTE) != 0)
1415 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1416
1417 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1418 evsofree(so);
1419
1420 VERIFY(so->so_usecount > 0);
1421 so->so_usecount--;
1422 sofree(so);
1423 return (error);
1424 }
1425
1426 int
1427 soclose(struct socket *so)
1428 {
1429 int error = 0;
1430 socket_lock(so, 1);
1431
1432 if (so->so_retaincnt == 0) {
1433 error = soclose_locked(so);
1434 } else {
1435 /*
1436 * if the FD is going away, but the socket is
1437 * retained in the kernel, remove its reference
1438 */
1439 so->so_usecount--;
1440 if (so->so_usecount < 2)
1441 panic("soclose: retaincnt non null and so=%p "
1442 "usecount=%d\n", so, so->so_usecount);
1443 }
1444 socket_unlock(so, 1);
1445 return (error);
1446 }
1447
1448 /*
1449 * Must be called at splnet...
1450 */
1451 /* Should already be locked */
1452 int
1453 soabort(struct socket *so)
1454 {
1455 int error;
1456
1457 #ifdef MORE_LOCKING_DEBUG
1458 lck_mtx_t *mutex_held;
1459
1460 if (so->so_proto->pr_getlock != NULL)
1461 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1462 else
1463 mutex_held = so->so_proto->pr_domain->dom_mtx;
1464 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1465 #endif
1466
1467 if ((so->so_flags & SOF_ABORTED) == 0) {
1468 so->so_flags |= SOF_ABORTED;
1469 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1470 if (error) {
1471 sofree(so);
1472 return (error);
1473 }
1474 }
1475 return (0);
1476 }
1477
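/*
 * soacceptlock: complete the accept of a queued connection, clearing
 * SS_NOFDREF now that a file descriptor is about to reference the
 * socket, and let the protocol return the peer address via 'nam'.
 */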
1478 int
1479 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1480 {
1481 int error;
1482
1483 if (dolock)
1484 socket_lock(so, 1);
1485
1486 so_update_last_owner_locked(so, PROC_NULL);
1487 so_update_policy(so);
1488 #if NECP
1489 so_update_necp_policy(so, NULL, NULL);
1490 #endif /* NECP */
1491
1492 if ((so->so_state & SS_NOFDREF) == 0)
1493 panic("soaccept: !NOFDREF");
1494 so->so_state &= ~SS_NOFDREF;
1495 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1496
1497 if (dolock)
1498 socket_unlock(so, 1);
1499 return (error);
1500 }
1501
1502 int
1503 soaccept(struct socket *so, struct sockaddr **nam)
1504 {
1505 return (soacceptlock(so, nam, 1));
1506 }
1507
1508 int
1509 soacceptfilter(struct socket *so, struct socket *head)
1510 {
1511 struct sockaddr *local = NULL, *remote = NULL;
1512 int error = 0;
1513
1514 /*
1515 * Hold the lock even if this socket has not been made visible
1516 * to the filter(s). For sockets with global locks, this protects
1517 * against the head or peer going away
1518 */
1519 socket_lock(so, 1);
1520 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1521 sogetaddr_locked(so, &local, 0) != 0) {
1522 so->so_state &= ~SS_NOFDREF;
1523 socket_unlock(so, 1);
1524 soclose(so);
1525 /* Out of resources; try it again next time */
1526 error = ECONNABORTED;
1527 goto done;
1528 }
1529
1530 error = sflt_accept(head, so, local, remote);
1531
1532 /*
1533 * If we get EJUSTRETURN from one of the filters, mark this socket
1534 * as inactive and return it anyway. This newly accepted socket
1535 * will be disconnected later before we hand it off to the caller.
1536 */
1537 if (error == EJUSTRETURN) {
1538 error = 0;
1539 (void) sosetdefunct(current_proc(), so,
1540 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1541 }
1542
1543 if (error != 0) {
1544 /*
1545 * This may seem like a duplication of the above error
1546 * handling part when we return ECONNABORTED, except
1547 * the following is done while holding the lock since
1548 * the socket has been exposed to the filter(s) earlier.
1549 */
1550 so->so_state &= ~SS_COMP;
1551 socket_unlock(so, 1);
1552 soclose(so);
1553 /* Propagate socket filter's error code to the caller */
1554 } else {
1555 socket_unlock(so, 1);
1556 }
1557 done:
1558 /* Callee checks for NULL pointer */
1559 sock_freeaddr(remote);
1560 sock_freeaddr(local);
1561 return (error);
1562 }
1563
1564 /*
1565 * Returns: 0 Success
1566 * EOPNOTSUPP Operation not supported on socket
1567 * EISCONN Socket is connected
1568 * <pru_connect>:EADDRNOTAVAIL Address not available.
1569 * <pru_connect>:EINVAL Invalid argument
1570 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1571 * <pru_connect>:EACCES Permission denied
1572 * <pru_connect>:EADDRINUSE Address in use
1573 * <pru_connect>:EAGAIN Resource unavailable, try again
1574 * <pru_connect>:EPERM Operation not permitted
1575 * <sf_connect_out>:??? [anything a filter writer might set]
1576 */
1577 int
1578 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1579 {
1580 int error;
1581 struct proc *p = current_proc();
1582
1583 if (dolock)
1584 socket_lock(so, 1);
1585
1586 so_update_last_owner_locked(so, p);
1587 so_update_policy(so);
1588
1589 #if NECP
1590 so_update_necp_policy(so, NULL, nam);
1591 #endif /* NECP */
1592
1593 /*
1594 * If this is a listening socket or if this is a previously-accepted
1595 * socket that has been marked as inactive, reject the connect request.
1596 */
1597 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1598 error = EOPNOTSUPP;
1599 if (so->so_flags & SOF_DEFUNCT) {
1600 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1601 "(%d)\n", __func__, proc_pid(p),
1602 proc_best_name(p),
1603 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1604 SOCK_DOM(so), SOCK_TYPE(so), error);
1605 }
1606 if (dolock)
1607 socket_unlock(so, 1);
1608 return (error);
1609 }
1610
1611 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1612 if (dolock)
1613 socket_unlock(so, 1);
1614 return (EPERM);
1615 }
1616
1617 /*
1618 * If protocol is connection-based, can only connect once.
1619 * Otherwise, if connected, try to disconnect first.
1620 * This allows user to disconnect by connecting to, e.g.,
1621 * a null address.
1622 */
1623 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1624 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1625 (error = sodisconnectlocked(so)))) {
1626 error = EISCONN;
1627 } else {
1628 /*
1629 * Run connect filter before calling protocol:
1630 * - non-blocking connect returns before completion;
1631 */
1632 error = sflt_connectout(so, nam);
1633 if (error != 0) {
1634 if (error == EJUSTRETURN)
1635 error = 0;
1636 } else {
1637 error = (*so->so_proto->pr_usrreqs->pru_connect)
1638 (so, nam, p);
1639 }
1640 }
1641 if (dolock)
1642 socket_unlock(so, 1);
1643 return (error);
1644 }
1645
1646 int
1647 soconnect(struct socket *so, struct sockaddr *nam)
1648 {
1649 return (soconnectlock(so, nam, 1));
1650 }
1651
1652 /*
1653 * Returns: 0 Success
1654 * <pru_connect2>:EINVAL[AF_UNIX]
1655 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1656 * <pru_connect2>:??? [other protocol families]
1657 *
1658 * Notes: <pru_connect2> is not supported by [TCP].
1659 */
1660 int
1661 soconnect2(struct socket *so1, struct socket *so2)
1662 {
1663 int error;
1664
1665 socket_lock(so1, 1);
1666 if (so2->so_proto->pr_lock)
1667 socket_lock(so2, 1);
1668
1669 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1670
1671 socket_unlock(so1, 1);
1672 if (so2->so_proto->pr_lock)
1673 socket_unlock(so2, 1);
1674 return (error);
1675 }
1676
1677 int
1678 soconnectxlocked(struct socket *so, struct sockaddr *src,
1679 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1680 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1681 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1682 {
1683 int error;
1684
1685 so_update_last_owner_locked(so, p);
1686 so_update_policy(so);
1687
1688 /*
1689 * If this is a listening socket or if this is a previously-accepted
1690 * socket that has been marked as inactive, reject the connect request.
1691 */
1692 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1693 error = EOPNOTSUPP;
1694 if (so->so_flags & SOF_DEFUNCT) {
1695 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1696 "(%d)\n", __func__, proc_pid(p),
1697 proc_best_name(p),
1698 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1699 SOCK_DOM(so), SOCK_TYPE(so), error);
1700 }
1701 return (error);
1702 }
1703
1704 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1705 return (EPERM);
1706
1707 /*
1708 * If protocol is connection-based, can only connect once
1709 * unless PR_MULTICONN is set. Otherwise, if connected,
1710 * try to disconnect first. This allows user to disconnect
1711 * by connecting to, e.g., a null address.
1712 */
1713 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1714 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1715 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1716 (error = sodisconnectlocked(so)) != 0)) {
1717 error = EISCONN;
1718 } else {
1719 /*
1720 * Run connect filter before calling protocol:
1721 * - non-blocking connect returns before completion;
1722 */
1723 error = sflt_connectout(so, dst);
1724 if (error != 0) {
1725 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1726 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1727 if (error == EJUSTRETURN)
1728 error = 0;
1729 } else {
1730 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1731 (so, src, dst, p, ifscope, aid, pcid,
1732 flags, arg, arglen, auio, bytes_written);
1733 }
1734 }
1735
1736 return (error);
1737 }
1738
1739 int
1740 sodisconnectlocked(struct socket *so)
1741 {
1742 int error;
1743
1744 if ((so->so_state & SS_ISCONNECTED) == 0) {
1745 error = ENOTCONN;
1746 goto bad;
1747 }
1748 if (so->so_state & SS_ISDISCONNECTING) {
1749 error = EALREADY;
1750 goto bad;
1751 }
1752
1753 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1754 if (error == 0)
1755 sflt_notify(so, sock_evt_disconnected, NULL);
1756
1757 bad:
1758 return (error);
1759 }
1760
1761 /* Locking version */
1762 int
1763 sodisconnect(struct socket *so)
1764 {
1765 int error;
1766
1767 socket_lock(so, 1);
1768 error = sodisconnectlocked(so);
1769 socket_unlock(so, 1);
1770 return (error);
1771 }
1772
1773 int
1774 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1775 {
1776 int error;
1777
1778 /*
1779 * Call the protocol disconnectx handler; let it handle all
1780 * matters related to the connection state of this session.
1781 */
1782 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1783 if (error == 0) {
1784 /*
1785 * The event applies only for the session, not for
1786 * the disconnection of individual subflows.
1787 */
1788 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1789 sflt_notify(so, sock_evt_disconnected, NULL);
1790 }
1791 return (error);
1792 }
1793
1794 int
1795 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1796 {
1797 int error;
1798
1799 socket_lock(so, 1);
1800 error = sodisconnectxlocked(so, aid, cid);
1801 socket_unlock(so, 1);
1802 return (error);
1803 }
1804
1805 int
1806 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1807 {
1808 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1809 }
1810
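/* Pass SBL_WAIT to sblock() unless the caller asked for MSG_DONTWAIT */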
1811 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1812
1813 /*
1814 * sosendcheck will lock the socket buffer if it isn't locked and
1815 * verify that there is space for the data being inserted.
1816 *
1817 * Returns: 0 Success
1818 * EPIPE
1819 * sblock:EWOULDBLOCK
1820 * sblock:EINTR
1821 * sbwait:EBADF
1822 * sbwait:EINTR
1823 * [so_error]:???
1824 */
1825 int
1826 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1827 int32_t clen, int32_t atomic, int flags, int *sblocked,
1828 struct mbuf *control)
1829 {
1830 int error = 0;
1831 int32_t space;
1832 int assumelock = 0;
1833
1834 restart:
1835 if (*sblocked == 0) {
1836 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1837 so->so_send_filt_thread != 0 &&
1838 so->so_send_filt_thread == current_thread()) {
1839 /*
1840 * We're being called recursively from a filter,
1841 * allow this to continue. Radar 4150520.
1842 * Don't set sblocked because we don't want
1843 * to perform an unlock later.
1844 */
1845 assumelock = 1;
1846 } else {
1847 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1848 if (error) {
1849 if (so->so_flags & SOF_DEFUNCT)
1850 goto defunct;
1851 return (error);
1852 }
1853 *sblocked = 1;
1854 }
1855 }
1856
1857 /*
1858 * If a send attempt is made on a socket that has been marked
1859 * as inactive (disconnected), reject the request.
1860 */
1861 if (so->so_flags & SOF_DEFUNCT) {
1862 defunct:
1863 error = EPIPE;
1864 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1865 __func__, proc_selfpid(), proc_best_name(current_proc()),
1866 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1867 SOCK_DOM(so), SOCK_TYPE(so), error);
1868 return (error);
1869 }
1870
1871 if (so->so_state & SS_CANTSENDMORE) {
1872 #if CONTENT_FILTER
1873 /*
1874 * Can re-inject data of half closed connections
1875 */
1876 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1877 so->so_snd.sb_cfil_thread == current_thread() &&
1878 cfil_sock_data_pending(&so->so_snd) != 0)
1879 CFIL_LOG(LOG_INFO,
1880 "so %llx ignore SS_CANTSENDMORE",
1881 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1882 else
1883 #endif /* CONTENT_FILTER */
1884 return (EPIPE);
1885 }
1886 if (so->so_error) {
1887 error = so->so_error;
1888 so->so_error = 0;
1889 return (error);
1890 }
1891
1892 if ((so->so_state & SS_ISCONNECTED) == 0) {
1893 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1894 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1895 (resid != 0 || clen == 0) &&
1896 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1897 #if MPTCP
1898 /*
1899 * MPTCP Fast Join sends data before the
1900 * socket is truly connected.
1901 */
1902 if ((so->so_flags & (SOF_MP_SUBFLOW |
1903 SOF_MPTCP_FASTJOIN)) !=
1904 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1905 #endif /* MPTCP */
1906 return (ENOTCONN);
1907 }
1908 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1909 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1910 ENOTCONN : EDESTADDRREQ);
1911 }
1912 }
1913
1914 if (so->so_flags & SOF_ENABLE_MSGS)
1915 space = msgq_sbspace(so, control);
1916 else
1917 space = sbspace(&so->so_snd);
1918
1919 if (flags & MSG_OOB)
1920 space += 1024;
1921 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1922 clen > so->so_snd.sb_hiwat)
1923 return (EMSGSIZE);
1924
1925 if ((space < resid + clen &&
1926 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1927 space < clen)) ||
1928 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1929 /*
1930 * don't block the connectx call when there's more data
1931 * than can be copied.
1932 */
1933 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1934 if (space == 0) {
1935 return (EWOULDBLOCK);
1936 }
1937 if (space < (int32_t)so->so_snd.sb_lowat) {
1938 return (0);
1939 }
1940 }
1941 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1942 assumelock) {
1943 return (EWOULDBLOCK);
1944 }
1945 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1946 *sblocked = 0;
1947 error = sbwait(&so->so_snd);
1948 if (error) {
1949 if (so->so_flags & SOF_DEFUNCT)
1950 goto defunct;
1951 return (error);
1952 }
1953 goto restart;
1954 }
1955 return (0);
1956 }
1957
1958 /*
1959 * Send on a socket.
1960 * If send must go all at once and message is larger than
1961 * send buffering, then hard error.
1962 * Lock against other senders.
1963 * If must go all at once and not enough room now, then
1964 * inform user that this would block and do nothing.
1965 * Otherwise, if nonblocking, send as much as possible.
1966 * The data to be sent is described by "uio" if nonzero,
1967 * otherwise by the mbuf chain "top" (which must be null
1968 * if uio is not). Data provided in mbuf chain must be small
1969 * enough to send all at once.
1970 *
1971 * Returns nonzero on error, timeout or signal; callers
1972 * must check for short counts if EINTR/ERESTART are returned.
1973 * Data and control buffers are freed on return.
1974 * Experiment:
1975 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1976 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1977 * point at the mbuf chain being constructed and go from there.
1978 *
1979 * Returns: 0 Success
1980 * EOPNOTSUPP
1981 * EINVAL
1982 * ENOBUFS
1983 * uiomove:EFAULT
1984 * sosendcheck:EPIPE
1985 * sosendcheck:EWOULDBLOCK
1986 * sosendcheck:EINTR
1987 * sosendcheck:EBADF
1988 * sosendcheck:EINTR
1989 * sosendcheck:??? [value from so_error]
1990 * <pru_send>:ECONNRESET[TCP]
1991 * <pru_send>:EINVAL[TCP]
1992 * <pru_send>:ENOBUFS[TCP]
1993 * <pru_send>:EADDRINUSE[TCP]
1994 * <pru_send>:EADDRNOTAVAIL[TCP]
1995 * <pru_send>:EAFNOSUPPORT[TCP]
1996 * <pru_send>:EACCES[TCP]
1997 * <pru_send>:EAGAIN[TCP]
1998 * <pru_send>:EPERM[TCP]
1999 * <pru_send>:EMSGSIZE[TCP]
2000 * <pru_send>:EHOSTUNREACH[TCP]
2001 * <pru_send>:ENETUNREACH[TCP]
2002 * <pru_send>:ENETDOWN[TCP]
2003 * <pru_send>:ENOMEM[TCP]
2004 * <pru_send>:ENOBUFS[TCP]
2005 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2006 * <pru_send>:EINVAL[AF_UNIX]
2007 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2008 * <pru_send>:EPIPE[AF_UNIX]
2009 * <pru_send>:ENOTCONN[AF_UNIX]
2010 * <pru_send>:EISCONN[AF_UNIX]
2011 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2012 * <sf_data_out>:??? [whatever a filter author chooses]
2013 *
2014 * Notes: Other <pru_send> returns depend on the protocol family; all
2015 * <sf_data_out> returns depend on what the filter author causes
2016 * their filter to return.
2017 */
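/*
 * Illustrative calling conventions (a sketch, not an exhaustive list):
 *
 *	-- user data described by a uio, e.g. the sendto(2)/sendmsg(2)
 *	-- path; "top" must be NULL:
 *	error = sosend(so, to, uio, NULL, control, flags);
 *
 *	-- data prepackaged as an mbuf chain by a kernel caller; "top"
 *	-- must carry a valid pkthdr length and "uio" must be NULL:
 *	error = sosend(so, NULL, NULL, top, NULL, 0);
 */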
2018 int
2019 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2020 struct mbuf *top, struct mbuf *control, int flags)
2021 {
2022 struct mbuf **mp;
2023 struct mbuf *m, *freelist = NULL;
2024 user_ssize_t space, len, resid, orig_resid;
2025 int clen = 0, error, dontroute, mlen, sendflags;
2026 int atomic = sosendallatonce(so) || top;
2027 int sblocked = 0;
2028 struct proc *p = current_proc();
2029 struct mbuf *control_copy = NULL;
2030 uint16_t headroom = 0;
2031 boolean_t en_tracing = FALSE;
2032
2033 if (uio != NULL)
2034 resid = uio_resid(uio);
2035 else
2036 resid = top->m_pkthdr.len;
2037
2038 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2039 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2040
2041 socket_lock(so, 1);
2042
2043 /*
2044 	 * Trace only if tracing is enabled, for network (vs. unix)
2045 	 * sockets, and only for non-loopback traffic.
2046 */
2047 if (ENTR_SHOULDTRACE &&
2048 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2049 struct inpcb *inp = sotoinpcb(so);
2050 if (inp->inp_last_outifp != NULL &&
2051 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2052 en_tracing = TRUE;
2053 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2054 VM_KERNEL_ADDRPERM(so),
2055 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2056 (int64_t)resid);
2057 orig_resid = resid;
2058 }
2059 }
2060
2061 /*
2062 * Re-injection should not affect process accounting
2063 */
2064 if ((flags & MSG_SKIPCFIL) == 0) {
2065 so_update_last_owner_locked(so, p);
2066 so_update_policy(so);
2067
2068 #if NECP
2069 so_update_necp_policy(so, NULL, addr);
2070 #endif /* NECP */
2071 }
2072
2073 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2074 error = EOPNOTSUPP;
2075 socket_unlock(so, 1);
2076 goto out;
2077 }
2078
2079 /*
2080 * In theory resid should be unsigned.
2081 * However, space must be signed, as it might be less than 0
2082 * if we over-committed, and we must use a signed comparison
2083 * of space and resid. On the other hand, a negative resid
2084 * causes us to loop sending 0-length segments to the protocol.
2085 *
2086 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2087 * But it will be used by sockets doing message delivery.
2088 *
2089 * Note: We limit resid to be a positive int value as we use
2090 * imin() to set bytes_to_copy -- radr://14558484
2091 */
2092 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2093 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2094 error = EINVAL;
2095 socket_unlock(so, 1);
2096 goto out;
2097 }
2098
2099 dontroute = (flags & MSG_DONTROUTE) &&
2100 (so->so_options & SO_DONTROUTE) == 0 &&
2101 (so->so_proto->pr_flags & PR_ATOMIC);
2102 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2103
2104 if (control != NULL)
2105 clen = control->m_len;
2106
2107 if (soreserveheadroom != 0)
2108 headroom = so->so_pktheadroom;
2109
2110 do {
2111 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2112 &sblocked, control);
2113 if (error)
2114 goto release;
2115
2116 mp = &top;
2117 if (so->so_flags & SOF_ENABLE_MSGS)
2118 space = msgq_sbspace(so, control);
2119 else
2120 space = sbspace(&so->so_snd) - clen;
2121 space += ((flags & MSG_OOB) ? 1024 : 0);
2122
2123 do {
2124 if (uio == NULL) {
2125 /*
2126 * Data is prepackaged in "top".
2127 */
2128 resid = 0;
2129 if (flags & MSG_EOR)
2130 top->m_flags |= M_EOR;
2131 } else {
2132 int chainlength;
2133 int bytes_to_copy;
2134 boolean_t jumbocl;
2135 boolean_t bigcl;
2136 int bytes_to_alloc;
2137
2138 bytes_to_copy = imin(resid, space);
2139
2140 bytes_to_alloc = bytes_to_copy;
2141 if (top == NULL)
2142 bytes_to_alloc += headroom;
2143
2144 if (sosendminchain > 0)
2145 chainlength = 0;
2146 else
2147 chainlength = sosendmaxchain;
2148
2149 /*
2150 				 * Use big 4 KB clusters when the outgoing interface
2151 * does not prefer 2 KB clusters
2152 */
2153 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2154 sosendbigcl_ignore_capab;
2155
2156 /*
2157 * Attempt to use larger than system page-size
2158 * clusters for large writes only if there is
2159 * a jumbo cluster pool and if the socket is
2160 * marked accordingly.
2161 */
2162 jumbocl = sosendjcl && njcl > 0 &&
2163 ((so->so_flags & SOF_MULTIPAGES) ||
2164 sosendjcl_ignore_capab) &&
2165 bigcl;
2166
2167 socket_unlock(so, 0);
2168
2169 do {
2170 int num_needed;
2171 int hdrs_needed = (top == NULL) ? 1 : 0;
2172
2173 					/*
2174 					 * Try to maintain a local cache of mbuf
2175 					 * clusters needed to complete this
2176 					 * write.  The list is further limited to
2177 					 * the number that are currently needed
2178 					 * to fill the socket.  This mechanism
2179 					 * allows a large number of mbufs/
2180 					 * clusters to be grabbed under a single
2181 					 * mbuf lock... if we can't get any
2182 					 * clusters, then fall back to trying
2183 					 * for mbufs.  If we fail early (or
2184 					 * miscalculate the number needed), make
2185 					 * sure to release any clusters we
2186 					 * haven't yet consumed.
2187 					 */
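					/*
					 * Worked example (illustrative): for a
					 * 64 KB write on a socket marked
					 * SOF_MULTIPAGES with jumbo clusters
					 * available, bytes_to_alloc = 65536,
					 * num_needed = 65536 / M16KCLBYTES = 4
					 * with a zero remainder (< MINCLSIZE),
					 * so four 16 KB clusters are requested
					 * in one m_getpackets_internal() call.
					 */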
2188 if (freelist == NULL &&
2189 bytes_to_alloc > MBIGCLBYTES &&
2190 jumbocl) {
2191 num_needed =
2192 bytes_to_alloc / M16KCLBYTES;
2193
2194 if ((bytes_to_alloc -
2195 (num_needed * M16KCLBYTES))
2196 >= MINCLSIZE)
2197 num_needed++;
2198
2199 freelist =
2200 m_getpackets_internal(
2201 (unsigned int *)&num_needed,
2202 hdrs_needed, M_WAIT, 0,
2203 M16KCLBYTES);
2204 /*
2205 * Fall back to 4K cluster size
2206 * if allocation failed
2207 */
2208 }
2209
2210 if (freelist == NULL &&
2211 bytes_to_alloc > MCLBYTES &&
2212 bigcl) {
2213 num_needed =
2214 bytes_to_alloc / MBIGCLBYTES;
2215
2216 if ((bytes_to_alloc -
2217 (num_needed * MBIGCLBYTES)) >=
2218 MINCLSIZE)
2219 num_needed++;
2220
2221 freelist =
2222 m_getpackets_internal(
2223 (unsigned int *)&num_needed,
2224 hdrs_needed, M_WAIT, 0,
2225 MBIGCLBYTES);
2226 /*
2227 * Fall back to cluster size
2228 * if allocation failed
2229 */
2230 }
2231
2232 /*
2233 					 * Allocate a cluster as we want to
2234 					 * avoid splitting the data across more
2235 					 * than one segment; using MINCLSIZE
2236 					 * would lead us to allocate two mbufs.
2237 */
2238 if (soreserveheadroom != 0 &&
2239 freelist == NULL &&
2240 ((top == NULL &&
2241 bytes_to_alloc > _MHLEN) ||
2242 bytes_to_alloc > _MLEN)) {
2243 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2244 MCLBYTES;
2245 freelist =
2246 m_getpackets_internal(
2247 (unsigned int *)&num_needed,
2248 hdrs_needed, M_WAIT, 0,
2249 MCLBYTES);
2250 /*
2251 * Fall back to a single mbuf
2252 * if allocation failed
2253 */
2254 } else if (freelist == NULL &&
2255 bytes_to_alloc > MINCLSIZE) {
2256 num_needed =
2257 bytes_to_alloc / MCLBYTES;
2258
2259 if ((bytes_to_alloc -
2260 (num_needed * MCLBYTES)) >=
2261 MINCLSIZE)
2262 num_needed++;
2263
2264 freelist =
2265 m_getpackets_internal(
2266 (unsigned int *)&num_needed,
2267 hdrs_needed, M_WAIT, 0,
2268 MCLBYTES);
2269 /*
2270 * Fall back to a single mbuf
2271 * if allocation failed
2272 */
2273 }
2274 /*
2275 * For datagram protocols, leave
2276 * headroom for protocol headers
2277 * in the first cluster of the chain
2278 */
2279 if (freelist != NULL && atomic &&
2280 top == NULL && headroom > 0) {
2281 freelist->m_data += headroom;
2282 }
2283
2284 /*
2285 * Fall back to regular mbufs without
2286 * reserving the socket headroom
2287 */
2288 if (freelist == NULL) {
2289 if (top == NULL)
2290 MGETHDR(freelist,
2291 M_WAIT, MT_DATA);
2292 else
2293 MGET(freelist,
2294 M_WAIT, MT_DATA);
2295
2296 if (freelist == NULL) {
2297 error = ENOBUFS;
2298 socket_lock(so, 0);
2299 goto release;
2300 }
2301 /*
2302 * For datagram protocols,
2303 * leave room for protocol
2304 * headers in first mbuf.
2305 */
2306 if (atomic && top == NULL &&
2307 bytes_to_copy < MHLEN) {
2308 MH_ALIGN(freelist,
2309 bytes_to_copy);
2310 }
2311 }
2312 m = freelist;
2313 freelist = m->m_next;
2314 m->m_next = NULL;
2315
2316 if ((m->m_flags & M_EXT))
2317 mlen = m->m_ext.ext_size -
2318 m_leadingspace(m);
2319 else if ((m->m_flags & M_PKTHDR))
2320 mlen =
2321 MHLEN - m_leadingspace(m);
2322 else
2323 mlen = MLEN - m_leadingspace(m);
2324 len = imin(mlen, bytes_to_copy);
2325
2326 chainlength += len;
2327
2328 space -= len;
2329
2330 error = uiomove(mtod(m, caddr_t),
2331 len, uio);
2332
2333 resid = uio_resid(uio);
2334
2335 m->m_len = len;
2336 *mp = m;
2337 top->m_pkthdr.len += len;
2338 if (error)
2339 break;
2340 mp = &m->m_next;
2341 if (resid <= 0) {
2342 if (flags & MSG_EOR)
2343 top->m_flags |= M_EOR;
2344 break;
2345 }
2346 bytes_to_copy = min(resid, space);
2347
2348 } while (space > 0 &&
2349 (chainlength < sosendmaxchain || atomic ||
2350 resid < MINCLSIZE));
2351
2352 socket_lock(so, 0);
2353
2354 if (error)
2355 goto release;
2356 }
2357
2358 if (flags & (MSG_HOLD|MSG_SEND)) {
2359 /* Enqueue for later, go away if HOLD */
2360 struct mbuf *mb1;
2361 if (so->so_temp && (flags & MSG_FLUSH)) {
2362 m_freem(so->so_temp);
2363 so->so_temp = NULL;
2364 }
2365 if (so->so_temp)
2366 so->so_tail->m_next = top;
2367 else
2368 so->so_temp = top;
2369 mb1 = top;
2370 while (mb1->m_next)
2371 mb1 = mb1->m_next;
2372 so->so_tail = mb1;
2373 if (flags & MSG_HOLD) {
2374 top = NULL;
2375 goto release;
2376 }
2377 top = so->so_temp;
2378 }
2379 if (dontroute)
2380 so->so_options |= SO_DONTROUTE;
2381
2382 /*
2383 * Compute flags here, for pru_send and NKEs
2384 *
2385 			 * If the user set MSG_EOF, the protocol
2386 			 * understands this flag, and there is nothing left
2387 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2388 */
2389 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2390 ((flags & MSG_EOF) &&
2391 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2392 (resid <= 0)) ? PRUS_EOF :
2393 /* If there is more to send set PRUS_MORETOCOME */
2394 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2395
2396 if ((flags & MSG_SKIPCFIL) == 0) {
2397 /*
2398 * Socket filter processing
2399 */
2400 error = sflt_data_out(so, addr, &top,
2401 &control, (sendflags & MSG_OOB) ?
2402 sock_data_filt_flag_oob : 0);
2403 if (error) {
2404 if (error == EJUSTRETURN) {
2405 error = 0;
2406 clen = 0;
2407 control = NULL;
2408 top = NULL;
2409 }
2410 goto release;
2411 }
2412 #if CONTENT_FILTER
2413 /*
2414 * Content filter processing
2415 */
2416 error = cfil_sock_data_out(so, addr, top,
2417 control, (sendflags & MSG_OOB) ?
2418 sock_data_filt_flag_oob : 0);
2419 if (error) {
2420 if (error == EJUSTRETURN) {
2421 error = 0;
2422 clen = 0;
2423 control = NULL;
2424 top = NULL;
2425 }
2426 goto release;
2427 }
2428 #endif /* CONTENT_FILTER */
2429 }
2430 if (so->so_flags & SOF_ENABLE_MSGS) {
2431 /*
2432 * Make a copy of control mbuf,
2433 * so that msg priority can be
2434 * passed to subsequent mbufs.
2435 */
2436 control_copy = m_dup(control, M_NOWAIT);
2437 }
2438 error = (*so->so_proto->pr_usrreqs->pru_send)
2439 (so, sendflags, top, addr, control, p);
2440
2441 if (flags & MSG_SEND)
2442 so->so_temp = NULL;
2443
2444 if (dontroute)
2445 so->so_options &= ~SO_DONTROUTE;
2446
2447 clen = 0;
2448 control = control_copy;
2449 control_copy = NULL;
2450 top = NULL;
2451 mp = &top;
2452 if (error)
2453 goto release;
2454 } while (resid && space > 0);
2455 } while (resid);
2456
2457 release:
2458 if (sblocked)
2459 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2460 else
2461 socket_unlock(so, 1);
2462 out:
2463 if (top != NULL)
2464 m_freem(top);
2465 if (control != NULL)
2466 m_freem(control);
2467 if (freelist != NULL)
2468 m_freem_list(freelist);
2469 if (control_copy != NULL)
2470 m_freem(control_copy);
2471
2472 /*
2473 * One write has been done. This was enough. Get back to "normal"
2474 * behavior.
2475 */
2476 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2477 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2478
2479 if (en_tracing) {
2480 /* resid passed here is the bytes left in uio */
2481 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2482 VM_KERNEL_ADDRPERM(so),
2483 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2484 (int64_t)(orig_resid - resid));
2485 }
2486 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2487 so->so_snd.sb_cc, space, error);
2488
2489 return (error);
2490 }
2491
2492 /*
2493  * Supports only connected sockets (no address) without ancillary data
2494  * (control mbuf), for atomic protocols
2495 */
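/*
 * For example, calling this on a SOCK_STREAM socket, or with flags other
 * than MSG_DONTWAIT/MSG_NBIO, fails with EINVAL before any data is
 * touched, and a protocol without a pru_send_list handler gets
 * EPROTONOSUPPORT.
 */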
2496 int
2497 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2498 {
2499 struct mbuf *m, *freelist = NULL;
2500 user_ssize_t len, resid;
2501 int error, dontroute, mlen;
2502 int atomic = sosendallatonce(so);
2503 int sblocked = 0;
2504 struct proc *p = current_proc();
2505 u_int uiofirst = 0;
2506 u_int uiolast = 0;
2507 struct mbuf *top = NULL;
2508 uint16_t headroom = 0;
2509 boolean_t bigcl;
2510
2511 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2512 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2513
2514 if (so->so_type != SOCK_DGRAM) {
2515 error = EINVAL;
2516 goto out;
2517 }
2518 if (atomic == 0) {
2519 error = EINVAL;
2520 goto out;
2521 }
2522 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2523 error = EPROTONOSUPPORT;
2524 goto out;
2525 }
2526 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2527 error = EINVAL;
2528 goto out;
2529 }
2530 resid = uio_array_resid(uioarray, uiocnt);
2531
2532 /*
2533 * In theory resid should be unsigned.
2534 * However, space must be signed, as it might be less than 0
2535 * if we over-committed, and we must use a signed comparison
2536 * of space and resid. On the other hand, a negative resid
2537 * causes us to loop sending 0-length segments to the protocol.
2538 *
2539 * Note: We limit resid to be a positive int value as we use
2540 * imin() to set bytes_to_copy -- radr://14558484
2541 */
2542 if (resid < 0 || resid > INT_MAX) {
2543 error = EINVAL;
2544 goto out;
2545 }
2546
2547 socket_lock(so, 1);
2548 so_update_last_owner_locked(so, p);
2549 so_update_policy(so);
2550
2551 #if NECP
2552 so_update_necp_policy(so, NULL, NULL);
2553 #endif /* NECP */
2554
2555 dontroute = (flags & MSG_DONTROUTE) &&
2556 (so->so_options & SO_DONTROUTE) == 0 &&
2557 (so->so_proto->pr_flags & PR_ATOMIC);
2558 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2559
2560 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2561 &sblocked, NULL);
2562 if (error)
2563 goto release;
2564
2565 /*
2566 * Use big 4 KB clusters when the outgoing interface does not prefer
2567 * 2 KB clusters
2568 */
2569 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2570
2571 if (soreserveheadroom != 0)
2572 headroom = so->so_pktheadroom;
2573
2574 do {
2575 int i;
2576 int num_needed = 0;
2577 int chainlength;
2578 size_t maxpktlen = 0;
2579 int bytes_to_alloc;
2580
2581 if (sosendminchain > 0)
2582 chainlength = 0;
2583 else
2584 chainlength = sosendmaxchain;
2585
2586 socket_unlock(so, 0);
2587
2588 /*
2589 		 * Find a set of uios that fit in a reasonable number
2590 * of mbuf packets
2591 */
2592 for (i = uiofirst; i < uiocnt; i++) {
2593 struct uio *auio = uioarray[i];
2594
2595 len = uio_resid(auio);
2596
2597 /* Do nothing for empty messages */
2598 if (len == 0)
2599 continue;
2600
2601 num_needed += 1;
2602 uiolast += 1;
2603
2604 if (len > maxpktlen)
2605 maxpktlen = len;
2606
2607 chainlength += len;
2608 if (chainlength > sosendmaxchain)
2609 break;
2610 }
2611 /*
2612 * Nothing left to send
2613 */
2614 if (num_needed == 0) {
2615 socket_lock(so, 0);
2616 break;
2617 }
2618 /*
2619 		 * Allocate a buffer large enough to include headroom space for
2620 		 * the network and link headers
2621 *
2622 */
2623 bytes_to_alloc = maxpktlen + headroom;
2624
2625 /*
2626 * Allocate a single contiguous buffer of the smallest available
2627 * size when possible
2628 */
2629 if (bytes_to_alloc > MCLBYTES &&
2630 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2631 freelist = m_getpackets_internal(
2632 (unsigned int *)&num_needed,
2633 num_needed, M_WAIT, 1,
2634 MBIGCLBYTES);
2635 } else if (bytes_to_alloc > _MHLEN &&
2636 bytes_to_alloc <= MCLBYTES) {
2637 freelist = m_getpackets_internal(
2638 (unsigned int *)&num_needed,
2639 num_needed, M_WAIT, 1,
2640 MCLBYTES);
2641 } else {
2642 freelist = m_allocpacket_internal(
2643 (unsigned int *)&num_needed,
2644 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2645 }
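		/*
		 * Worked example (illustrative): with so_pktheadroom = 16 and
		 * a largest datagram of 1400 bytes, bytes_to_alloc = 1416,
		 * which is larger than _MHLEN but no larger than MCLBYTES, so
		 * each packet in the batch gets a single 2 KB cluster.
		 */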
2646
2647 if (freelist == NULL) {
2648 socket_lock(so, 0);
2649 error = ENOMEM;
2650 goto release;
2651 }
2652 /*
2653 * Copy each uio of the set into its own mbuf packet
2654 */
2655 for (i = uiofirst, m = freelist;
2656 i < uiolast && m != NULL;
2657 i++) {
2658 int bytes_to_copy;
2659 struct mbuf *n;
2660 struct uio *auio = uioarray[i];
2661
2662 bytes_to_copy = uio_resid(auio);
2663
2664 /* Do nothing for empty messages */
2665 if (bytes_to_copy == 0)
2666 continue;
2667 /*
2668 * Leave headroom for protocol headers
2669 * in the first mbuf of the chain
2670 */
2671 m->m_data += headroom;
2672
2673 for (n = m; n != NULL; n = n->m_next) {
2674 				if ((n->m_flags & M_EXT))
2675 					mlen = n->m_ext.ext_size -
2676 					    m_leadingspace(n);
2677 				else if ((n->m_flags & M_PKTHDR))
2678 					mlen =
2679 					    MHLEN - m_leadingspace(n);
2680 				else
2681 					mlen = MLEN - m_leadingspace(n);
2682 len = imin(mlen, bytes_to_copy);
2683
2684 /*
2685 * Note: uiomove() decrements the iovec
2686 * length
2687 */
2688 error = uiomove(mtod(n, caddr_t),
2689 len, auio);
2690 if (error != 0)
2691 break;
2692 n->m_len = len;
2693 m->m_pkthdr.len += len;
2694
2695 VERIFY(m->m_pkthdr.len <= maxpktlen);
2696
2697 bytes_to_copy -= len;
2698 resid -= len;
2699 }
2700 if (m->m_pkthdr.len == 0) {
2701 printf(
2702 "%s:%d so %llx pkt %llx type %u len null\n",
2703 __func__, __LINE__,
2704 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2705 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2706 m->m_type);
2707 }
2708 if (error != 0)
2709 break;
2710 m = m->m_nextpkt;
2711 }
2712
2713 socket_lock(so, 0);
2714
2715 if (error)
2716 goto release;
2717 top = freelist;
2718 freelist = NULL;
2719
2720 if (dontroute)
2721 so->so_options |= SO_DONTROUTE;
2722
2723 if ((flags & MSG_SKIPCFIL) == 0) {
2724 struct mbuf **prevnextp = NULL;
2725
2726 for (i = uiofirst, m = top;
2727 i < uiolast && m != NULL;
2728 i++) {
2729 struct mbuf *nextpkt = m->m_nextpkt;
2730
2731 /*
2732 * Socket filter processing
2733 */
2734 error = sflt_data_out(so, NULL, &m,
2735 NULL, 0);
2736 if (error != 0 && error != EJUSTRETURN)
2737 goto release;
2738
2739 #if CONTENT_FILTER
2740 if (error == 0) {
2741 /*
2742 * Content filter processing
2743 */
2744 error = cfil_sock_data_out(so, NULL, m,
2745 NULL, 0);
2746 if (error != 0 && error != EJUSTRETURN)
2747 goto release;
2748 }
2749 #endif /* CONTENT_FILTER */
2750 /*
2751 * Remove packet from the list when
2752 * swallowed by a filter
2753 */
2754 if (error == EJUSTRETURN) {
2755 error = 0;
2756 if (prevnextp != NULL)
2757 *prevnextp = nextpkt;
2758 else
2759 top = nextpkt;
2760 }
2761
2762 m = nextpkt;
2763 if (m != NULL)
2764 prevnextp = &m->m_nextpkt;
2765 }
2766 }
2767 if (top != NULL)
2768 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2769 (so, 0, top, NULL, NULL, p);
2770
2771 if (dontroute)
2772 so->so_options &= ~SO_DONTROUTE;
2773
2774 top = NULL;
2775 uiofirst = uiolast;
2776 } while (resid > 0 && error == 0);
2777 release:
2778 if (sblocked)
2779 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2780 else
2781 socket_unlock(so, 1);
2782 out:
2783 if (top != NULL)
2784 m_freem(top);
2785 if (freelist != NULL)
2786 m_freem_list(freelist);
2787
2788 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2789 so->so_snd.sb_cc, 0, error);
2790
2791 return (error);
2792 }
2793
2794 /*
2795 * May return ERESTART when packet is dropped by MAC policy check
2796 */
2797 static int
2798 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2799 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2800 {
2801 int error = 0;
2802 struct mbuf *m = *mp;
2803 struct mbuf *nextrecord = *nextrecordp;
2804
2805 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2806 #if CONFIG_MACF_SOCKET_SUBSET
2807 /*
2808 * Call the MAC framework for policy checking if we're in
2809 * the user process context and the socket isn't connected.
2810 */
2811 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2812 struct mbuf *m0 = m;
2813 /*
2814 * Dequeue this record (temporarily) from the receive
2815 * list since we're about to drop the socket's lock
2816 * where a new record may arrive and be appended to
2817 * the list. Upon MAC policy failure, the record
2818 * will be freed. Otherwise, we'll add it back to
2819 * the head of the list. We cannot rely on SB_LOCK
2820 * because append operation uses the socket's lock.
2821 */
2822 do {
2823 m->m_nextpkt = NULL;
2824 sbfree(&so->so_rcv, m);
2825 m = m->m_next;
2826 } while (m != NULL);
2827 m = m0;
2828 so->so_rcv.sb_mb = nextrecord;
2829 SB_EMPTY_FIXUP(&so->so_rcv);
2830 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2831 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2832 socket_unlock(so, 0);
2833
2834 if (mac_socket_check_received(proc_ucred(p), so,
2835 mtod(m, struct sockaddr *)) != 0) {
2836 /*
2837 * MAC policy failure; free this record and
2838 * process the next record (or block until
2839 * one is available). We have adjusted sb_cc
2840 * and sb_mbcnt above so there is no need to
2841 * call sbfree() again.
2842 */
2843 m_freem(m);
2844 /*
2845 * Clear SB_LOCK but don't unlock the socket.
2846 * Process the next record or wait for one.
2847 */
2848 socket_lock(so, 0);
2849 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2850 error = ERESTART;
2851 goto done;
2852 }
2853 socket_lock(so, 0);
2854 /*
2855 * If the socket has been defunct'd, drop it.
2856 */
2857 if (so->so_flags & SOF_DEFUNCT) {
2858 m_freem(m);
2859 error = ENOTCONN;
2860 goto done;
2861 }
2862 /*
2863 * Re-adjust the socket receive list and re-enqueue
2864 * the record in front of any packets which may have
2865 * been appended while we dropped the lock.
2866 */
2867 for (m = m0; m->m_next != NULL; m = m->m_next)
2868 sballoc(&so->so_rcv, m);
2869 sballoc(&so->so_rcv, m);
2870 if (so->so_rcv.sb_mb == NULL) {
2871 so->so_rcv.sb_lastrecord = m0;
2872 so->so_rcv.sb_mbtail = m;
2873 }
2874 m = m0;
2875 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2876 so->so_rcv.sb_mb = m;
2877 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2878 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2879 }
2880 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2881 if (psa != NULL) {
2882 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2883 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2884 error = EWOULDBLOCK;
2885 goto done;
2886 }
2887 }
2888 if (flags & MSG_PEEK) {
2889 m = m->m_next;
2890 } else {
2891 sbfree(&so->so_rcv, m);
2892 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2893 panic("%s: about to create invalid socketbuf",
2894 __func__);
2895 /* NOTREACHED */
2896 }
2897 MFREE(m, so->so_rcv.sb_mb);
2898 m = so->so_rcv.sb_mb;
2899 if (m != NULL) {
2900 m->m_nextpkt = nextrecord;
2901 } else {
2902 so->so_rcv.sb_mb = nextrecord;
2903 SB_EMPTY_FIXUP(&so->so_rcv);
2904 }
2905 }
2906 done:
2907 *mp = m;
2908 *nextrecordp = nextrecord;
2909
2910 return (error);
2911 }
2912
2913 /*
2914 * Process one or more MT_CONTROL mbufs present before any data mbufs
2915 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2916 * just copy the data; if !MSG_PEEK, we call into the protocol to
2917 * perform externalization.
2918 */
2919 static int
2920 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2921 struct mbuf **mp, struct mbuf **nextrecordp)
2922 {
2923 int error = 0;
2924 struct mbuf *cm = NULL, *cmn;
2925 struct mbuf **cme = &cm;
2926 struct sockbuf *sb_rcv = &so->so_rcv;
2927 struct mbuf **msgpcm = NULL;
2928 struct mbuf *m = *mp;
2929 struct mbuf *nextrecord = *nextrecordp;
2930 struct protosw *pr = so->so_proto;
2931
2932 /*
2933 * Externalizing the control messages would require us to
2934 * drop the socket's lock below. Once we re-acquire the
2935 * lock, the mbuf chain might change. In order to preserve
2936 * consistency, we unlink all control messages from the
2937 * first mbuf chain in one shot and link them separately
2938 * onto a different chain.
2939 */
2940 do {
2941 if (flags & MSG_PEEK) {
2942 if (controlp != NULL) {
2943 if (*controlp == NULL) {
2944 msgpcm = controlp;
2945 }
2946 *controlp = m_copy(m, 0, m->m_len);
2947
2948 /*
2949 * If we failed to allocate an mbuf,
2950 * release any previously allocated
2951 * mbufs for control data. Return
2952 * an error. Keep the mbufs in the
2953 * socket as this is using
2954 * MSG_PEEK flag.
2955 */
2956 if (*controlp == NULL) {
2957 m_freem(*msgpcm);
2958 error = ENOBUFS;
2959 goto done;
2960 }
2961 controlp = &(*controlp)->m_next;
2962 }
2963 m = m->m_next;
2964 } else {
2965 m->m_nextpkt = NULL;
2966 sbfree(sb_rcv, m);
2967 sb_rcv->sb_mb = m->m_next;
2968 m->m_next = NULL;
2969 *cme = m;
2970 cme = &(*cme)->m_next;
2971 m = sb_rcv->sb_mb;
2972 }
2973 } while (m != NULL && m->m_type == MT_CONTROL);
2974
2975 if (!(flags & MSG_PEEK)) {
2976 if (sb_rcv->sb_mb != NULL) {
2977 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2978 } else {
2979 sb_rcv->sb_mb = nextrecord;
2980 SB_EMPTY_FIXUP(sb_rcv);
2981 }
2982 if (nextrecord == NULL)
2983 sb_rcv->sb_lastrecord = m;
2984 }
2985
2986 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2987 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2988
2989 while (cm != NULL) {
2990 int cmsg_type;
2991
2992 cmn = cm->m_next;
2993 cm->m_next = NULL;
2994 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2995
2996 /*
2997 * Call the protocol to externalize SCM_RIGHTS message
2998 * and return the modified message to the caller upon
2999 * success. Otherwise, all other control messages are
3000 * returned unmodified to the caller. Note that we
3001 * only get into this loop if MSG_PEEK is not set.
3002 */
3003 if (pr->pr_domain->dom_externalize != NULL &&
3004 cmsg_type == SCM_RIGHTS) {
3005 /*
3006 * Release socket lock: see 3903171. This
3007 * would also allow more records to be appended
3008 * to the socket buffer. We still have SB_LOCK
3009 * set on it, so we can be sure that the head
3010 * of the mbuf chain won't change.
3011 */
3012 socket_unlock(so, 0);
3013 error = (*pr->pr_domain->dom_externalize)(cm);
3014 socket_lock(so, 0);
3015 } else {
3016 error = 0;
3017 }
3018
3019 if (controlp != NULL && error == 0) {
3020 *controlp = cm;
3021 controlp = &(*controlp)->m_next;
3022 } else {
3023 (void) m_free(cm);
3024 }
3025 cm = cmn;
3026 }
3027 /*
3028 * Update the value of nextrecord in case we received new
3029 * records when the socket was unlocked above for
3030 * externalizing SCM_RIGHTS.
3031 */
3032 if (m != NULL)
3033 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3034 else
3035 nextrecord = sb_rcv->sb_mb;
3036
3037 done:
3038 *mp = m;
3039 *nextrecordp = nextrecord;
3040
3041 return (error);
3042 }
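/*
 * From user space, an externalized SCM_RIGHTS message shows up as received
 * file descriptors in the control buffer of recvmsg(2).  A minimal sketch
 * (assuming a single descriptor was sent):
 *
 *	struct msghdr msg;
 *	struct cmsghdr *cmh;
 *	char cbuf[CMSG_SPACE(sizeof (int))];
 *	int fd;
 *
 *	-- fill in msg_iov/msg_iovlen (and zero the rest), then:
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0) {
 *		cmh = CMSG_FIRSTHDR(&msg);
 *		if (cmh != NULL && cmh->cmsg_level == SOL_SOCKET &&
 *		    cmh->cmsg_type == SCM_RIGHTS)
 *			memcpy(&fd, CMSG_DATA(cmh), sizeof (fd));
 *	}
 */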
3043
3044 /*
3045 * Implement receive operations on a socket.
3046 * We depend on the way that records are added to the sockbuf
3047 * by sbappend*. In particular, each record (mbufs linked through m_next)
3048 * must begin with an address if the protocol so specifies,
3049 * followed by an optional mbuf or mbufs containing ancillary data,
3050 * and then zero or more mbufs of data.
3051 * In order to avoid blocking network interrupts for the entire time here,
3052 * we splx() while doing the actual copy to user space.
3053 * Although the sockbuf is locked, new data may still be appended,
3054 * and thus we must maintain consistency of the sockbuf during that time.
3055 *
3056 * The caller may receive the data as a single mbuf chain by supplying
3057 * an mbuf **mp0 for use in returning the chain. The uio is then used
3058 * only for the count in uio_resid.
3059 *
3060 * Returns: 0 Success
3061 * ENOBUFS
3062 * ENOTCONN
3063 * EWOULDBLOCK
3064 * uiomove:EFAULT
3065 * sblock:EWOULDBLOCK
3066 * sblock:EINTR
3067 * sbwait:EBADF
3068 * sbwait:EINTR
3069 * sodelayed_copy:EFAULT
3070 * <pru_rcvoob>:EINVAL[TCP]
3071 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3072 * <pru_rcvoob>:???
3073 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3074 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3075 * <pr_domain->dom_externalize>:???
3076 *
3077 * Notes: Additional return values from calls through <pru_rcvoob> and
3078 * <pr_domain->dom_externalize> depend on protocols other than
3079 * TCP or AF_UNIX, which are documented above.
3080  * TCP or AF_UNIX, which are documented above.
 */
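/*
 * Illustrative in-kernel caller (a sketch; assumes "auio" was set up
 * elsewhere so that uio_resid(auio) is the number of bytes wanted):
 *
 *	struct mbuf *m = NULL;
 *	int flags = MSG_DONTWAIT;
 *
 *	error = soreceive(so, NULL, auio, &m, NULL, &flags);
 *	-- on success the data comes back as the mbuf chain "m"; only the
 *	-- residual count of "auio" is consulted, per the note above.
 */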
3081 int
3082 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3083 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3084 {
3085 struct mbuf *m, **mp, *ml = NULL;
3086 struct mbuf *nextrecord, *free_list;
3087 int flags, error, offset;
3088 user_ssize_t len;
3089 struct protosw *pr = so->so_proto;
3090 int moff, type = 0;
3091 user_ssize_t orig_resid = uio_resid(uio);
3092 user_ssize_t delayed_copy_len;
3093 int can_delay;
3094 int need_event;
3095 struct proc *p = current_proc();
3096 boolean_t en_tracing = FALSE;
3097
3098 /*
3099 * Sanity check on the length passed by caller as we are making 'int'
3100 * comparisons
3101 */
3102 if (orig_resid < 0 || orig_resid > INT_MAX)
3103 return (EINVAL);
3104
3105 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3106 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3107 so->so_rcv.sb_hiwat);
3108
3109 socket_lock(so, 1);
3110 so_update_last_owner_locked(so, p);
3111 so_update_policy(so);
3112
3113 #ifdef MORE_LOCKING_DEBUG
3114 if (so->so_usecount == 1) {
3115 panic("%s: so=%x no other reference on socket\n", __func__, so);
3116 /* NOTREACHED */
3117 }
3118 #endif
3119 mp = mp0;
3120 if (psa != NULL)
3121 *psa = NULL;
3122 if (controlp != NULL)
3123 *controlp = NULL;
3124 if (flagsp != NULL)
3125 flags = *flagsp &~ MSG_EOR;
3126 else
3127 flags = 0;
3128
3129 /*
3130 * If a recv attempt is made on a previously-accepted socket
3131 * that has been marked as inactive (disconnected), reject
3132 * the request.
3133 */
3134 if (so->so_flags & SOF_DEFUNCT) {
3135 struct sockbuf *sb = &so->so_rcv;
3136
3137 error = ENOTCONN;
3138 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3139 __func__, proc_pid(p), proc_best_name(p),
3140 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3141 SOCK_DOM(so), SOCK_TYPE(so), error);
3142 /*
3143 * This socket should have been disconnected and flushed
3144 * prior to being returned from sodefunct(); there should
3145 * be no data on its receive list, so panic otherwise.
3146 */
3147 if (so->so_state & SS_DEFUNCT)
3148 sb_empty_assert(sb, __func__);
3149 socket_unlock(so, 1);
3150 return (error);
3151 }
3152
3153 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3154 pr->pr_usrreqs->pru_preconnect) {
3155 /*
3156 		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3157 		 * call write() right after this. *If* the app then calls read()
3158 		 * we do not want to block that read indefinitely. Thus,
3159 		 * we trigger a connect so that the session gets initiated.
3160 */
3161 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3162
3163 if (error) {
3164 socket_unlock(so, 1);
3165 return (error);
3166 }
3167 }
3168
3169 if (ENTR_SHOULDTRACE &&
3170 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3171 /*
3172 * enable energy tracing for inet sockets that go over
3173 * non-loopback interfaces only.
3174 */
3175 struct inpcb *inp = sotoinpcb(so);
3176 if (inp->inp_last_outifp != NULL &&
3177 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3178 en_tracing = TRUE;
3179 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3180 VM_KERNEL_ADDRPERM(so),
3181 ((so->so_state & SS_NBIO) ?
3182 kEnTrFlagNonBlocking : 0),
3183 (int64_t)orig_resid);
3184 }
3185 }
3186
3187 /*
3188 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3189 	 * regardless of the flags argument. Here is the case where
3190 * out-of-band data is not inline.
3191 */
3192 if ((flags & MSG_OOB) ||
3193 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3194 (so->so_options & SO_OOBINLINE) == 0 &&
3195 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3196 m = m_get(M_WAIT, MT_DATA);
3197 if (m == NULL) {
3198 socket_unlock(so, 1);
3199 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3200 ENOBUFS, 0, 0, 0, 0);
3201 return (ENOBUFS);
3202 }
3203 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3204 if (error)
3205 goto bad;
3206 socket_unlock(so, 0);
3207 do {
3208 error = uiomove(mtod(m, caddr_t),
3209 imin(uio_resid(uio), m->m_len), uio);
3210 m = m_free(m);
3211 } while (uio_resid(uio) && error == 0 && m != NULL);
3212 socket_lock(so, 0);
3213 bad:
3214 if (m != NULL)
3215 m_freem(m);
3216
3217 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3218 if (error == EWOULDBLOCK || error == EINVAL) {
3219 /*
3220 * Let's try to get normal data:
3221 				 * EWOULDBLOCK: out-of-band data not
3222 				 * received yet. EINVAL: out-of-band data
3223 * already read.
3224 */
3225 error = 0;
3226 goto nooob;
3227 } else if (error == 0 && flagsp != NULL) {
3228 *flagsp |= MSG_OOB;
3229 }
3230 }
3231 socket_unlock(so, 1);
3232 if (en_tracing) {
3233 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3234 VM_KERNEL_ADDRPERM(so), 0,
3235 (int64_t)(orig_resid - uio_resid(uio)));
3236 }
3237 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3238 0, 0, 0, 0);
3239
3240 return (error);
3241 }
3242 nooob:
3243 if (mp != NULL)
3244 *mp = NULL;
3245
3246 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3247 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3248 }
3249
3250 free_list = NULL;
3251 delayed_copy_len = 0;
3252 restart:
3253 #ifdef MORE_LOCKING_DEBUG
3254 if (so->so_usecount <= 1)
3255 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3256 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3257 #endif
3258 /*
3259 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3260 * and if so just return to the caller. This could happen when
3261 * soreceive() is called by a socket upcall function during the
3262 * time the socket is freed. The socket buffer would have been
3263 * locked across the upcall, therefore we cannot put this thread
3264 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3265 * we may livelock), because the lock on the socket buffer will
3266 * only be released when the upcall routine returns to its caller.
3267 * Because the socket has been officially closed, there can be
3268 * no further read on it.
3269 *
3270 * A multipath subflow socket would have its SS_NOFDREF set by
3271 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3272 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3273 */
3274 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3275 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3276 socket_unlock(so, 1);
3277 return (0);
3278 }
3279
3280 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3281 if (error) {
3282 socket_unlock(so, 1);
3283 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3284 0, 0, 0, 0);
3285 if (en_tracing) {
3286 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3287 VM_KERNEL_ADDRPERM(so), 0,
3288 (int64_t)(orig_resid - uio_resid(uio)));
3289 }
3290 return (error);
3291 }
3292
3293 m = so->so_rcv.sb_mb;
3294 /*
3295 * If we have less data than requested, block awaiting more
3296 * (subject to any timeout) if:
3297 * 1. the current count is less than the low water mark, or
3298 * 2. MSG_WAITALL is set, and it is possible to do the entire
3299 * receive operation at once if we block (resid <= hiwat).
3300 * 3. MSG_DONTWAIT is not set
3301 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3302 * we have to do the receive in sections, and thus risk returning
3303 * a short count if a timeout or signal occurs after we start.
3304 */
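	/*
	 * For example, with the default low water mark (sb_lowat == 1), a
	 * blocking 4096-byte read on a stream socket that already has 100
	 * bytes queued returns those 100 bytes rather than sleeping, unless
	 * MSG_WAITALL was passed and 4096 <= sb_hiwat, in which case we wait
	 * for the full amount.
	 */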
3305 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3306 so->so_rcv.sb_cc < uio_resid(uio)) &&
3307 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3308 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3309 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3310 /*
3311 * Panic if we notice inconsistencies in the socket's
3312 * receive list; both sb_mb and sb_cc should correctly
3313 * reflect the contents of the list, otherwise we may
3314 * end up with false positives during select() or poll()
3315 * which could put the application in a bad state.
3316 */
3317 SB_MB_CHECK(&so->so_rcv);
3318
3319 if (so->so_error) {
3320 if (m != NULL)
3321 goto dontblock;
3322 error = so->so_error;
3323 if ((flags & MSG_PEEK) == 0)
3324 so->so_error = 0;
3325 goto release;
3326 }
3327 if (so->so_state & SS_CANTRCVMORE) {
3328 #if CONTENT_FILTER
3329 /*
3330 * Deal with half closed connections
3331 */
3332 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3333 cfil_sock_data_pending(&so->so_rcv) != 0)
3334 CFIL_LOG(LOG_INFO,
3335 "so %llx ignore SS_CANTRCVMORE",
3336 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3337 else
3338 #endif /* CONTENT_FILTER */
3339 if (m != NULL)
3340 goto dontblock;
3341 else
3342 goto release;
3343 }
3344 for (; m != NULL; m = m->m_next)
3345 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3346 m = so->so_rcv.sb_mb;
3347 goto dontblock;
3348 }
3349 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3350 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3351 error = ENOTCONN;
3352 goto release;
3353 }
3354 if (uio_resid(uio) == 0)
3355 goto release;
3356
3357 if ((so->so_state & SS_NBIO) ||
3358 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3359 error = EWOULDBLOCK;
3360 goto release;
3361 }
3362 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3363 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3364 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3365 #if EVEN_MORE_LOCKING_DEBUG
3366 if (socket_debug)
3367 printf("Waiting for socket data\n");
3368 #endif
3369
3370 error = sbwait(&so->so_rcv);
3371 #if EVEN_MORE_LOCKING_DEBUG
3372 if (socket_debug)
3373 printf("SORECEIVE - sbwait returned %d\n", error);
3374 #endif
3375 if (so->so_usecount < 1) {
3376 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3377 __func__, so, so->so_usecount);
3378 /* NOTREACHED */
3379 }
3380 if (error) {
3381 socket_unlock(so, 1);
3382 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3383 0, 0, 0, 0);
3384 if (en_tracing) {
3385 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3386 VM_KERNEL_ADDRPERM(so), 0,
3387 (int64_t)(orig_resid - uio_resid(uio)));
3388 }
3389 return (error);
3390 }
3391 goto restart;
3392 }
3393 dontblock:
3394 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3395 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3396 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3397 nextrecord = m->m_nextpkt;
3398
3399 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3400 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3401 mp0 == NULL);
3402 if (error == ERESTART)
3403 goto restart;
3404 else if (error != 0)
3405 goto release;
3406 orig_resid = 0;
3407 }
3408
3409 /*
3410 * Process one or more MT_CONTROL mbufs present before any data mbufs
3411 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3412 * just copy the data; if !MSG_PEEK, we call into the protocol to
3413 * perform externalization.
3414 */
3415 if (m != NULL && m->m_type == MT_CONTROL) {
3416 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3417 if (error != 0)
3418 goto release;
3419 orig_resid = 0;
3420 }
3421
3422 /*
3423 * If the socket is a TCP socket with message delivery
3424 * enabled, then create a control msg to deliver the
3425 * relative TCP sequence number for this data. Waiting
3426 * until this point will protect against failures to
3427 * allocate an mbuf for control msgs.
3428 */
3429 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3430 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3431 struct mbuf *seq_cm;
3432
3433 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3434 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3435 if (seq_cm == NULL) {
3436 /* unable to allocate a control mbuf */
3437 error = ENOBUFS;
3438 goto release;
3439 }
3440 *controlp = seq_cm;
3441 controlp = &seq_cm->m_next;
3442 }
3443
3444 if (m != NULL) {
3445 if (!(flags & MSG_PEEK)) {
3446 /*
3447 * We get here because m points to an mbuf following
3448 * any MT_SONAME or MT_CONTROL mbufs which have been
3449 * processed above. In any case, m should be pointing
3450 * to the head of the mbuf chain, and the nextrecord
3451 * should be either NULL or equal to m->m_nextpkt.
3452 * See comments above about SB_LOCK.
3453 */
3454 if (m != so->so_rcv.sb_mb ||
3455 m->m_nextpkt != nextrecord) {
3456 panic("%s: post-control !sync so=%p m=%p "
3457 "nextrecord=%p\n", __func__, so, m,
3458 nextrecord);
3459 /* NOTREACHED */
3460 }
3461 if (nextrecord == NULL)
3462 so->so_rcv.sb_lastrecord = m;
3463 }
3464 type = m->m_type;
3465 if (type == MT_OOBDATA)
3466 flags |= MSG_OOB;
3467 } else {
3468 if (!(flags & MSG_PEEK)) {
3469 SB_EMPTY_FIXUP(&so->so_rcv);
3470 }
3471 }
3472 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3473 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3474
3475 moff = 0;
3476 offset = 0;
3477
3478 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3479 can_delay = 1;
3480 else
3481 can_delay = 0;
3482
3483 need_event = 0;
3484
3485 while (m != NULL &&
3486 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3487 if (m->m_type == MT_OOBDATA) {
3488 if (type != MT_OOBDATA)
3489 break;
3490 } else if (type == MT_OOBDATA) {
3491 break;
3492 }
3493 /*
3494 		 * Make sure to always set the MSG_OOB flag when getting
3495 		 * out-of-band data inline.
3496 */
3497 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3498 (so->so_options & SO_OOBINLINE) != 0 &&
3499 (so->so_state & SS_RCVATMARK) != 0) {
3500 flags |= MSG_OOB;
3501 }
3502 so->so_state &= ~SS_RCVATMARK;
3503 len = uio_resid(uio) - delayed_copy_len;
3504 if (so->so_oobmark && len > so->so_oobmark - offset)
3505 len = so->so_oobmark - offset;
3506 if (len > m->m_len - moff)
3507 len = m->m_len - moff;
3508 /*
3509 * If mp is set, just pass back the mbufs.
3510 * Otherwise copy them out via the uio, then free.
3511 * Sockbuf must be consistent here (points to current mbuf,
3512 * it points to next record) when we drop priority;
3513 * we must note any additions to the sockbuf when we
3514 * block interrupts again.
3515 */
3516 if (mp == NULL) {
3517 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3518 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3519 if (can_delay && len == m->m_len) {
3520 /*
3521 				 * Only delay the copy if we're consuming the
3522 				 * mbuf, we're NOT in MSG_PEEK mode, and we
3523 				 * have enough data to make it worthwhile to
3524 				 * drop and retake the lock... can_delay
3525 				 * reflects the state of the two latter
3526 				 * constraints; moff should always be zero
3527 				 * in these cases.
3528 */
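				/*
				 * For example, a large read serviced from a
				 * queue of full cluster mbufs consumes each
				 * mbuf whole (len == m_len), so the copies
				 * are batched onto free_list and pushed to
				 * user space later in batched
				 * sodelayed_copy() passes instead of one
				 * uiomove() per mbuf.
				 */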
3529 delayed_copy_len += len;
3530 } else {
3531 if (delayed_copy_len) {
3532 error = sodelayed_copy(so, uio,
3533 &free_list, &delayed_copy_len);
3534
3535 if (error) {
3536 goto release;
3537 }
3538 /*
3539 					 * We can only get here if MSG_PEEK is
3540 					 * not set; therefore, m should point at
3541 					 * the head of the rcv queue. If it
3542 					 * doesn't, something drastic changed
3543 					 * while we were out from behind the
3544 					 * lock in sodelayed_copy, perhaps a RST
3545 					 * on the stream. In any event, the
3546 					 * stream has been interrupted. It's
3547 					 * probably best just to return whatever
3548 					 * data we've moved and let the caller
3549 					 * sort it out...
3550 */
3551 if (m != so->so_rcv.sb_mb) {
3552 break;
3553 }
3554 }
3555 socket_unlock(so, 0);
3556 error = uiomove(mtod(m, caddr_t) + moff,
3557 (int)len, uio);
3558 socket_lock(so, 0);
3559
3560 if (error)
3561 goto release;
3562 }
3563 } else {
3564 uio_setresid(uio, (uio_resid(uio) - len));
3565 }
3566 if (len == m->m_len - moff) {
3567 if (m->m_flags & M_EOR)
3568 flags |= MSG_EOR;
3569 if (flags & MSG_PEEK) {
3570 m = m->m_next;
3571 moff = 0;
3572 } else {
3573 nextrecord = m->m_nextpkt;
3574 sbfree(&so->so_rcv, m);
3575 m->m_nextpkt = NULL;
3576
3577 /*
3578 * If this packet is an unordered packet
3579 * (indicated by M_UNORDERED_DATA flag), remove
3580 * the additional bytes added to the
3581 * receive socket buffer size.
3582 */
3583 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3584 m->m_len &&
3585 (m->m_flags & M_UNORDERED_DATA) &&
3586 sbreserve(&so->so_rcv,
3587 so->so_rcv.sb_hiwat - m->m_len)) {
3588 if (so->so_msg_state->msg_uno_bytes >
3589 m->m_len) {
3590 so->so_msg_state->
3591 msg_uno_bytes -= m->m_len;
3592 } else {
3593 so->so_msg_state->
3594 msg_uno_bytes = 0;
3595 }
3596 m->m_flags &= ~M_UNORDERED_DATA;
3597 }
3598
3599 if (mp != NULL) {
3600 *mp = m;
3601 mp = &m->m_next;
3602 so->so_rcv.sb_mb = m = m->m_next;
3603 *mp = NULL;
3604 } else {
3605 if (free_list == NULL)
3606 free_list = m;
3607 else
3608 ml->m_next = m;
3609 ml = m;
3610 so->so_rcv.sb_mb = m = m->m_next;
3611 ml->m_next = NULL;
3612 }
3613 if (m != NULL) {
3614 m->m_nextpkt = nextrecord;
3615 if (nextrecord == NULL)
3616 so->so_rcv.sb_lastrecord = m;
3617 } else {
3618 so->so_rcv.sb_mb = nextrecord;
3619 SB_EMPTY_FIXUP(&so->so_rcv);
3620 }
3621 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3622 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3623 }
3624 } else {
3625 if (flags & MSG_PEEK) {
3626 moff += len;
3627 } else {
3628 if (mp != NULL) {
3629 int copy_flag;
3630
3631 if (flags & MSG_DONTWAIT)
3632 copy_flag = M_DONTWAIT;
3633 else
3634 copy_flag = M_WAIT;
3635 *mp = m_copym(m, 0, len, copy_flag);
3636 /*
3637 * Failed to allocate an mbuf?
3638 * Adjust uio_resid back, it was
3639 * adjusted down by len bytes which
3640 * we didn't copy over.
3641 */
3642 if (*mp == NULL) {
3643 uio_setresid(uio,
3644 (uio_resid(uio) + len));
3645 break;
3646 }
3647 }
3648 m->m_data += len;
3649 m->m_len -= len;
3650 so->so_rcv.sb_cc -= len;
3651 }
3652 }
3653 if (so->so_oobmark) {
3654 if ((flags & MSG_PEEK) == 0) {
3655 so->so_oobmark -= len;
3656 if (so->so_oobmark == 0) {
3657 so->so_state |= SS_RCVATMARK;
3658 /*
3659 * delay posting the actual event until
3660 * after any delayed copy processing
3661 * has finished
3662 */
3663 need_event = 1;
3664 break;
3665 }
3666 } else {
3667 offset += len;
3668 if (offset == so->so_oobmark)
3669 break;
3670 }
3671 }
3672 if (flags & MSG_EOR)
3673 break;
3674 /*
3675 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3676 * (for non-atomic socket), we must not quit until
3677 * "uio->uio_resid == 0" or an error termination.
3678 * If a signal/timeout occurs, return with a short
3679 * count but without error. Keep sockbuf locked
3680 * against other readers.
3681 */
3682 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3683 (uio_resid(uio) - delayed_copy_len) > 0 &&
3684 !sosendallatonce(so) && !nextrecord) {
3685 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3686 #if CONTENT_FILTER
3687 && cfil_sock_data_pending(&so->so_rcv) == 0
3688 #endif /* CONTENT_FILTER */
3689 ))
3690 goto release;
3691
3692 /*
3693 * Depending on the protocol (e.g. TCP), the following
3694 * might cause the socket lock to be dropped and later
3695 * be reacquired, and more data could have arrived and
3696 * have been appended to the receive socket buffer by
3697 * the time it returns. Therefore, we only sleep in
3698 * sbwait() below if and only if the socket buffer is
3699 * empty, in order to avoid a false sleep.
3700 */
3701 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3702 (((struct inpcb *)so->so_pcb)->inp_state !=
3703 INPCB_STATE_DEAD))
3704 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3705
3706 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3707 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3708
3709 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3710 error = 0;
3711 goto release;
3712 }
3713 /*
3714 			 * We have to wait until after we get back from the
3715 			 * sbwait to do the copy, because we will drop the
3716 			 * lock if we have enough data that has been delayed...
3717 			 * By dropping the lock we open up a window allowing
3718 			 * the netisr thread to process the incoming packets
3719 			 * and to change the state of this socket... We're
3720 			 * issuing the sbwait because the socket is empty and
3721 			 * we're expecting the netisr thread to wake us up
3722 			 * when more packets arrive; if we allow that
3723 			 * processing to happen and then sbwait, we could
3724 			 * stall forever with packets sitting in the socket
3725 			 * if no further packets arrive from the remote side.
3726 			 *
3727 			 * We want to copy before we've collected all the
3728 			 * data to satisfy this request, to allow the copy to
3729 			 * overlap the incoming packet processing on an MP system.
3730 */
3731 if (delayed_copy_len > sorecvmincopy &&
3732 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3733 error = sodelayed_copy(so, uio,
3734 &free_list, &delayed_copy_len);
3735
3736 if (error)
3737 goto release;
3738 }
3739 m = so->so_rcv.sb_mb;
3740 if (m != NULL) {
3741 nextrecord = m->m_nextpkt;
3742 }
3743 SB_MB_CHECK(&so->so_rcv);
3744 }
3745 }
3746 #ifdef MORE_LOCKING_DEBUG
3747 if (so->so_usecount <= 1) {
3748 panic("%s: after big while so=%p ref=%d on socket\n",
3749 __func__, so, so->so_usecount);
3750 /* NOTREACHED */
3751 }
3752 #endif
3753
3754 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3755 if (so->so_options & SO_DONTTRUNC) {
3756 flags |= MSG_RCVMORE;
3757 } else {
3758 flags |= MSG_TRUNC;
3759 if ((flags & MSG_PEEK) == 0)
3760 (void) sbdroprecord(&so->so_rcv);
3761 }
3762 }
3763
3764 /*
3765 * pru_rcvd below (for TCP) may cause more data to be received
3766 * if the socket lock is dropped prior to sending the ACK; some
3767 * legacy OpenTransport applications don't handle this well
3768 * (if it receives less data than requested while MSG_HAVEMORE
3769 * is set), and so we set the flag now based on what we know
3770 * prior to calling pru_rcvd.
3771 */
3772 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3773 flags |= MSG_HAVEMORE;
3774
3775 if ((flags & MSG_PEEK) == 0) {
3776 if (m == NULL) {
3777 so->so_rcv.sb_mb = nextrecord;
3778 /*
3779 * First part is an inline SB_EMPTY_FIXUP(). Second
3780 * part makes sure sb_lastrecord is up-to-date if
3781 * there is still data in the socket buffer.
3782 */
3783 if (so->so_rcv.sb_mb == NULL) {
3784 so->so_rcv.sb_mbtail = NULL;
3785 so->so_rcv.sb_lastrecord = NULL;
3786 } else if (nextrecord->m_nextpkt == NULL) {
3787 so->so_rcv.sb_lastrecord = nextrecord;
3788 }
3789 SB_MB_CHECK(&so->so_rcv);
3790 }
3791 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3792 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3793 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3794 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3795 }
3796
3797 if (delayed_copy_len) {
3798 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3799 if (error)
3800 goto release;
3801 }
3802 if (free_list != NULL) {
3803 m_freem_list(free_list);
3804 free_list = NULL;
3805 }
3806 if (need_event)
3807 postevent(so, 0, EV_OOB);
3808
3809 if (orig_resid == uio_resid(uio) && orig_resid &&
3810 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3811 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3812 goto restart;
3813 }
3814
3815 if (flagsp != NULL)
3816 *flagsp |= flags;
3817 release:
3818 #ifdef MORE_LOCKING_DEBUG
3819 if (so->so_usecount <= 1) {
3820 panic("%s: release so=%p ref=%d on socket\n", __func__,
3821 so, so->so_usecount);
3822 /* NOTREACHED */
3823 }
3824 #endif
3825 if (delayed_copy_len)
3826 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3827
3828 if (free_list != NULL)
3829 m_freem_list(free_list);
3830
3831 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3832
3833 if (en_tracing) {
3834 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3835 VM_KERNEL_ADDRPERM(so),
3836 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3837 (int64_t)(orig_resid - uio_resid(uio)));
3838 }
3839 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3840 so->so_rcv.sb_cc, 0, error);
3841
3842 return (error);
3843 }
3844
3845 /*
3846 * Returns: 0 Success
3847 * uiomove:EFAULT
3848 */
3849 static int
3850 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3851 user_ssize_t *resid)
3852 {
3853 int error = 0;
3854 struct mbuf *m;
3855
3856 m = *free_list;
3857
3858 socket_unlock(so, 0);
3859
3860 while (m != NULL && error == 0) {
3861 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3862 m = m->m_next;
3863 }
3864 m_freem_list(*free_list);
3865
3866 *free_list = NULL;
3867 *resid = 0;
3868
3869 socket_lock(so, 0);
3870
3871 return (error);
3872 }
3873
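/*
 * Copy the packet chains accumulated on free_list out to the uio of the
 * corresponding msgarray slot, then free the whole list.  The socket itself
 * is not referenced here (callers drop the socket lock before invoking it),
 * hence the #pragma unused(so) below.
 */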
3874 static int
3875 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3876 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3877 {
3878 #pragma unused(so)
3879 int error = 0;
3880 struct mbuf *ml, *m;
3881 int i = 0;
3882 struct uio *auio;
3883
3884 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3885 ml = ml->m_nextpkt, i++) {
3886 auio = msgarray[i].uio;
3887 for (m = ml; m != NULL; m = m->m_next) {
3888 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3889 if (error != 0)
3890 goto out;
3891 }
3892 }
3893 out:
3894 m_freem_list(*free_list);
3895
3896 *free_list = NULL;
3897 *resid = 0;
3898
3899 return (error);
3900 }
3901
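/*
 * Batch receive: fill up to uiocnt entries of msgarray with one datagram
 * each.  Only SOCK_DGRAM sockets are accepted, and the routine prefers to
 * return what it already has over blocking once at least one packet has
 * been consumed (see the "Do not block if we got some data" check below).
 */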
3902 int
3903 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3904 int *flagsp)
3905 {
3906 struct mbuf *m;
3907 struct mbuf *nextrecord;
3908 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3909 int error;
3910 user_ssize_t len, pktlen, delayed_copy_len = 0;
3911 struct protosw *pr = so->so_proto;
3912 user_ssize_t resid;
3913 struct proc *p = current_proc();
3914 struct uio *auio = NULL;
3915 int npkts = 0;
3916 int sblocked = 0;
3917 struct sockaddr **psa = NULL;
3918 struct mbuf **controlp = NULL;
3919 int can_delay;
3920 int flags;
3921 struct mbuf *free_others = NULL;
3922
3923 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3924 so, uiocnt,
3925 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3926
3927 /*
3928 * Sanity checks:
3929 * - Only supports don't-wait style flags
3930 * - Only supports datagram sockets (could be extended to raw)
3931 * - Must be atomic
3932 * - Protocol must support packet chains
3933 * - The uio array must not be NULL (should we panic?)
3934 */
3935 if (flagsp != NULL)
3936 flags = *flagsp;
3937 else
3938 flags = 0;
3939 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3940 MSG_NBIO)) {
3941 printf("%s invalid flags 0x%x\n", __func__, flags);
3942 error = EINVAL;
3943 goto out;
3944 }
3945 if (so->so_type != SOCK_DGRAM) {
3946 error = EINVAL;
3947 goto out;
3948 }
3949 if (sosendallatonce(so) == 0) {
3950 error = EINVAL;
3951 goto out;
3952 }
3953 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3954 error = EPROTONOSUPPORT;
3955 goto out;
3956 }
3957 if (msgarray == NULL) {
3958 printf("%s msgarray is NULL\n", __func__);
3959 error = EINVAL;
3960 goto out;
3961 }
3962 if (uiocnt == 0) {
3963 printf("%s uiocnt is 0\n", __func__);
3964 error = EINVAL;
3965 goto out;
3966 }
3967 /*
3968 * Sanity check on the length passed by caller as we are making 'int'
3969 * comparisons
3970 */
3971 resid = recv_msg_array_resid(msgarray, uiocnt);
3972 if (resid < 0 || resid > INT_MAX) {
3973 error = EINVAL;
3974 goto out;
3975 }
3976
3977 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3978 can_delay = 1;
3979 else
3980 can_delay = 0;
3981
3982 socket_lock(so, 1);
3983 so_update_last_owner_locked(so, p);
3984 so_update_policy(so);
3985
3986 #if NECP
3987 so_update_necp_policy(so, NULL, NULL);
3988 #endif /* NECP */
3989
3990 /*
3991 * If a recv attempt is made on a previously-accepted socket
3992 * that has been marked as inactive (disconnected), reject
3993 * the request.
3994 */
3995 if (so->so_flags & SOF_DEFUNCT) {
3996 struct sockbuf *sb = &so->so_rcv;
3997
3998 error = ENOTCONN;
3999 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4000 __func__, proc_pid(p), proc_best_name(p),
4001 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4002 SOCK_DOM(so), SOCK_TYPE(so), error);
4003 /*
4004 * This socket should have been disconnected and flushed
4005 * prior to being returned from sodefunct(); there should
4006 * be no data on its receive list, so panic otherwise.
4007 */
4008 if (so->so_state & SS_DEFUNCT)
4009 sb_empty_assert(sb, __func__);
4010 goto release;
4011 }
4012
4013 next:
4014 /*
4015 * Stop once the uio array has been exhausted
4016 */
4017 if (npkts >= uiocnt) {
4018 error = 0;
4019 goto release;
4020 }
4021 restart:
4022 /*
4023 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4024 * and if so just return to the caller. This could happen when
4025 * soreceive() is called by a socket upcall function during the
4026 * time the socket is freed. The socket buffer would have been
4027 * locked across the upcall, therefore we cannot put this thread
4028 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4029 * we may livelock), because the lock on the socket buffer will
4030 * only be released when the upcall routine returns to its caller.
4031 * Because the socket has been officially closed, there can be
4032 * no further read on it.
4033 */
4034 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4035 (SS_NOFDREF | SS_CANTRCVMORE)) {
4036 error = 0;
4037 goto release;
4038 }
4039
4040 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4041 if (error) {
4042 goto release;
4043 }
4044 sblocked = 1;
4045
4046 m = so->so_rcv.sb_mb;
4047 /*
4048 * Block awaiting more datagrams if needed
4049 */
4050 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4051 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4052 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4053 /*
4054 * Panic if we notice inconsistencies in the socket's
4055 * receive list; both sb_mb and sb_cc should correctly
4056 * reflect the contents of the list, otherwise we may
4057 * end up with false positives during select() or poll()
4058 * which could put the application in a bad state.
4059 */
4060 SB_MB_CHECK(&so->so_rcv);
4061
4062 if (so->so_error) {
4063 error = so->so_error;
4064 if ((flags & MSG_PEEK) == 0)
4065 so->so_error = 0;
4066 goto release;
4067 }
4068 if (so->so_state & SS_CANTRCVMORE) {
4069 goto release;
4070 }
4071 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4072 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4073 error = ENOTCONN;
4074 goto release;
4075 }
4076 if ((so->so_state & SS_NBIO) ||
4077 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4078 error = EWOULDBLOCK;
4079 goto release;
4080 }
4081 /*
4082 * Do not block if we got some data
4083 */
4084 if (free_list != NULL) {
4085 error = 0;
4086 goto release;
4087 }
4088
4089 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4090 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4091
4092 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4093 sblocked = 0;
4094
4095 error = sbwait(&so->so_rcv);
4096 if (error) {
4097 goto release;
4098 }
4099 goto restart;
4100 }
4101
4102 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4103 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4104 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4105
4106 /*
4107 * Consume the current uio index as we have a datagram
4108 */
4109 auio = msgarray[npkts].uio;
4110 resid = uio_resid(auio);
4111 msgarray[npkts].which |= SOCK_MSG_DATA;
4112 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4113 &msgarray[npkts].psa : NULL;
4114 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4115 &msgarray[npkts].controlp : NULL;
4116 npkts += 1;
4117 nextrecord = m->m_nextpkt;
4118
4119 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4120 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4121 if (error == ERESTART)
4122 goto restart;
4123 else if (error != 0)
4124 goto release;
4125 }
4126
4127 if (m != NULL && m->m_type == MT_CONTROL) {
4128 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4129 if (error != 0)
4130 goto release;
4131 }
4132
4133 if (m->m_pkthdr.len == 0) {
4134 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4135 __func__, __LINE__,
4136 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4137 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4138 m->m_type);
4139 }
4140
4141 /*
4142 * Loop to copy the mbufs of the current record
4143 * Support zero length packets
4144 */
4145 ml = NULL;
4146 pktlen = 0;
4147 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4148 if (m->m_len == 0)
4149 panic("%p m_len zero", m);
4150 if (m->m_type == 0)
4151 panic("%p m_type zero", m);
4152 /*
4153 * Clip to the residual length
4154 */
4155 if (len > m->m_len)
4156 len = m->m_len;
4157 pktlen += len;
4158 /*
4159 * Copy the mbufs via the uio or delay the copy
4160 * Sockbuf must be consistent here (m points to the current mbuf,
4161 * nextrecord points to the next record) when we drop priority;
4162 * we must note any additions to the sockbuf when we
4163 * block interrupts again.
4164 */
4165 if (len > 0 && can_delay == 0) {
4166 socket_unlock(so, 0);
4167 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4168 socket_lock(so, 0);
4169 if (error)
4170 goto release;
4171 } else {
4172 delayed_copy_len += len;
4173 }
4174
4175 if (len == m->m_len) {
4176 /*
4177 * m was entirely copied
4178 */
4179 sbfree(&so->so_rcv, m);
4180 nextrecord = m->m_nextpkt;
4181 m->m_nextpkt = NULL;
4182
4183 /*
4184 * Set the first packet to the head of the free list
4185 */
4186 if (free_list == NULL)
4187 free_list = m;
4188 /*
4189 * Link current packet to tail of free list
4190 */
4191 if (ml == NULL) {
4192 if (free_tail != NULL)
4193 free_tail->m_nextpkt = m;
4194 free_tail = m;
4195 }
4196 /*
4197 * Link current mbuf to last mbuf of current packet
4198 */
4199 if (ml != NULL)
4200 ml->m_next = m;
4201 ml = m;
4202
4203 /*
4204 * Move next buf to head of socket buffer
4205 */
4206 so->so_rcv.sb_mb = m = ml->m_next;
4207 ml->m_next = NULL;
4208
4209 if (m != NULL) {
4210 m->m_nextpkt = nextrecord;
4211 if (nextrecord == NULL)
4212 so->so_rcv.sb_lastrecord = m;
4213 } else {
4214 so->so_rcv.sb_mb = nextrecord;
4215 SB_EMPTY_FIXUP(&so->so_rcv);
4216 }
4217 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4218 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4219 } else {
4220 /*
4221 * Stop the loop on partial copy
4222 */
4223 break;
4224 }
4225 }
4226 #ifdef MORE_LOCKING_DEBUG
4227 if (so->so_usecount <= 1) {
4228 panic("%s: after big while so=%llx ref=%d on socket\n",
4229 __func__,
4230 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4231 /* NOTREACHED */
4232 }
4233 #endif
4234 /*
4235 * Tell the caller we made a partial copy
4236 */
4237 if (m != NULL) {
4238 if (so->so_options & SO_DONTTRUNC) {
4239 /*
4240 * Copy out the free list first, then the partial mbuf
4241 */
4242 socket_unlock(so, 0);
4243 if (delayed_copy_len)
4244 error = sodelayed_copy_list(so, msgarray,
4245 uiocnt, &free_list, &delayed_copy_len);
4246
4247 if (error == 0) {
4248 error = uiomove(mtod(m, caddr_t), (int)len,
4249 auio);
4250 }
4251 socket_lock(so, 0);
4252 if (error)
4253 goto release;
4254
4255 m->m_data += len;
4256 m->m_len -= len;
4257 so->so_rcv.sb_cc -= len;
4258 flags |= MSG_RCVMORE;
4259 } else {
4260 (void) sbdroprecord(&so->so_rcv);
4261 nextrecord = so->so_rcv.sb_mb;
4262 m = NULL;
4263 flags |= MSG_TRUNC;
4264 }
4265 }
4266
4267 if (m == NULL) {
4268 so->so_rcv.sb_mb = nextrecord;
4269 /*
4270 * First part is an inline SB_EMPTY_FIXUP(). Second
4271 * part makes sure sb_lastrecord is up-to-date if
4272 * there is still data in the socket buffer.
4273 */
4274 if (so->so_rcv.sb_mb == NULL) {
4275 so->so_rcv.sb_mbtail = NULL;
4276 so->so_rcv.sb_lastrecord = NULL;
4277 } else if (nextrecord->m_nextpkt == NULL) {
4278 so->so_rcv.sb_lastrecord = nextrecord;
4279 }
4280 SB_MB_CHECK(&so->so_rcv);
4281 }
4282 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4283 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4284
4285 /*
4286 * We can continue to the next packet as long as:
4287 * - We haven't exhausted the uio array
4288 * - There was no error
4289 * - A packet was not truncated
4290 * - We can still receive more data
4291 */
4292 if (npkts < uiocnt && error == 0 &&
4293 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4294 (so->so_state & SS_CANTRCVMORE) == 0) {
4295 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4296 sblocked = 0;
4297
4298 goto next;
4299 }
4300 if (flagsp != NULL)
4301 *flagsp |= flags;
4302
4303 release:
4304 /*
4305 * pru_rcvd may cause more data to be received if the socket lock
4306 * is dropped so we set MSG_HAVEMORE now based on what we know.
4307 * That way the caller won't be surprised if it receives less data
4308 * than requested.
4309 */
4310 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4311 flags |= MSG_HAVEMORE;
4312
4313 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4314 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4315
4316 if (sblocked)
4317 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4318 else
4319 socket_unlock(so, 1);
4320
4321 if (delayed_copy_len)
4322 error = sodelayed_copy_list(so, msgarray, uiocnt,
4323 &free_list, &delayed_copy_len);
4324 out:
4325 /*
4326 * Amortize the cost of freeing the mbufs
4327 */
4328 if (free_list != NULL)
4329 m_freem_list(free_list);
4330 if (free_others != NULL)
4331 m_freem_list(free_others);
4332
4333 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4334 0, 0, 0, 0);
4335 return (error);
4336 }
4337
4338 /*
4339 * Returns: 0 Success
4340 * EINVAL
4341 * ENOTCONN
4342 * <pru_shutdown>:EINVAL
4343 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4344 * <pru_shutdown>:ENOBUFS[TCP]
4345 * <pru_shutdown>:EMSGSIZE[TCP]
4346 * <pru_shutdown>:EHOSTUNREACH[TCP]
4347 * <pru_shutdown>:ENETUNREACH[TCP]
4348 * <pru_shutdown>:ENETDOWN[TCP]
4349 * <pru_shutdown>:ENOMEM[TCP]
4350 * <pru_shutdown>:EACCES[TCP]
4351 * <pru_shutdown>:EMSGSIZE[TCP]
4352 * <pru_shutdown>:ENOBUFS[TCP]
4353 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4354 * <pru_shutdown>:??? [other protocol families]
4355 */
4356 int
4357 soshutdown(struct socket *so, int how)
4358 {
4359 int error;
4360
4361 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4362
4363 switch (how) {
4364 case SHUT_RD:
4365 case SHUT_WR:
4366 case SHUT_RDWR:
4367 socket_lock(so, 1);
4368 if ((so->so_state &
4369 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4370 error = ENOTCONN;
4371 } else {
4372 error = soshutdownlock(so, how);
4373 }
4374 socket_unlock(so, 1);
4375 break;
4376 default:
4377 error = EINVAL;
4378 break;
4379 }
4380
4381 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4382
4383 return (error);
4384 }
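/*
 * Illustrative only (not part of the kernel build): a user process reaches
 * the path above via the shutdown(2) system call, e.g.
 *
 *	shutdown(sockfd, SHUT_WR);
 *
 * which stops further sends so the peer then reads EOF.  An unconnected
 * socket is rejected with ENOTCONN and an invalid "how" with EINVAL, as
 * handled above.
 */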
4385
4386 int
4387 soshutdownlock_final(struct socket *so, int how)
4388 {
4389 struct protosw *pr = so->so_proto;
4390 int error = 0;
4391
4392 sflt_notify(so, sock_evt_shutdown, &how);
4393
4394 if (how != SHUT_WR) {
4395 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4396 /* read already shut down */
4397 error = ENOTCONN;
4398 goto done;
4399 }
4400 sorflush(so);
4401 postevent(so, 0, EV_RCLOSED);
4402 }
4403 if (how != SHUT_RD) {
4404 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4405 /* write already shut down */
4406 error = ENOTCONN;
4407 goto done;
4408 }
4409 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4410 postevent(so, 0, EV_WCLOSED);
4411 }
4412 done:
4413 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4414 return (error);
4415 }
4416
4417 int
4418 soshutdownlock(struct socket *so, int how)
4419 {
4420 int error = 0;
4421
4422 #if CONTENT_FILTER
4423 /*
4424 * A content filter may delay the actual shutdown until it
4425 * has processed the pending data
4426 */
4427 if (so->so_flags & SOF_CONTENT_FILTER) {
4428 error = cfil_sock_shutdown(so, &how);
4429 if (error == EJUSTRETURN) {
4430 error = 0;
4431 goto done;
4432 } else if (error != 0) {
4433 goto done;
4434 }
4435 }
4436 #endif /* CONTENT_FILTER */
4437
4438 error = soshutdownlock_final(so, how);
4439
4440 done:
4441 return (error);
4442 }
4443
4444 void
4445 sowflush(struct socket *so)
4446 {
4447 struct sockbuf *sb = &so->so_snd;
4448
4449 /*
4450 * Obtain lock on the socket buffer (SB_LOCK). This is required
4451 * to prevent the socket buffer from being unexpectedly altered
4452 * while it is used by another thread in socket send/receive.
4453 *
4454 * sblock() must not fail here, hence the assertion.
4455 */
4456 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4457 VERIFY(sb->sb_flags & SB_LOCK);
4458
4459 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4460 sb->sb_flags |= SB_DROP;
4461 sb->sb_upcall = NULL;
4462 sb->sb_upcallarg = NULL;
4463
4464 sbunlock(sb, TRUE); /* keep socket locked */
4465
4466 selthreadclear(&sb->sb_sel);
4467 sbrelease(sb);
4468 }
4469
4470 void
4471 sorflush(struct socket *so)
4472 {
4473 struct sockbuf *sb = &so->so_rcv;
4474 struct protosw *pr = so->so_proto;
4475 struct sockbuf asb;
4476 #ifdef notyet
4477 lck_mtx_t *mutex_held;
4478 /*
4479 * XXX: This code is currently commented out, because we may get here
4480 * as part of sofreelastref(), and at that time, pr_getlock() may no
4481 * longer be able to return us the lock; this will be fixed in future.
4482 */
4483 if (so->so_proto->pr_getlock != NULL)
4484 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4485 else
4486 mutex_held = so->so_proto->pr_domain->dom_mtx;
4487
4488 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4489 #endif /* notyet */
4490
4491 sflt_notify(so, sock_evt_flush_read, NULL);
4492
4493 socantrcvmore(so);
4494
4495 /*
4496 * Obtain lock on the socket buffer (SB_LOCK). This is required
4497 * to prevent the socket buffer from being unexpectedly altered
4498 * while it is used by another thread in socket send/receive.
4499 *
4500 * sblock() must not fail here, hence the assertion.
4501 */
4502 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4503 VERIFY(sb->sb_flags & SB_LOCK);
4504
4505 /*
4506 * Copy only the relevant fields from "sb" to "asb" which we
4507 * need for sbrelease() to function. In particular, skip
4508 * sb_sel as it contains the wait queue linkage, which would
4509 * wreak havoc if we were to issue selthreadclear() on "asb".
4510 * Make sure to not carry over SB_LOCK in "asb", as we need
4511 * to acquire it later as part of sbrelease().
4512 */
4513 bzero(&asb, sizeof (asb));
4514 asb.sb_cc = sb->sb_cc;
4515 asb.sb_hiwat = sb->sb_hiwat;
4516 asb.sb_mbcnt = sb->sb_mbcnt;
4517 asb.sb_mbmax = sb->sb_mbmax;
4518 asb.sb_ctl = sb->sb_ctl;
4519 asb.sb_lowat = sb->sb_lowat;
4520 asb.sb_mb = sb->sb_mb;
4521 asb.sb_mbtail = sb->sb_mbtail;
4522 asb.sb_lastrecord = sb->sb_lastrecord;
4523 asb.sb_so = sb->sb_so;
4524 asb.sb_flags = sb->sb_flags;
4525 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4526 asb.sb_flags |= SB_DROP;
4527
4528 /*
4529 * Ideally we'd bzero() these and preserve the ones we need;
4530 * but to do that we'd need to shuffle things around in the
4531 * sockbuf, and we can't do it now because there are KEXTS
4532 * that are directly referring to the socket structure.
4533 *
4534 * Setting SB_DROP acts as a barrier to prevent further appends.
4535 * Clearing SB_SEL is done for selthreadclear() below.
4536 */
4537 sb->sb_cc = 0;
4538 sb->sb_hiwat = 0;
4539 sb->sb_mbcnt = 0;
4540 sb->sb_mbmax = 0;
4541 sb->sb_ctl = 0;
4542 sb->sb_lowat = 0;
4543 sb->sb_mb = NULL;
4544 sb->sb_mbtail = NULL;
4545 sb->sb_lastrecord = NULL;
4546 sb->sb_timeo.tv_sec = 0;
4547 sb->sb_timeo.tv_usec = 0;
4548 sb->sb_upcall = NULL;
4549 sb->sb_upcallarg = NULL;
4550 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4551 sb->sb_flags |= SB_DROP;
4552
4553 sbunlock(sb, TRUE); /* keep socket locked */
4554
4555 /*
4556 * Note that selthreadclear() is called on the original "sb" and
4557 * not the local "asb" because of the way wait queue linkage is
4558 * implemented. Given that selwakeup() may be triggered, SB_SEL
4559 * should no longer be set (cleared above.)
4560 */
4561 selthreadclear(&sb->sb_sel);
4562
4563 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4564 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4565
4566 sbrelease(&asb);
4567 }
4568
4569 /*
4570 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4571 * an additional variant to handle the case where the option value needs
4572 * to be some kind of integer, but not a specific size.
4573 * In addition to their use here, these functions are also called by the
4574 * protocol-level pr_ctloutput() routines.
4575 *
4576 * Returns: 0 Success
4577 * EINVAL
4578 * copyin:EFAULT
4579 */
4580 int
4581 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4582 {
4583 size_t valsize;
4584
4585 /*
4586 * If the user gives us more than we wanted, we ignore it,
4587 * but if we don't get the minimum length the caller
4588 * wants, we return EINVAL. On success, sopt->sopt_valsize
4589 * is set to however much we actually retrieved.
4590 */
4591 if ((valsize = sopt->sopt_valsize) < minlen)
4592 return (EINVAL);
4593 if (valsize > len)
4594 sopt->sopt_valsize = valsize = len;
4595
4596 if (sopt->sopt_p != kernproc)
4597 return (copyin(sopt->sopt_val, buf, valsize));
4598
4599 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4600 return (0);
4601 }
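/*
 * Typical call pattern (sketch): protocol handlers copy a fixed-size option
 * value in with
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *
 * which fails with EINVAL if the caller supplied fewer than sizeof (optval)
 * bytes and silently ignores any excess.
 */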
4602
4603 /*
4604 * sooptcopyin_timeval
4605 * Copy a timeval value into tv_p, taking into account whether the
4606 * calling process is 64-bit or 32-bit. Moved the sanity checking
4607 * code here so that we can verify the 64-bit tv_sec value before we lose
4608 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4609 */
4610 static int
4611 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4612 {
4613 int error;
4614
4615 if (proc_is64bit(sopt->sopt_p)) {
4616 struct user64_timeval tv64;
4617
4618 if (sopt->sopt_valsize < sizeof (tv64))
4619 return (EINVAL);
4620
4621 sopt->sopt_valsize = sizeof (tv64);
4622 if (sopt->sopt_p != kernproc) {
4623 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4624 if (error != 0)
4625 return (error);
4626 } else {
4627 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4628 sizeof (tv64));
4629 }
4630 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4631 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4632 return (EDOM);
4633
4634 tv_p->tv_sec = tv64.tv_sec;
4635 tv_p->tv_usec = tv64.tv_usec;
4636 } else {
4637 struct user32_timeval tv32;
4638
4639 if (sopt->sopt_valsize < sizeof (tv32))
4640 return (EINVAL);
4641
4642 sopt->sopt_valsize = sizeof (tv32);
4643 if (sopt->sopt_p != kernproc) {
4644 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4645 if (error != 0) {
4646 return (error);
4647 }
4648 } else {
4649 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4650 sizeof (tv32));
4651 }
4652 #ifndef __LP64__
4653 /*
4654 * K64todo "comparison is always false due to
4655 * limited range of data type"
4656 */
4657 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4658 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4659 return (EDOM);
4660 #endif
4661 tv_p->tv_sec = tv32.tv_sec;
4662 tv_p->tv_usec = tv32.tv_usec;
4663 }
4664 return (0);
4665 }
4666
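/*
 * Check a privilege against the credential that effectively owns the socket:
 * for delegated sockets the effective process's credential is used,
 * otherwise the socket's own credential (so_cred).
 */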
4667 static int
4668 soopt_cred_check(struct socket *so, int priv)
4669 {
4670 kauth_cred_t cred = NULL;
4671 proc_t ep = PROC_NULL;
4672 int error;
4673
4674 if (so->so_flags & SOF_DELEGATED) {
4675 ep = proc_find(so->e_pid);
4676 if (ep)
4677 cred = kauth_cred_proc_ref(ep);
4678 }
4679 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4680 if (cred)
4681 kauth_cred_unref(&cred);
4682 if (ep != PROC_NULL)
4683 proc_rele(ep);
4684
4685 return (error);
4686 }
4687
4688 /*
4689 * Returns: 0 Success
4690 * EINVAL
4691 * ENOPROTOOPT
4692 * ENOBUFS
4693 * EDOM
4694 * sooptcopyin:EINVAL
4695 * sooptcopyin:EFAULT
4696 * sooptcopyin_timeval:EINVAL
4697 * sooptcopyin_timeval:EFAULT
4698 * sooptcopyin_timeval:EDOM
4699 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4700 * <pr_ctloutput>:???
4701 * sflt_attach_private:??? [whatever a filter author chooses]
4702 * <sf_setoption>:??? [whatever a filter author chooses]
4703 *
4704 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4705 * <sf_setoption> returns depend on what the filter author causes
4706 * their filter to return.
4707 */
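/*
 * Illustrative userland usage (assumption, not part of this file): the
 * socket-level options handled below arrive via setsockopt(2), e.g.
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(sockfd, SOL_SOCKET, SO_LINGER, &l, sizeof (l));
 *
 * which lands here with sopt_level == SOL_SOCKET and sopt_name == SO_LINGER.
 */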
4708 int
4709 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4710 {
4711 int error, optval;
4712 struct linger l;
4713 struct timeval tv;
4714 #if CONFIG_MACF_SOCKET
4715 struct mac extmac;
4716 #endif /* MAC_SOCKET */
4717
4718 if (sopt->sopt_dir != SOPT_SET)
4719 sopt->sopt_dir = SOPT_SET;
4720
4721 if (dolock)
4722 socket_lock(so, 1);
4723
4724 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4725 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4726 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4727 /* the socket has been shutdown, no more sockopt's */
4728 error = EINVAL;
4729 goto out;
4730 }
4731
4732 error = sflt_setsockopt(so, sopt);
4733 if (error != 0) {
4734 if (error == EJUSTRETURN)
4735 error = 0;
4736 goto out;
4737 }
4738
4739 if (sopt->sopt_level != SOL_SOCKET) {
4740 if (so->so_proto != NULL &&
4741 so->so_proto->pr_ctloutput != NULL) {
4742 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4743 goto out;
4744 }
4745 error = ENOPROTOOPT;
4746 } else {
4747 /*
4748 * Allow socket-level (SOL_SOCKET) options to be filtered by
4749 * the protocol layer, if needed. A zero value returned from
4750 * the handler means use default socket-level processing as
4751 * done by the rest of this routine. Otherwise, any other
4752 * return value indicates that the option is unsupported.
4753 */
4754 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4755 pru_socheckopt(so, sopt)) != 0)
4756 goto out;
4757
4758 error = 0;
4759 switch (sopt->sopt_name) {
4760 case SO_LINGER:
4761 case SO_LINGER_SEC:
4762 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4763 if (error != 0)
4764 goto out;
4765
4766 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4767 l.l_linger : l.l_linger * hz;
4768 if (l.l_onoff != 0)
4769 so->so_options |= SO_LINGER;
4770 else
4771 so->so_options &= ~SO_LINGER;
4772 break;
4773
4774 case SO_DEBUG:
4775 case SO_KEEPALIVE:
4776 case SO_DONTROUTE:
4777 case SO_USELOOPBACK:
4778 case SO_BROADCAST:
4779 case SO_REUSEADDR:
4780 case SO_REUSEPORT:
4781 case SO_OOBINLINE:
4782 case SO_TIMESTAMP:
4783 case SO_TIMESTAMP_MONOTONIC:
4784 case SO_DONTTRUNC:
4785 case SO_WANTMORE:
4786 case SO_WANTOOBFLAG:
4787 case SO_NOWAKEFROMSLEEP:
4788 case SO_NOAPNFALLBK:
4789 error = sooptcopyin(sopt, &optval, sizeof (optval),
4790 sizeof (optval));
4791 if (error != 0)
4792 goto out;
4793 if (optval)
4794 so->so_options |= sopt->sopt_name;
4795 else
4796 so->so_options &= ~sopt->sopt_name;
4797 break;
4798
4799 case SO_SNDBUF:
4800 case SO_RCVBUF:
4801 case SO_SNDLOWAT:
4802 case SO_RCVLOWAT:
4803 error = sooptcopyin(sopt, &optval, sizeof (optval),
4804 sizeof (optval));
4805 if (error != 0)
4806 goto out;
4807
4808 /*
4809 * Values < 1 make no sense for any of these
4810 * options, so disallow them.
4811 */
4812 if (optval < 1) {
4813 error = EINVAL;
4814 goto out;
4815 }
4816
4817 switch (sopt->sopt_name) {
4818 case SO_SNDBUF:
4819 case SO_RCVBUF: {
4820 struct sockbuf *sb =
4821 (sopt->sopt_name == SO_SNDBUF) ?
4822 &so->so_snd : &so->so_rcv;
4823 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4824 error = ENOBUFS;
4825 goto out;
4826 }
4827 sb->sb_flags |= SB_USRSIZE;
4828 sb->sb_flags &= ~SB_AUTOSIZE;
4829 sb->sb_idealsize = (u_int32_t)optval;
4830 break;
4831 }
4832 /*
4833 * Make sure the low-water is never greater than
4834 * the high-water.
4835 */
4836 case SO_SNDLOWAT: {
4837 int space = sbspace(&so->so_snd);
4838 u_int32_t hiwat = so->so_snd.sb_hiwat;
4839
4840 if (so->so_snd.sb_flags & SB_UNIX) {
4841 struct unpcb *unp =
4842 (struct unpcb *)(so->so_pcb);
4843 if (unp != NULL &&
4844 unp->unp_conn != NULL) {
4845 hiwat += unp->unp_conn->unp_cc;
4846 }
4847 }
4848
4849 so->so_snd.sb_lowat =
4850 (optval > hiwat) ?
4851 hiwat : optval;
4852
4853 if (space >= so->so_snd.sb_lowat) {
4854 sowwakeup(so);
4855 }
4856 break;
4857 }
4858 case SO_RCVLOWAT: {
4859 int64_t data_len;
4860 so->so_rcv.sb_lowat =
4861 (optval > so->so_rcv.sb_hiwat) ?
4862 so->so_rcv.sb_hiwat : optval;
4863 data_len = so->so_rcv.sb_cc
4864 - so->so_rcv.sb_ctl;
4865 if (data_len >= so->so_rcv.sb_lowat)
4866 sorwakeup(so);
4867 break;
4868 }
4869 }
4870 break;
4871
4872 case SO_SNDTIMEO:
4873 case SO_RCVTIMEO:
4874 error = sooptcopyin_timeval(sopt, &tv);
4875 if (error != 0)
4876 goto out;
4877
4878 switch (sopt->sopt_name) {
4879 case SO_SNDTIMEO:
4880 so->so_snd.sb_timeo = tv;
4881 break;
4882 case SO_RCVTIMEO:
4883 so->so_rcv.sb_timeo = tv;
4884 break;
4885 }
4886 break;
4887
4888 case SO_NKE: {
4889 struct so_nke nke;
4890
4891 error = sooptcopyin(sopt, &nke, sizeof (nke),
4892 sizeof (nke));
4893 if (error != 0)
4894 goto out;
4895
4896 error = sflt_attach_internal(so, nke.nke_handle);
4897 break;
4898 }
4899
4900 case SO_NOSIGPIPE:
4901 error = sooptcopyin(sopt, &optval, sizeof (optval),
4902 sizeof (optval));
4903 if (error != 0)
4904 goto out;
4905 if (optval != 0)
4906 so->so_flags |= SOF_NOSIGPIPE;
4907 else
4908 so->so_flags &= ~SOF_NOSIGPIPE;
4909 break;
4910
4911 case SO_NOADDRERR:
4912 error = sooptcopyin(sopt, &optval, sizeof (optval),
4913 sizeof (optval));
4914 if (error != 0)
4915 goto out;
4916 if (optval != 0)
4917 so->so_flags |= SOF_NOADDRAVAIL;
4918 else
4919 so->so_flags &= ~SOF_NOADDRAVAIL;
4920 break;
4921
4922 case SO_REUSESHAREUID:
4923 error = sooptcopyin(sopt, &optval, sizeof (optval),
4924 sizeof (optval));
4925 if (error != 0)
4926 goto out;
4927 if (optval != 0)
4928 so->so_flags |= SOF_REUSESHAREUID;
4929 else
4930 so->so_flags &= ~SOF_REUSESHAREUID;
4931 break;
4932
4933 case SO_NOTIFYCONFLICT:
4934 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4935 error = EPERM;
4936 goto out;
4937 }
4938 error = sooptcopyin(sopt, &optval, sizeof (optval),
4939 sizeof (optval));
4940 if (error != 0)
4941 goto out;
4942 if (optval != 0)
4943 so->so_flags |= SOF_NOTIFYCONFLICT;
4944 else
4945 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4946 break;
4947
4948 case SO_RESTRICTIONS:
4949 error = sooptcopyin(sopt, &optval, sizeof (optval),
4950 sizeof (optval));
4951 if (error != 0)
4952 goto out;
4953
4954 error = so_set_restrictions(so, optval);
4955 break;
4956
4957 case SO_AWDL_UNRESTRICTED:
4958 if (SOCK_DOM(so) != PF_INET &&
4959 SOCK_DOM(so) != PF_INET6) {
4960 error = EOPNOTSUPP;
4961 goto out;
4962 }
4963 error = sooptcopyin(sopt, &optval, sizeof(optval),
4964 sizeof(optval));
4965 if (error != 0)
4966 goto out;
4967 if (optval != 0) {
4968 error = soopt_cred_check(so,
4969 PRIV_NET_RESTRICTED_AWDL);
4970 if (error == 0)
4971 inp_set_awdl_unrestricted(
4972 sotoinpcb(so));
4973 } else
4974 inp_clear_awdl_unrestricted(sotoinpcb(so));
4975 break;
4976 case SO_INTCOPROC_ALLOW:
4977 if (SOCK_DOM(so) != PF_INET6) {
4978 error = EOPNOTSUPP;
4979 goto out;
4980 }
4981 error = sooptcopyin(sopt, &optval, sizeof(optval),
4982 sizeof(optval));
4983 if (error != 0)
4984 goto out;
4985 if (optval != 0 &&
4986 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
4987 error = soopt_cred_check(so,
4988 PRIV_NET_RESTRICTED_INTCOPROC);
4989 if (error == 0)
4990 inp_set_intcoproc_allowed(
4991 sotoinpcb(so));
4992 } else if (optval == 0)
4993 inp_clear_intcoproc_allowed(sotoinpcb(so));
4994 break;
4995
4996 case SO_LABEL:
4997 #if CONFIG_MACF_SOCKET
4998 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4999 sizeof (extmac))) != 0)
5000 goto out;
5001
5002 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5003 so, &extmac);
5004 #else
5005 error = EOPNOTSUPP;
5006 #endif /* MAC_SOCKET */
5007 break;
5008
5009 case SO_UPCALLCLOSEWAIT:
5010 error = sooptcopyin(sopt, &optval, sizeof (optval),
5011 sizeof (optval));
5012 if (error != 0)
5013 goto out;
5014 if (optval != 0)
5015 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5016 else
5017 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5018 break;
5019
5020 case SO_RANDOMPORT:
5021 error = sooptcopyin(sopt, &optval, sizeof (optval),
5022 sizeof (optval));
5023 if (error != 0)
5024 goto out;
5025 if (optval != 0)
5026 so->so_flags |= SOF_BINDRANDOMPORT;
5027 else
5028 so->so_flags &= ~SOF_BINDRANDOMPORT;
5029 break;
5030
5031 case SO_NP_EXTENSIONS: {
5032 struct so_np_extensions sonpx;
5033
5034 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5035 sizeof (sonpx));
5036 if (error != 0)
5037 goto out;
5038 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5039 error = EINVAL;
5040 goto out;
5041 }
5042 /*
5043 * Only one bit defined for now
5044 */
5045 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5046 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5047 so->so_flags |= SOF_NPX_SETOPTSHUT;
5048 else
5049 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5050 }
5051 break;
5052 }
5053
5054 case SO_TRAFFIC_CLASS: {
5055 error = sooptcopyin(sopt, &optval, sizeof (optval),
5056 sizeof (optval));
5057 if (error != 0)
5058 goto out;
5059 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5060 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5061 error = so_set_net_service_type(so, netsvc);
5062 goto out;
5063 }
5064 error = so_set_traffic_class(so, optval);
5065 if (error != 0)
5066 goto out;
5067 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5068 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5069 break;
5070 }
5071
5072 case SO_RECV_TRAFFIC_CLASS: {
5073 error = sooptcopyin(sopt, &optval, sizeof (optval),
5074 sizeof (optval));
5075 if (error != 0)
5076 goto out;
5077 if (optval == 0)
5078 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5079 else
5080 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5081 break;
5082 }
5083
5084 #if (DEVELOPMENT || DEBUG)
5085 case SO_TRAFFIC_CLASS_DBG: {
5086 struct so_tcdbg so_tcdbg;
5087
5088 error = sooptcopyin(sopt, &so_tcdbg,
5089 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5090 if (error != 0)
5091 goto out;
5092 error = so_set_tcdbg(so, &so_tcdbg);
5093 if (error != 0)
5094 goto out;
5095 break;
5096 }
5097 #endif /* (DEVELOPMENT || DEBUG) */
5098
5099 case SO_PRIVILEGED_TRAFFIC_CLASS:
5100 error = priv_check_cred(kauth_cred_get(),
5101 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5102 if (error != 0)
5103 goto out;
5104 error = sooptcopyin(sopt, &optval, sizeof (optval),
5105 sizeof (optval));
5106 if (error != 0)
5107 goto out;
5108 if (optval == 0)
5109 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5110 else
5111 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5112 break;
5113
5114 case SO_DEFUNCTOK:
5115 error = sooptcopyin(sopt, &optval, sizeof (optval),
5116 sizeof (optval));
5117 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5118 if (error == 0)
5119 error = EBADF;
5120 goto out;
5121 }
5122 /*
5123 * Any process can set SO_DEFUNCTOK (clear
5124 * SOF_NODEFUNCT), but only root can clear
5125 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5126 */
5127 if (optval == 0 &&
5128 kauth_cred_issuser(kauth_cred_get()) == 0) {
5129 error = EPERM;
5130 goto out;
5131 }
5132 if (optval)
5133 so->so_flags &= ~SOF_NODEFUNCT;
5134 else
5135 so->so_flags |= SOF_NODEFUNCT;
5136
5137 if (SOCK_DOM(so) == PF_INET ||
5138 SOCK_DOM(so) == PF_INET6) {
5139 char s[MAX_IPv6_STR_LEN];
5140 char d[MAX_IPv6_STR_LEN];
5141 struct inpcb *inp = sotoinpcb(so);
5142
5143 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5144 "[%s %s:%d -> %s:%d] is now marked "
5145 "as %seligible for "
5146 "defunct\n", __func__, proc_selfpid(),
5147 proc_best_name(current_proc()),
5148 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5149 (SOCK_TYPE(so) == SOCK_STREAM) ?
5150 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5151 ((SOCK_DOM(so) == PF_INET) ?
5152 (void *)&inp->inp_laddr.s_addr :
5153 (void *)&inp->in6p_laddr), s, sizeof (s)),
5154 ntohs(inp->in6p_lport),
5155 inet_ntop(SOCK_DOM(so),
5156 (SOCK_DOM(so) == PF_INET) ?
5157 (void *)&inp->inp_faddr.s_addr :
5158 (void *)&inp->in6p_faddr, d, sizeof (d)),
5159 ntohs(inp->in6p_fport),
5160 (so->so_flags & SOF_NODEFUNCT) ?
5161 "not " : "");
5162 } else {
5163 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5164 "is now marked as %seligible for "
5165 "defunct\n",
5166 __func__, proc_selfpid(),
5167 proc_best_name(current_proc()),
5168 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5169 SOCK_DOM(so), SOCK_TYPE(so),
5170 (so->so_flags & SOF_NODEFUNCT) ?
5171 "not " : "");
5172 }
5173 break;
5174
5175 case SO_ISDEFUNCT:
5176 /* This option is not settable */
5177 error = EINVAL;
5178 break;
5179
5180 case SO_OPPORTUNISTIC:
5181 error = sooptcopyin(sopt, &optval, sizeof (optval),
5182 sizeof (optval));
5183 if (error == 0)
5184 error = so_set_opportunistic(so, optval);
5185 break;
5186
5187 case SO_FLUSH:
5188 /* This option is handled by lower layer(s) */
5189 error = 0;
5190 break;
5191
5192 case SO_RECV_ANYIF:
5193 error = sooptcopyin(sopt, &optval, sizeof (optval),
5194 sizeof (optval));
5195 if (error == 0)
5196 error = so_set_recv_anyif(so, optval);
5197 break;
5198
5199 case SO_TRAFFIC_MGT_BACKGROUND: {
5200 /* This option is handled by lower layer(s) */
5201 error = 0;
5202 break;
5203 }
5204
5205 #if FLOW_DIVERT
5206 case SO_FLOW_DIVERT_TOKEN:
5207 error = flow_divert_token_set(so, sopt);
5208 break;
5209 #endif /* FLOW_DIVERT */
5210
5211
5212 case SO_DELEGATED:
5213 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5214 sizeof (optval))) != 0)
5215 break;
5216
5217 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5218 break;
5219
5220 case SO_DELEGATED_UUID: {
5221 uuid_t euuid;
5222
5223 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5224 sizeof (euuid))) != 0)
5225 break;
5226
5227 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5228 break;
5229 }
5230
5231 #if NECP
5232 case SO_NECP_ATTRIBUTES:
5233 error = necp_set_socket_attributes(so, sopt);
5234 break;
5235 #endif /* NECP */
5236
5237 #if MPTCP
5238 case SO_MPTCP_FASTJOIN:
5239 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5240 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5241 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5242 error = ENOPROTOOPT;
5243 break;
5244 }
5245
5246 error = sooptcopyin(sopt, &optval, sizeof (optval),
5247 sizeof (optval));
5248 if (error != 0)
5249 goto out;
5250 if (optval == 0)
5251 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5252 else
5253 so->so_flags |= SOF_MPTCP_FASTJOIN;
5254 break;
5255 #endif /* MPTCP */
5256
5257 case SO_EXTENDED_BK_IDLE:
5258 error = sooptcopyin(sopt, &optval, sizeof (optval),
5259 sizeof (optval));
5260 if (error == 0)
5261 error = so_set_extended_bk_idle(so, optval);
5262 break;
5263
5264 case SO_MARK_CELLFALLBACK:
5265 error = sooptcopyin(sopt, &optval, sizeof(optval),
5266 sizeof(optval));
5267 if (error != 0)
5268 goto out;
5269 if (optval < 0) {
5270 error = EINVAL;
5271 goto out;
5272 }
5273 if (optval == 0)
5274 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5275 else
5276 so->so_flags1 |= SOF1_CELLFALLBACK;
5277 break;
5278
5279 case SO_NET_SERVICE_TYPE: {
5280 error = sooptcopyin(sopt, &optval, sizeof(optval),
5281 sizeof(optval));
5282 if (error != 0)
5283 goto out;
5284 error = so_set_net_service_type(so, optval);
5285 break;
5286 }
5287
5288 case SO_QOSMARKING_POLICY_OVERRIDE:
5289 error = priv_check_cred(kauth_cred_get(),
5290 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5291 if (error != 0)
5292 goto out;
5293 error = sooptcopyin(sopt, &optval, sizeof(optval),
5294 sizeof(optval));
5295 if (error != 0)
5296 goto out;
5297 if (optval == 0)
5298 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5299 else
5300 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5301 break;
5302
5303 default:
5304 error = ENOPROTOOPT;
5305 break;
5306 }
5307 if (error == 0 && so->so_proto != NULL &&
5308 so->so_proto->pr_ctloutput != NULL) {
5309 (void) so->so_proto->pr_ctloutput(so, sopt);
5310 }
5311 }
5312 out:
5313 if (dolock)
5314 socket_unlock(so, 1);
5315 return (error);
5316 }
5317
5318 /* Helper routines for getsockopt */
5319 int
5320 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5321 {
5322 int error;
5323 size_t valsize;
5324
5325 error = 0;
5326
5327 /*
5328 * Documented get behavior is that we always return a value,
5329 * possibly truncated to fit in the user's buffer.
5330 * Traditional behavior is that we always tell the user
5331 * precisely how much we copied, rather than something useful
5332 * like the total amount we had available for her.
5333 * Note that this interface is not idempotent; the entire answer must
5334 * be generated ahead of time.
5335 */
5336 valsize = min(len, sopt->sopt_valsize);
5337 sopt->sopt_valsize = valsize;
5338 if (sopt->sopt_val != USER_ADDR_NULL) {
5339 if (sopt->sopt_p != kernproc)
5340 error = copyout(buf, sopt->sopt_val, valsize);
5341 else
5342 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5343 }
5344 return (error);
5345 }
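/*
 * Sketch of the matching getsockopt(2) side: a handler below typically
 * replies with
 *
 *	optval = so->so_type;
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 *
 * truncating to the user's buffer if necessary and recording the number of
 * bytes copied in sopt->sopt_valsize.
 */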
5346
5347 static int
5348 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5349 {
5350 int error;
5351 size_t len;
5352 struct user64_timeval tv64;
5353 struct user32_timeval tv32;
5354 const void * val;
5355 size_t valsize;
5356
5357 error = 0;
5358 if (proc_is64bit(sopt->sopt_p)) {
5359 len = sizeof (tv64);
5360 tv64.tv_sec = tv_p->tv_sec;
5361 tv64.tv_usec = tv_p->tv_usec;
5362 val = &tv64;
5363 } else {
5364 len = sizeof (tv32);
5365 tv32.tv_sec = tv_p->tv_sec;
5366 tv32.tv_usec = tv_p->tv_usec;
5367 val = &tv32;
5368 }
5369 valsize = min(len, sopt->sopt_valsize);
5370 sopt->sopt_valsize = valsize;
5371 if (sopt->sopt_val != USER_ADDR_NULL) {
5372 if (sopt->sopt_p != kernproc)
5373 error = copyout(val, sopt->sopt_val, valsize);
5374 else
5375 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5376 }
5377 return (error);
5378 }
5379
5380 /*
5381 * Return: 0 Success
5382 * ENOPROTOOPT
5383 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5384 * <pr_ctloutput>:???
5385 * <sf_getoption>:???
5386 */
5387 int
5388 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5389 {
5390 int error, optval;
5391 struct linger l;
5392 struct timeval tv;
5393 #if CONFIG_MACF_SOCKET
5394 struct mac extmac;
5395 #endif /* MAC_SOCKET */
5396
5397 if (sopt->sopt_dir != SOPT_GET)
5398 sopt->sopt_dir = SOPT_GET;
5399
5400 if (dolock)
5401 socket_lock(so, 1);
5402
5403 error = sflt_getsockopt(so, sopt);
5404 if (error != 0) {
5405 if (error == EJUSTRETURN)
5406 error = 0;
5407 goto out;
5408 }
5409
5410 if (sopt->sopt_level != SOL_SOCKET) {
5411 if (so->so_proto != NULL &&
5412 so->so_proto->pr_ctloutput != NULL) {
5413 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5414 goto out;
5415 }
5416 error = ENOPROTOOPT;
5417 } else {
5418 /*
5419 * Allow socket-level (SOL_SOCKET) options to be filtered by
5420 * the protocol layer, if needed. A zero value returned from
5421 * the handler means use default socket-level processing as
5422 * done by the rest of this routine. Otherwise, any other
5423 * return value indicates that the option is unsupported.
5424 */
5425 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5426 pru_socheckopt(so, sopt)) != 0)
5427 goto out;
5428
5429 error = 0;
5430 switch (sopt->sopt_name) {
5431 case SO_LINGER:
5432 case SO_LINGER_SEC:
5433 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5434 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5435 so->so_linger : so->so_linger / hz;
5436 error = sooptcopyout(sopt, &l, sizeof (l));
5437 break;
5438
5439 case SO_USELOOPBACK:
5440 case SO_DONTROUTE:
5441 case SO_DEBUG:
5442 case SO_KEEPALIVE:
5443 case SO_REUSEADDR:
5444 case SO_REUSEPORT:
5445 case SO_BROADCAST:
5446 case SO_OOBINLINE:
5447 case SO_TIMESTAMP:
5448 case SO_TIMESTAMP_MONOTONIC:
5449 case SO_DONTTRUNC:
5450 case SO_WANTMORE:
5451 case SO_WANTOOBFLAG:
5452 case SO_NOWAKEFROMSLEEP:
5453 case SO_NOAPNFALLBK:
5454 optval = so->so_options & sopt->sopt_name;
5455 integer:
5456 error = sooptcopyout(sopt, &optval, sizeof (optval));
5457 break;
5458
5459 case SO_TYPE:
5460 optval = so->so_type;
5461 goto integer;
5462
5463 case SO_NREAD:
5464 if (so->so_proto->pr_flags & PR_ATOMIC) {
5465 int pkt_total;
5466 struct mbuf *m1;
5467
5468 pkt_total = 0;
5469 m1 = so->so_rcv.sb_mb;
5470 while (m1 != NULL) {
5471 if (m1->m_type == MT_DATA ||
5472 m1->m_type == MT_HEADER ||
5473 m1->m_type == MT_OOBDATA)
5474 pkt_total += m1->m_len;
5475 m1 = m1->m_next;
5476 }
5477 optval = pkt_total;
5478 } else {
5479 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5480 }
5481 goto integer;
5482
5483 case SO_NUMRCVPKT:
5484 if (so->so_proto->pr_flags & PR_ATOMIC) {
5485 int cnt = 0;
5486 struct mbuf *m1;
5487
5488 m1 = so->so_rcv.sb_mb;
5489 while (m1 != NULL) {
5490 if (m1->m_type == MT_DATA ||
5491 m1->m_type == MT_HEADER ||
5492 m1->m_type == MT_OOBDATA)
5493 cnt += 1;
5494 m1 = m1->m_nextpkt;
5495 }
5496 optval = cnt;
5497 goto integer;
5498 } else {
5499 error = EINVAL;
5500 break;
5501 }
5502
5503 case SO_NWRITE:
5504 optval = so->so_snd.sb_cc;
5505 goto integer;
5506
5507 case SO_ERROR:
5508 optval = so->so_error;
5509 so->so_error = 0;
5510 goto integer;
5511
5512 case SO_SNDBUF: {
5513 u_int32_t hiwat = so->so_snd.sb_hiwat;
5514
5515 if (so->so_snd.sb_flags & SB_UNIX) {
5516 struct unpcb *unp =
5517 (struct unpcb *)(so->so_pcb);
5518 if (unp != NULL && unp->unp_conn != NULL) {
5519 hiwat += unp->unp_conn->unp_cc;
5520 }
5521 }
5522
5523 optval = hiwat;
5524 goto integer;
5525 }
5526 case SO_RCVBUF:
5527 optval = so->so_rcv.sb_hiwat;
5528 goto integer;
5529
5530 case SO_SNDLOWAT:
5531 optval = so->so_snd.sb_lowat;
5532 goto integer;
5533
5534 case SO_RCVLOWAT:
5535 optval = so->so_rcv.sb_lowat;
5536 goto integer;
5537
5538 case SO_SNDTIMEO:
5539 case SO_RCVTIMEO:
5540 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5541 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5542
5543 error = sooptcopyout_timeval(sopt, &tv);
5544 break;
5545
5546 case SO_NOSIGPIPE:
5547 optval = (so->so_flags & SOF_NOSIGPIPE);
5548 goto integer;
5549
5550 case SO_NOADDRERR:
5551 optval = (so->so_flags & SOF_NOADDRAVAIL);
5552 goto integer;
5553
5554 case SO_REUSESHAREUID:
5555 optval = (so->so_flags & SOF_REUSESHAREUID);
5556 goto integer;
5557
5558
5559 case SO_NOTIFYCONFLICT:
5560 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5561 goto integer;
5562
5563 case SO_RESTRICTIONS:
5564 optval = so_get_restrictions(so);
5565 goto integer;
5566
5567 case SO_AWDL_UNRESTRICTED:
5568 if (SOCK_DOM(so) == PF_INET ||
5569 SOCK_DOM(so) == PF_INET6) {
5570 optval = inp_get_awdl_unrestricted(
5571 sotoinpcb(so));
5572 goto integer;
5573 } else
5574 error = EOPNOTSUPP;
5575 break;
5576
5577 case SO_INTCOPROC_ALLOW:
5578 if (SOCK_DOM(so) == PF_INET6) {
5579 optval = inp_get_intcoproc_allowed(
5580 sotoinpcb(so));
5581 goto integer;
5582 } else
5583 error = EOPNOTSUPP;
5584 break;
5585
5586 case SO_LABEL:
5587 #if CONFIG_MACF_SOCKET
5588 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5589 sizeof (extmac))) != 0 ||
5590 (error = mac_socket_label_get(proc_ucred(
5591 sopt->sopt_p), so, &extmac)) != 0)
5592 break;
5593
5594 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5595 #else
5596 error = EOPNOTSUPP;
5597 #endif /* MAC_SOCKET */
5598 break;
5599
5600 case SO_PEERLABEL:
5601 #if CONFIG_MACF_SOCKET
5602 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5603 sizeof (extmac))) != 0 ||
5604 (error = mac_socketpeer_label_get(proc_ucred(
5605 sopt->sopt_p), so, &extmac)) != 0)
5606 break;
5607
5608 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5609 #else
5610 error = EOPNOTSUPP;
5611 #endif /* MAC_SOCKET */
5612 break;
5613
5614 #ifdef __APPLE_API_PRIVATE
5615 case SO_UPCALLCLOSEWAIT:
5616 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5617 goto integer;
5618 #endif
5619 case SO_RANDOMPORT:
5620 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5621 goto integer;
5622
5623 case SO_NP_EXTENSIONS: {
5624 struct so_np_extensions sonpx;
5625
5626 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5627 SONPX_SETOPTSHUT : 0;
5628 sonpx.npx_mask = SONPX_MASK_VALID;
5629
5630 error = sooptcopyout(sopt, &sonpx,
5631 sizeof (struct so_np_extensions));
5632 break;
5633 }
5634
5635 case SO_TRAFFIC_CLASS:
5636 optval = so->so_traffic_class;
5637 goto integer;
5638
5639 case SO_RECV_TRAFFIC_CLASS:
5640 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5641 goto integer;
5642
5643 case SO_TRAFFIC_CLASS_STATS:
5644 error = sooptcopyout(sopt, &so->so_tc_stats,
5645 sizeof (so->so_tc_stats));
5646 break;
5647
5648 #if (DEVELOPMENT || DEBUG)
5649 case SO_TRAFFIC_CLASS_DBG:
5650 error = sogetopt_tcdbg(so, sopt);
5651 break;
5652 #endif /* (DEVELOPMENT || DEBUG) */
5653
5654 case SO_PRIVILEGED_TRAFFIC_CLASS:
5655 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5656 goto integer;
5657
5658 case SO_DEFUNCTOK:
5659 optval = !(so->so_flags & SOF_NODEFUNCT);
5660 goto integer;
5661
5662 case SO_ISDEFUNCT:
5663 optval = (so->so_flags & SOF_DEFUNCT);
5664 goto integer;
5665
5666 case SO_OPPORTUNISTIC:
5667 optval = so_get_opportunistic(so);
5668 goto integer;
5669
5670 case SO_FLUSH:
5671 /* This option is not gettable */
5672 error = EINVAL;
5673 break;
5674
5675 case SO_RECV_ANYIF:
5676 optval = so_get_recv_anyif(so);
5677 goto integer;
5678
5679 case SO_TRAFFIC_MGT_BACKGROUND:
5680 /* This option is handled by lower layer(s) */
5681 if (so->so_proto != NULL &&
5682 so->so_proto->pr_ctloutput != NULL) {
5683 (void) so->so_proto->pr_ctloutput(so, sopt);
5684 }
5685 break;
5686
5687 #if FLOW_DIVERT
5688 case SO_FLOW_DIVERT_TOKEN:
5689 error = flow_divert_token_get(so, sopt);
5690 break;
5691 #endif /* FLOW_DIVERT */
5692
5693 #if NECP
5694 case SO_NECP_ATTRIBUTES:
5695 error = necp_get_socket_attributes(so, sopt);
5696 break;
5697 #endif /* NECP */
5698
5699 #if CONTENT_FILTER
5700 case SO_CFIL_SOCK_ID: {
5701 cfil_sock_id_t sock_id;
5702
5703 sock_id = cfil_sock_id_from_socket(so);
5704
5705 error = sooptcopyout(sopt, &sock_id,
5706 sizeof(cfil_sock_id_t));
5707 break;
5708 }
5709 #endif /* CONTENT_FILTER */
5710
5711 #if MPTCP
5712 case SO_MPTCP_FASTJOIN:
5713 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5714 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5715 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5716 error = ENOPROTOOPT;
5717 break;
5718 }
5719 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5720 /* Fixed along with rdar://19391339 */
5721 goto integer;
5722 #endif /* MPTCP */
5723
5724 case SO_EXTENDED_BK_IDLE:
5725 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5726 goto integer;
5727 case SO_MARK_CELLFALLBACK:
5728 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5729 ? 1 : 0;
5730 goto integer;
5731 case SO_NET_SERVICE_TYPE: {
5732 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5733 optval = so->so_netsvctype;
5734 else
5735 optval = NET_SERVICE_TYPE_BE;
5736 goto integer;
5737 }
5738 case SO_NETSVC_MARKING_LEVEL:
5739 optval = so_get_netsvc_marking_level(so);
5740 goto integer;
5741
5742 default:
5743 error = ENOPROTOOPT;
5744 break;
5745 }
5746 }
5747 out:
5748 if (dolock)
5749 socket_unlock(so, 1);
5750 return (error);
5751 }
5752
5753 /*
5754 * The size limits on our soopt_getm are different from those on FreeBSD.
5755 * We limit the size of options to MCLBYTES. This will have to change
5756 * if we need to define options that need more space than MCLBYTES.
5757 */
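/*
 * Intended usage, per the ip6_sooptmcopyin() reference below: soopt_getm()
 * sizes an mbuf chain to sopt_valsize, soopt_mcopyin() fills that chain from
 * the caller's buffer, and soopt_mcopyout() copies a result chain back out,
 * updating sopt_valsize to the amount actually returned.
 */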
5758 int
5759 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5760 {
5761 struct mbuf *m, *m_prev;
5762 int sopt_size = sopt->sopt_valsize;
5763 int how;
5764
5765 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5766 return (EMSGSIZE);
5767
5768 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5769 MGET(m, how, MT_DATA);
5770 if (m == NULL)
5771 return (ENOBUFS);
5772 if (sopt_size > MLEN) {
5773 MCLGET(m, how);
5774 if ((m->m_flags & M_EXT) == 0) {
5775 m_free(m);
5776 return (ENOBUFS);
5777 }
5778 m->m_len = min(MCLBYTES, sopt_size);
5779 } else {
5780 m->m_len = min(MLEN, sopt_size);
5781 }
5782 sopt_size -= m->m_len;
5783 *mp = m;
5784 m_prev = m;
5785
5786 while (sopt_size > 0) {
5787 MGET(m, how, MT_DATA);
5788 if (m == NULL) {
5789 m_freem(*mp);
5790 return (ENOBUFS);
5791 }
5792 if (sopt_size > MLEN) {
5793 MCLGET(m, how);
5794 if ((m->m_flags & M_EXT) == 0) {
5795 m_freem(*mp);
5796 m_freem(m);
5797 return (ENOBUFS);
5798 }
5799 m->m_len = min(MCLBYTES, sopt_size);
5800 } else {
5801 m->m_len = min(MLEN, sopt_size);
5802 }
5803 sopt_size -= m->m_len;
5804 m_prev->m_next = m;
5805 m_prev = m;
5806 }
5807 return (0);
5808 }
5809
5810 /* copyin sopt data into mbuf chain */
5811 int
5812 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5813 {
5814 struct mbuf *m0 = m;
5815
5816 if (sopt->sopt_val == USER_ADDR_NULL)
5817 return (0);
5818 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5819 if (sopt->sopt_p != kernproc) {
5820 int error;
5821
5822 error = copyin(sopt->sopt_val, mtod(m, char *),
5823 m->m_len);
5824 if (error != 0) {
5825 m_freem(m0);
5826 return (error);
5827 }
5828 } else {
5829 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5830 mtod(m, char *), m->m_len);
5831 }
5832 sopt->sopt_valsize -= m->m_len;
5833 sopt->sopt_val += m->m_len;
5834 m = m->m_next;
5835 }
5836 /* enough space should have been allocated by the caller (see ip6_sooptmcopyin()) */
5837 if (m != NULL) {
5838 panic("soopt_mcopyin");
5839 /* NOTREACHED */
5840 }
5841 return (0);
5842 }
5843
5844 /* copyout mbuf chain data into soopt */
5845 int
5846 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5847 {
5848 struct mbuf *m0 = m;
5849 size_t valsize = 0;
5850
5851 if (sopt->sopt_val == USER_ADDR_NULL)
5852 return (0);
5853 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5854 if (sopt->sopt_p != kernproc) {
5855 int error;
5856
5857 error = copyout(mtod(m, char *), sopt->sopt_val,
5858 m->m_len);
5859 if (error != 0) {
5860 m_freem(m0);
5861 return (error);
5862 }
5863 } else {
5864 bcopy(mtod(m, char *),
5865 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5866 }
5867 sopt->sopt_valsize -= m->m_len;
5868 sopt->sopt_val += m->m_len;
5869 valsize += m->m_len;
5870 m = m->m_next;
5871 }
5872 if (m != NULL) {
5873 /* the user-supplied sockopt buffer should have been large enough */
5874 m_freem(m0);
5875 return (EINVAL);
5876 }
5877 sopt->sopt_valsize = valsize;
5878 return (0);
5879 }
5880
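/*
 * Out-of-band data has arrived: signal the owning process or process group
 * with SIGURG, wake any select()/poll() waiters on the receive buffer, and
 * post NOTE_OOB to attached knotes.
 */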
5881 void
5882 sohasoutofband(struct socket *so)
5883 {
5884 if (so->so_pgid < 0)
5885 gsignal(-so->so_pgid, SIGURG);
5886 else if (so->so_pgid > 0)
5887 proc_signal(so->so_pgid, SIGURG);
5888 selwakeup(&so->so_rcv.sb_sel);
5889 if (so->so_rcv.sb_flags & SB_KNOTE) {
5890 KNOTE(&so->so_rcv.sb_sel.si_note,
5891 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5892 }
5893 }
5894
5895 int
5896 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5897 {
5898 #pragma unused(cred)
5899 struct proc *p = current_proc();
5900 int revents = 0;
5901
5902 socket_lock(so, 1);
5903 so_update_last_owner_locked(so, PROC_NULL);
5904 so_update_policy(so);
5905
5906 if (events & (POLLIN | POLLRDNORM))
5907 if (soreadable(so))
5908 revents |= events & (POLLIN | POLLRDNORM);
5909
5910 if (events & (POLLOUT | POLLWRNORM))
5911 if (sowriteable(so))
5912 revents |= events & (POLLOUT | POLLWRNORM);
5913
5914 if (events & (POLLPRI | POLLRDBAND))
5915 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5916 revents |= events & (POLLPRI | POLLRDBAND);
5917
5918 if (revents == 0) {
5919 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5920 /*
5921 * Darwin sets the flag first,
5922 * BSD calls selrecord first
5923 */
5924 so->so_rcv.sb_flags |= SB_SEL;
5925 selrecord(p, &so->so_rcv.sb_sel, wql);
5926 }
5927
5928 if (events & (POLLOUT | POLLWRNORM)) {
5929 /*
5930 * Darwin sets the flag first,
5931 * BSD calls selrecord first
5932 */
5933 so->so_snd.sb_flags |= SB_SEL;
5934 selrecord(p, &so->so_snd.sb_sel, wql);
5935 }
5936 }
5937
5938 socket_unlock(so, 1);
5939 return (revents);
5940 }
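/*
 * Illustrative only (assumption): a user thread reaches sopoll() via
 * poll(2), e.g.
 *
 *	struct pollfd pfd = { .fd = sockfd, .events = POLLIN | POLLPRI };
 *	(void) poll(&pfd, 1, 1000);
 *
 * POLLPRI is reported while urgent (out-of-band) data is pending, per the
 * so_oobmark / SS_RCVATMARK check above.
 */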
5941
5942 int
5943 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5944 {
5945 #pragma unused(fp)
5946 #if !CONFIG_MACF_SOCKET
5947 #pragma unused(ctx)
5948 #endif /* MAC_SOCKET */
5949 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5950 int result;
5951
5952 socket_lock(so, 1);
5953 so_update_last_owner_locked(so, PROC_NULL);
5954 so_update_policy(so);
5955
5956 #if CONFIG_MACF_SOCKET
5957 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5958 kn, so) != 0) {
5959 socket_unlock(so, 1);
5960 kn->kn_flags = EV_ERROR;
5961 kn->kn_data = EPERM;
5962 return 0;
5963 }
5964 #endif /* MAC_SOCKET */
5965
5966 switch (kn->kn_filter) {
5967 case EVFILT_READ:
5968 kn->kn_filtid = EVFILTID_SOREAD;
5969 break;
5970 case EVFILT_WRITE:
5971 kn->kn_filtid = EVFILTID_SOWRITE;
5972 break;
5973 case EVFILT_SOCK:
5974 kn->kn_filtid = EVFILTID_SCK;
5975 break;
5976 case EVFILT_EXCEPT:
5977 kn->kn_filtid = EVFILTID_SOEXCEPT;
5978 break;
5979 default:
5980 socket_unlock(so, 1);
5981 kn->kn_flags = EV_ERROR;
5982 kn->kn_data = EINVAL;
5983 return 0;
5984 }
5985
5986 /*
5987 * call the appropriate sub-filter attach
5988 * with the socket still locked
5989 */
5990 result = knote_fops(kn)->f_attach(kn);
5991
5992 socket_unlock(so, 1);
5993
5994 return result;
5995 }
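/*
 * Illustrative only (assumption): this attach path runs when a process
 * registers a socket with kqueue, e.g.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, sockfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * EVFILT_READ is routed to the EVFILTID_SOREAD sub-filter selected above.
 */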
5996
5997 static int
5998 filt_soread_common(struct knote *kn, struct socket *so)
5999 {
6000 if (so->so_options & SO_ACCEPTCONN) {
6001 int is_not_empty;
6002
6003 /*
6004 * Radar 6615193: handle the listen case dynamically for the
6005 * kqueue read filter. This allows listen() to be called after
6006 * the EVFILT_READ knote has been registered.
6007 */
6008
6009 kn->kn_data = so->so_qlen;
6010 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6011
6012 return (is_not_empty);
6013 }
6014
6015 /* socket isn't a listener */
6016 /*
6017 * NOTE_LOWAT specifies new low water mark in data, i.e.
6018 * the bytes of protocol data. We therefore exclude any
6019 * control bytes.
6020 */
6021 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6022
6023 if (kn->kn_sfflags & NOTE_OOB) {
6024 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6025 kn->kn_fflags |= NOTE_OOB;
6026 kn->kn_data -= so->so_oobmark;
6027 return (1);
6028 }
6029 }
6030
6031 if ((so->so_state & SS_CANTRCVMORE)
6032 #if CONTENT_FILTER
6033 && cfil_sock_data_pending(&so->so_rcv) == 0
6034 #endif /* CONTENT_FILTER */
6035 ) {
6036 kn->kn_flags |= EV_EOF;
6037 kn->kn_fflags = so->so_error;
6038 return (1);
6039 }
6040
6041 if (so->so_error) { /* temporary udp error */
6042 return (1);
6043 }
6044
6045 int64_t lowwat = so->so_rcv.sb_lowat;
6046 /*
6047 * Ensure that when NOTE_LOWAT is used, the derived
6048 * low water mark is bounded by the receive buffer's
6049 * high and low water marks.
6050 */
6051 if (kn->kn_sfflags & NOTE_LOWAT) {
6052 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6053 lowwat = so->so_rcv.sb_hiwat;
6054 else if (kn->kn_sdata > lowwat)
6055 lowwat = kn->kn_sdata;
6056 }
6057
6058 /*
6059 * The order below is important. Since NOTE_LOWAT
6060 * overrides sb_lowat, check for NOTE_LOWAT case
6061 * first.
6062 */
6063 if (kn->kn_sfflags & NOTE_LOWAT)
6064 return (kn->kn_data >= lowwat);
6065
6066 return (so->so_rcv.sb_cc >= lowwat);
6067 }
6068
6069 static int
6070 filt_sorattach(struct knote *kn)
6071 {
6072 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6073
6074 /* socket locked */
6075
6076 /*
6077 * If the caller explicitly asked for OOB results (e.g. poll())
6078 * from EVFILT_READ, then save that off in the hookid field
6079 * and reserve the kn_flags EV_OOBAND bit for output only.
6080 */
6081 if (kn->kn_filter == EVFILT_READ &&
6082 kn->kn_flags & EV_OOBAND) {
6083 kn->kn_flags &= ~EV_OOBAND;
6084 kn->kn_hookid = EV_OOBAND;
6085 } else {
6086 kn->kn_hookid = 0;
6087 }
6088 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6089 so->so_rcv.sb_flags |= SB_KNOTE;
6090
6091 /* indicate if event is already fired */
6092 return filt_soread_common(kn, so);
6093 }
6094
6095 static void
6096 filt_sordetach(struct knote *kn)
6097 {
6098 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6099
6100 socket_lock(so, 1);
6101 if (so->so_rcv.sb_flags & SB_KNOTE)
6102 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6103 so->so_rcv.sb_flags &= ~SB_KNOTE;
6104 socket_unlock(so, 1);
6105 }
6106
6107 /*ARGSUSED*/
6108 static int
6109 filt_soread(struct knote *kn, long hint)
6110 {
6111 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6112 int retval;
6113
6114 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6115 socket_lock(so, 1);
6116
6117 retval = filt_soread_common(kn, so);
6118
6119 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6120 socket_unlock(so, 1);
6121
6122 return retval;
6123 }
6124
6125 static int
6126 filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6127 {
6128 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6129 int retval;
6130
6131 socket_lock(so, 1);
6132
6133 /* save off the new input fflags and data */
6134 kn->kn_sfflags = kev->fflags;
6135 kn->kn_sdata = kev->data;
6136 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6137 kn->kn_udata = kev->udata;
6138
6139 /* determine if changes result in fired events */
6140 retval = filt_soread_common(kn, so);
6141
6142 socket_unlock(so, 1);
6143
6144 return retval;
6145 }
6146
6147 static int
6148 filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6149 {
6150 #pragma unused(data)
6151 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6152 int retval;
6153
6154 socket_lock(so, 1);
6155 retval = filt_soread_common(kn, so);
6156 if (retval) {
6157 *kev = kn->kn_kevent;
6158 if (kn->kn_flags & EV_CLEAR) {
6159 kn->kn_fflags = 0;
6160 kn->kn_data = 0;
6161 }
6162 }
6163 socket_unlock(so, 1);
6164
6165 return retval;
6166 }
6167
6168 int
6169 so_wait_for_if_feedback(struct socket *so)
6170 {
6171 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6172 (so->so_state & SS_ISCONNECTED)) {
6173 struct inpcb *inp = sotoinpcb(so);
6174 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6175 return (1);
6176 }
6177 return (0);
6178 }
6179
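/*
 * Common trigger test for the write filter.  Fires when the socket
 * can no longer send (EOF/error), when the socket accepts data before
 * the connection completes (SOF1_PRECONNECT_DATA), or when send
 * buffer space reaches the effective low water mark; with
 * SOF_NOTSENT_LOWAT set, TCP/MPTCP sockets defer to the protocol's
 * not-sent low-water check.  The event is suppressed while waiting
 * for interface feedback.
 */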
6180 static int
6181 filt_sowrite_common(struct knote *kn, struct socket *so)
6182 {
6183 int ret = 0;
6184
6185 kn->kn_data = sbspace(&so->so_snd);
6186 if (so->so_state & SS_CANTSENDMORE) {
6187 kn->kn_flags |= EV_EOF;
6188 kn->kn_fflags = so->so_error;
6189 return 1;
6190 }
6191 if (so->so_error) { /* temporary udp error */
6192 return 1;
6193 }
6194 if (!socanwrite(so)) {
6195 return 0;
6196 }
6197 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6198 return 1;
6199 }
6200 int64_t lowwat = so->so_snd.sb_lowat;
6201 if (kn->kn_sfflags & NOTE_LOWAT) {
6202 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6203 lowwat = so->so_snd.sb_hiwat;
6204 else if (kn->kn_sdata > lowwat)
6205 lowwat = kn->kn_sdata;
6206 }
6207 if (kn->kn_data >= lowwat) {
6208 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6209 #if (DEBUG || DEVELOPMENT)
6210 && so_notsent_lowat_check == 1
6211 #endif /* DEBUG || DEVELOPMENT */
6212 ) {
6213 if ((SOCK_DOM(so) == PF_INET ||
6214 SOCK_DOM(so) == PF_INET6) &&
6215 so->so_type == SOCK_STREAM) {
6216 ret = tcp_notsent_lowat_check(so);
6217 }
6218 #if MPTCP
6219 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6220 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6221 ret = mptcp_notsent_lowat_check(so);
6222 }
6223 #endif
6224 else {
6225 return 1;
6226 }
6227 } else {
6228 ret = 1;
6229 }
6230 }
6231 if (so_wait_for_if_feedback(so))
6232 ret = 0;
6233 return (ret);
6234 }
6235
6236 static int
6237 filt_sowattach(struct knote *kn)
6238 {
6239 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6240
6241 /* socket locked */
6242 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6243 so->so_snd.sb_flags |= SB_KNOTE;
6244
6245 /* determine if it's already fired */
6246 return filt_sowrite_common(kn, so);
6247 }
6248
6249 static void
6250 filt_sowdetach(struct knote *kn)
6251 {
6252 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6253 socket_lock(so, 1);
6254
6255 if (so->so_snd.sb_flags & SB_KNOTE)
6256 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6257 so->so_snd.sb_flags &= ~SB_KNOTE;
6258 socket_unlock(so, 1);
6259 }
6260
6261 /*ARGSUSED*/
6262 static int
6263 filt_sowrite(struct knote *kn, long hint)
6264 {
6265 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6266 int ret;
6267
6268 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6269 socket_lock(so, 1);
6270
6271 ret = filt_sowrite_common(kn, so);
6272
6273 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6274 socket_unlock(so, 1);
6275
6276 return ret;
6277 }
6278
6279 static int
6280 filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6281 {
6282 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6283 int ret;
6284
6285 socket_lock(so, 1);
6286
6287 /* save off the new input fflags and data */
6288 kn->kn_sfflags = kev->fflags;
6289 kn->kn_sdata = kev->data;
6290 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6291 kn->kn_udata = kev->udata;
6292
6293 /* determine if these changes result in a triggered event */
6294 ret = filt_sowrite_common(kn, so);
6295
6296 socket_unlock(so, 1);
6297
6298 return ret;
6299 }
6300
6301 static int
6302 filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6303 {
6304 #pragma unused(data)
6305 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6306 int ret;
6307
6308 socket_lock(so, 1);
6309 ret = filt_sowrite_common(kn, so);
6310 if (ret) {
6311 *kev = kn->kn_kevent;
6312 if (kn->kn_flags & EV_CLEAR) {
6313 kn->kn_fflags = 0;
6314 kn->kn_data = 0;
6315 }
6316 }
6317 socket_unlock(so, 1);
6318 return ret;
6319 }
6320
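/*
 * Common trigger test for EVFILT_SOCK.  Translate SO_FILT_HINT_*
 * hints and the current socket state into NOTE_* fflags.  Events that
 * reflect a persistent state (connected/disconnected, read/write
 * closed, suspend/resume) are level-triggered: kn_hookid remembers
 * which of them have already been delivered so they are not reported
 * again until the state changes.
 */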
6321 static int
6322 filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6323 {
6324 int ret = 0;
6325 uint32_t level_trigger = 0;
6326
6327 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6328 kn->kn_fflags |= NOTE_CONNRESET;
6329 }
6330 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6331 kn->kn_fflags |= NOTE_TIMEOUT;
6332 }
6333 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6334 kn->kn_fflags |= NOTE_NOSRCADDR;
6335 }
6336 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6337 kn->kn_fflags |= NOTE_IFDENIED;
6338 }
6339 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6340 kn->kn_fflags |= NOTE_KEEPALIVE;
6341 }
6342 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6343 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6344 }
6345 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6346 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6347 }
6348 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6349 (so->so_state & SS_ISCONNECTED)) {
6350 kn->kn_fflags |= NOTE_CONNECTED;
6351 level_trigger |= NOTE_CONNECTED;
6352 }
6353 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6354 (so->so_state & SS_ISDISCONNECTED)) {
6355 kn->kn_fflags |= NOTE_DISCONNECTED;
6356 level_trigger |= NOTE_DISCONNECTED;
6357 }
6358 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6359 if (so->so_proto != NULL &&
6360 (so->so_proto->pr_flags & PR_EVCONNINFO))
6361 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6362 }
6363
6364 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6365 tcp_notify_ack_active(so)) {
6366 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6367 }
6368
6369 if ((so->so_state & SS_CANTRCVMORE)
6370 #if CONTENT_FILTER
6371 && cfil_sock_data_pending(&so->so_rcv) == 0
6372 #endif /* CONTENT_FILTER */
6373 ) {
6374 kn->kn_fflags |= NOTE_READCLOSED;
6375 level_trigger |= NOTE_READCLOSED;
6376 }
6377
6378 if (so->so_state & SS_CANTSENDMORE) {
6379 kn->kn_fflags |= NOTE_WRITECLOSED;
6380 level_trigger |= NOTE_WRITECLOSED;
6381 }
6382
6383 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6384 (so->so_flags & SOF_SUSPENDED)) {
6385 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6386
6387 /* If resume event was delivered before, reset it */
6388 kn->kn_hookid &= ~NOTE_RESUME;
6389
6390 kn->kn_fflags |= NOTE_SUSPEND;
6391 level_trigger |= NOTE_SUSPEND;
6392 }
6393
6394 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6395 (so->so_flags & SOF_SUSPENDED) == 0) {
6396 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6397
6398 /* If suspend event was delivered before, reset it */
6399 kn->kn_hookid &= ~NOTE_SUSPEND;
6400
6401 kn->kn_fflags |= NOTE_RESUME;
6402 level_trigger |= NOTE_RESUME;
6403 }
6404
6405 if (so->so_error != 0) {
6406 ret = 1;
6407 kn->kn_data = so->so_error;
6408 kn->kn_flags |= EV_EOF;
6409 } else {
6410 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6411 }
6412
6413 /* Reset any events that are not requested on this knote */
6414 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6415 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6416
6417 /* Find the level-triggered events that are already delivered */
6418 level_trigger &= kn->kn_hookid;
6419 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6420
6421 /* Do not deliver level-triggered events more than once */
6422 if ((kn->kn_fflags & ~level_trigger) != 0)
6423 ret = 1;
6424
6425 return (ret);
6426 }
6427
6428 static int
6429 filt_sockattach(struct knote *kn)
6430 {
6431 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6432
6433 /* socket locked */
6434 kn->kn_hookid = 0;
6435 if (KNOTE_ATTACH(&so->so_klist, kn))
6436 so->so_flags |= SOF_KNOTE;
6437
6438 /* determine if event already fired */
6439 return filt_sockev_common(kn, so, 0);
6440 }
6441
6442 static void
6443 filt_sockdetach(struct knote *kn)
6444 {
6445 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6446 socket_lock(so, 1);
6447
6448 if ((so->so_flags & SOF_KNOTE) != 0)
6449 if (KNOTE_DETACH(&so->so_klist, kn))
6450 so->so_flags &= ~SOF_KNOTE;
6451 socket_unlock(so, 1);
6452 }
6453
6454 static int
6455 filt_sockev(struct knote *kn, long hint)
6456 {
6457 int ret = 0, locked = 0;
6458 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6459 long ev_hint = (hint & SO_FILT_HINT_EV);
6460
6461 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6462 socket_lock(so, 1);
6463 locked = 1;
6464 }
6465
6466 ret = filt_sockev_common(kn, so, ev_hint);
6467
6468 if (locked)
6469 socket_unlock(so, 1);
6470
6471 return ret;
6472 }
6473
6474
6475
6476 /*
6477 * filt_socktouch - update event state
6478 */
6479 static int
6480 filt_socktouch(
6481 struct knote *kn,
6482 struct kevent_internal_s *kev)
6483 {
6484 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6485 uint32_t changed_flags;
6486 int ret;
6487
6488 socket_lock(so, 1);
6489
6490 /* compare the old interest set against the delivered-event state */
6491 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6492
6493 /* save off the new input fflags and data */
6494 kn->kn_sfflags = kev->fflags;
6495 kn->kn_sdata = kev->data;
6496 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
6497 kn->kn_udata = kev->udata;
6498
6499 /* restrict the current results to the (smaller?) set of new interest */
6500 /*
6501 * For compatibility with previous implementations, we leave kn_fflags
6502 * as they were before.
6503 */
6504 //kn->kn_fflags &= kev->fflags;
6505
6506 /*
6507 * Since we keep track of events that are already
6508 * delivered, if any of those events are not requested
6509 * anymore the state related to them can be reset
6510 */
6511 kn->kn_hookid &=
6512 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6513
6514 /* determine if we have events to deliver */
6515 ret = filt_sockev_common(kn, so, 0);
6516
6517 socket_unlock(so, 1);
6518
6519 return ret;
6520 }
6521
6522 /*
6523 * filt_sockprocess - query event fired state and return data
6524 */
6525 static int
6526 filt_sockprocess(
6527 struct knote *kn,
6528 struct filt_process_s *data,
6529 struct kevent_internal_s *kev)
6530 {
6531 #pragma unused(data)
6532
6533 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6534 int ret = 0;
6535
6536 socket_lock(so, 1);
6537
6538 ret = filt_sockev_common(kn, so, 0);
6539 if (ret) {
6540 *kev = kn->kn_kevent;
6541
6542 /*
6543 * Store the state of the events being delivered. This
6544 * state can be used to deliver level triggered events
6545 * at least once and still avoid waking up the application
6546 * multiple times as long as the event is active.
6547 */
6548 if (kn->kn_fflags != 0)
6549 kn->kn_hookid |= (kn->kn_fflags &
6550 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6551
6552 /*
6553 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6554 * only one of them, and remember which one was delivered
6555 * last.
6556 */
6557 if (kn->kn_fflags & NOTE_SUSPEND)
6558 kn->kn_hookid &= ~NOTE_RESUME;
6559 if (kn->kn_fflags & NOTE_RESUME)
6560 kn->kn_hookid &= ~NOTE_SUSPEND;
6561
6562 if (kn->kn_flags & EV_CLEAR) {
6563 kn->kn_data = 0;
6564 kn->kn_fflags = 0;
6565 }
6566 }
6567
6568 socket_unlock(so, 1);
6569
6570 return ret;
6571 }
6572
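/*
 * Encode the socket's connection state (SOCKEV_CONNECTED /
 * SOCKEV_DISCONNECTED) into the kevent data field, unless the field
 * already carries a value (e.g. a pending error).
 */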
6573 void
6574 get_sockev_state(struct socket *so, u_int32_t *statep)
6575 {
6576 u_int32_t state = *(statep);
6577
6578 /*
6579 * If the state variable has already been set by a previous
6580 * event, leave it as is.
6581 */
6582 if (state != 0)
6583 return;
6584
6585 if (so->so_state & SS_ISCONNECTED)
6586 state |= SOCKEV_CONNECTED;
6587 else
6588 state &= ~(SOCKEV_CONNECTED);
6589 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6590 *(statep) = state;
6591 }
6592
6593 #define SO_LOCK_HISTORY_STR_LEN \
6594 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6595
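/*
 * Format the recorded lock/unlock return addresses into a static
 * buffer, most recent first, for use in panic messages.  Not
 * reentrant, which is acceptable for its diagnostic purpose.
 */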
6596 __private_extern__ const char *
6597 solockhistory_nr(struct socket *so)
6598 {
6599 size_t n = 0;
6600 int i;
6601 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6602
6603 bzero(lock_history_str, sizeof (lock_history_str));
6604 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6605 n += snprintf(lock_history_str + n,
6606 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6607 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6608 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6609 }
6610 return (lock_history_str);
6611 }
6612
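/*
 * Lock a socket, taking a reference when refcount is non-zero.  If
 * the protocol provides its own pr_lock it is used; otherwise the
 * domain mutex is taken and so_usecount is bumped directly, with the
 * caller's return address recorded for lock debugging.
 *
 * A minimal usage sketch (not taken from a specific caller):
 *
 *	socket_lock(so, 1);	/* lock and hold a use count */
 *	/* ...operate on so... */
 *	socket_unlock(so, 1);	/* drop the use count; frees last ref */
 */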
6613 int
6614 socket_lock(struct socket *so, int refcount)
6615 {
6616 int error = 0;
6617 void *lr_saved;
6618
6619 lr_saved = __builtin_return_address(0);
6620
6621 if (so->so_proto->pr_lock) {
6622 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6623 } else {
6624 #ifdef MORE_LOCKING_DEBUG
6625 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6626 LCK_MTX_ASSERT_NOTOWNED);
6627 #endif
6628 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6629 if (refcount)
6630 so->so_usecount++;
6631 so->lock_lr[so->next_lock_lr] = lr_saved;
6632 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6633 }
6634
6635 return (error);
6636 }
6637
6638 int
6639 socket_unlock(struct socket *so, int refcount)
6640 {
6641 int error = 0;
6642 void *lr_saved;
6643 lck_mtx_t *mutex_held;
6644
6645 lr_saved = __builtin_return_address(0);
6646
6647 if (so->so_proto == NULL) {
6648 panic("%s: null so_proto so=%p\n", __func__, so);
6649 /* NOTREACHED */
6650 }
6651
6652 if (so && so->so_proto->pr_unlock) {
6653 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6654 } else {
6655 mutex_held = so->so_proto->pr_domain->dom_mtx;
6656 #ifdef MORE_LOCKING_DEBUG
6657 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6658 #endif
6659 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6660 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6661
6662 if (refcount) {
6663 if (so->so_usecount <= 0) {
6664 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6665 "lrh=%s", __func__, so->so_usecount, so,
6666 SOCK_DOM(so), so->so_type,
6667 SOCK_PROTO(so), solockhistory_nr(so));
6668 /* NOTREACHED */
6669 }
6670
6671 so->so_usecount--;
6672 if (so->so_usecount == 0)
6673 sofreelastref(so, 1);
6674 }
6675 lck_mtx_unlock(mutex_held);
6676 }
6677
6678 return (error);
6679 }
6680
6681 /* Called with socket locked, will unlock socket */
6682 void
6683 sofree(struct socket *so)
6684 {
6685 lck_mtx_t *mutex_held;
6686
6687 if (so->so_proto->pr_getlock != NULL)
6688 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6689 else
6690 mutex_held = so->so_proto->pr_domain->dom_mtx;
6691 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6692
6693 sofreelastref(so, 0);
6694 }
6695
6696 void
6697 soreference(struct socket *so)
6698 {
6699 socket_lock(so, 1); /* lock and take one reference on the socket */
6700 socket_unlock(so, 0); /* unlock only */
6701 }
6702
6703 void
6704 sodereference(struct socket *so)
6705 {
6706 socket_lock(so, 0);
6707 socket_unlock(so, 1);
6708 }
6709
6710 /*
6711 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6712 * possibility of using jumbo clusters. Caller must ensure to hold
6713 * the socket lock.
6714 */
6715 void
6716 somultipages(struct socket *so, boolean_t set)
6717 {
6718 if (set)
6719 so->so_flags |= SOF_MULTIPAGES;
6720 else
6721 so->so_flags &= ~SOF_MULTIPAGES;
6722 }
6723
6724 void
6725 soif2kcl(struct socket *so, boolean_t set)
6726 {
6727 if (set)
6728 so->so_flags1 |= SOF1_IF_2KCL;
6729 else
6730 so->so_flags1 &= ~SOF1_IF_2KCL;
6731 }
6732
6733 int
6734 so_isdstlocal(struct socket *so) {
6735
6736 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6737
6738 if (SOCK_DOM(so) == PF_INET)
6739 return (inaddr_local(inp->inp_faddr));
6740 else if (SOCK_DOM(so) == PF_INET6)
6741 return (in6addr_local(&inp->in6p_faddr));
6742
6743 return (0);
6744 }
6745
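/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT and set
 * SB_DROP on both socket buffers so no further data is queued, then
 * flush whatever is already buffered.  Sockets marked SOF_NODEFUNCT,
 * or eligible for extended background idle time, are skipped unless
 * the caller forces the issue.  sodefunct() completes the teardown.
 */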
6746 int
6747 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6748 {
6749 struct sockbuf *rcv, *snd;
6750 int err = 0, defunct;
6751
6752 rcv = &so->so_rcv;
6753 snd = &so->so_snd;
6754
6755 defunct = (so->so_flags & SOF_DEFUNCT);
6756 if (defunct) {
6757 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6758 panic("%s: SB_DROP not set", __func__);
6759 /* NOTREACHED */
6760 }
6761 goto done;
6762 }
6763
6764 if (so->so_flags & SOF_NODEFUNCT) {
6765 if (noforce) {
6766 err = EOPNOTSUPP;
6767 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6768 "name %s level %d) so 0x%llx [%d,%d] "
6769 "is not eligible for defunct "
6770 "(%d)\n", __func__, proc_selfpid(),
6771 proc_best_name(current_proc()), proc_pid(p),
6772 proc_best_name(p), level,
6773 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6774 SOCK_DOM(so), SOCK_TYPE(so), err);
6775 return (err);
6776 }
6777 so->so_flags &= ~SOF_NODEFUNCT;
6778 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6779 "so 0x%llx [%d,%d] defunct by force\n", __func__,
6780 proc_selfpid(), proc_best_name(current_proc()),
6781 proc_pid(p), proc_best_name(p), level,
6782 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6783 SOCK_DOM(so), SOCK_TYPE(so));
6784 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6785 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6786 struct ifnet *ifp = inp->inp_last_outifp;
6787
6788 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6789 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6790 } else if (so->so_flags & SOF_DELEGATED) {
6791 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6792 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6793 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6794 } else if (noforce) {
6795 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6796
6797 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6798 so->so_extended_bk_start = net_uptime();
6799 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6800
6801 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6802
6803 err = EOPNOTSUPP;
6804 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6805 "level %d) extend bk idle so 0x%llx rcv hw %d "
6806 "cc %d\n",
6807 __func__, proc_selfpid(),
6808 proc_best_name(current_proc()), proc_pid(p),
6809 proc_best_name(p), level,
6810 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6811 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
6812 return (err);
6813 } else {
6814 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6815 }
6816 }
6817
6818 so->so_flags |= SOF_DEFUNCT;
6819
6820 /* Prevent further data from being appended to the socket buffers */
6821 snd->sb_flags |= SB_DROP;
6822 rcv->sb_flags |= SB_DROP;
6823
6824 /* Flush any existing data in the socket buffers */
6825 if (rcv->sb_cc != 0) {
6826 rcv->sb_flags &= ~SB_SEL;
6827 selthreadclear(&rcv->sb_sel);
6828 sbrelease(rcv);
6829 }
6830 if (snd->sb_cc != 0) {
6831 snd->sb_flags &= ~SB_SEL;
6832 selthreadclear(&snd->sb_sel);
6833 sbrelease(snd);
6834 }
6835
6836 done:
6837 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6838 "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
6839 proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
6840 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6841 SOCK_TYPE(so), defunct ? "is already" : "marked as",
6842 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
6843
6844 return (err);
6845 }
6846
6847 int
6848 sodefunct(struct proc *p, struct socket *so, int level)
6849 {
6850 struct sockbuf *rcv, *snd;
6851
6852 if (!(so->so_flags & SOF_DEFUNCT)) {
6853 panic("%s improperly called", __func__);
6854 /* NOTREACHED */
6855 }
6856 if (so->so_state & SS_DEFUNCT)
6857 goto done;
6858
6859 rcv = &so->so_rcv;
6860 snd = &so->so_snd;
6861
6862 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6863 char s[MAX_IPv6_STR_LEN];
6864 char d[MAX_IPv6_STR_LEN];
6865 struct inpcb *inp = sotoinpcb(so);
6866
6867 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6868 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6869 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6870 __func__, proc_selfpid(), proc_best_name(current_proc()),
6871 proc_pid(p), proc_best_name(p), level,
6872 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6873 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6874 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6875 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6876 s, sizeof (s)), ntohs(inp->in6p_lport),
6877 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6878 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6879 d, sizeof (d)), ntohs(inp->in6p_fport),
6880 (uint32_t)rcv->sb_sel.si_flags,
6881 (uint32_t)snd->sb_sel.si_flags,
6882 rcv->sb_flags, snd->sb_flags);
6883 } else {
6884 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6885 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6886 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
6887 proc_selfpid(), proc_best_name(current_proc()),
6888 proc_pid(p), proc_best_name(p), level,
6889 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6890 SOCK_DOM(so), SOCK_TYPE(so),
6891 (uint32_t)rcv->sb_sel.si_flags,
6892 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6893 snd->sb_flags);
6894 }
6895
6896 /*
6897 * Unwedge threads blocked on sbwait() and sb_lock().
6898 */
6899 sbwakeup(rcv);
6900 sbwakeup(snd);
6901
6902 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6903 if (rcv->sb_flags & SB_LOCK)
6904 sbunlock(rcv, TRUE); /* keep socket locked */
6905 if (snd->sb_flags & SB_LOCK)
6906 sbunlock(snd, TRUE); /* keep socket locked */
6907
6908 /*
6909 * Flush the buffers and disconnect. We explicitly call shutdown
6910 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6911 * states are set for the socket. This would also flush out data
6912 * hanging off the receive list of this socket.
6913 */
6914 (void) soshutdownlock_final(so, SHUT_RD);
6915 (void) soshutdownlock_final(so, SHUT_WR);
6916 (void) sodisconnectlocked(so);
6917
6918 /*
6919 * Explicitly handle connectionless-protocol disconnection
6920 * and release any remaining data in the socket buffers.
6921 */
6922 if (!(so->so_state & SS_ISDISCONNECTED))
6923 (void) soisdisconnected(so);
6924
6925 if (so->so_error == 0)
6926 so->so_error = EBADF;
6927
6928 if (rcv->sb_cc != 0) {
6929 rcv->sb_flags &= ~SB_SEL;
6930 selthreadclear(&rcv->sb_sel);
6931 sbrelease(rcv);
6932 }
6933 if (snd->sb_cc != 0) {
6934 snd->sb_flags &= ~SB_SEL;
6935 selthreadclear(&snd->sb_sel);
6936 sbrelease(snd);
6937 }
6938 so->so_state |= SS_DEFUNCT;
6939 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6940
6941 done:
6942 return (0);
6943 }
6944
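/*
 * Clear the extended-background-idle-in-progress state when the
 * owning process becomes active again, and update the corresponding
 * statistics.  Safe to call whether or not the socket was idling.
 */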
6945 int
6946 soresume(struct proc *p, struct socket *so, int locked)
6947 {
6948 if (locked == 0)
6949 socket_lock(so, 1);
6950
6951 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6952 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6953 "[%d,%d] resumed from bk idle\n",
6954 __func__, proc_selfpid(), proc_best_name(current_proc()),
6955 proc_pid(p), proc_best_name(p),
6956 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6957 SOCK_DOM(so), SOCK_TYPE(so));
6958
6959 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6960 so->so_extended_bk_start = 0;
6961 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6962
6963 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6964 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6965 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6966 }
6967 if (locked == 0)
6968 socket_unlock(so, 1);
6969
6970 return (0);
6971 }
6972
6973 /*
6974 * Does not attempt to account for sockets that are delegated from
6975 * the current process
6976 */
6977 int
6978 so_set_extended_bk_idle(struct socket *so, int optval)
6979 {
6980 int error = 0;
6981
6982 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6983 SOCK_PROTO(so) != IPPROTO_TCP) {
6984 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6985 error = EOPNOTSUPP;
6986 } else if (optval == 0) {
6987 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6988
6989 soresume(current_proc(), so, 1);
6990 } else {
6991 struct proc *p = current_proc();
6992 int i;
6993 struct filedesc *fdp;
6994 int count = 0;
6995
6996 proc_fdlock(p);
6997
6998 fdp = p->p_fd;
6999 for (i = 0; i < fdp->fd_nfiles; i++) {
7000 struct fileproc *fp = fdp->fd_ofiles[i];
7001 struct socket *so2;
7002
7003 if (fp == NULL ||
7004 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7005 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7006 continue;
7007
7008 so2 = (struct socket *)fp->f_fglob->fg_data;
7009 if (so != so2 &&
7010 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7011 count++;
7012 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7013 break;
7014 }
7015 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7016 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7017 error = EBUSY;
7018 } else if (so->so_flags & SOF_DELEGATED) {
7019 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7020 error = EBUSY;
7021 } else {
7022 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7023 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7024 }
7025 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7026 "%s marked for extended bk idle\n",
7027 __func__, proc_selfpid(), proc_best_name(current_proc()),
7028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7029 SOCK_DOM(so), SOCK_TYPE(so),
7030 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7031 "is" : "not");
7032
7033 proc_fdunlock(p);
7034 }
7035
7036 return (error);
7037 }
7038
7039 static void
7040 so_stop_extended_bk_idle(struct socket *so)
7041 {
7042 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7043 so->so_extended_bk_start = 0;
7044
7045 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7046 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7047 /*
7048 * Force defunct
7049 */
7050 sosetdefunct(current_proc(), so,
7051 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7052 if (so->so_flags & SOF_DEFUNCT) {
7053 sodefunct(current_proc(), so,
7054 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7055 }
7056 }
7057
7058 void
7059 so_drain_extended_bk_idle(struct socket *so)
7060 {
7061 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7062 /*
7063 * Only penalize sockets that have outstanding data
7064 */
7065 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7066 so_stop_extended_bk_idle(so);
7067
7068 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7069 }
7070 }
7071 }
7072
7073 /*
7074 * Return value tells whether the socket is still in extended background idle
7075 */
7076 int
7077 so_check_extended_bk_idle_time(struct socket *so)
7078 {
7079 int ret = 1;
7080
7081 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7082 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7083 __func__, proc_selfpid(), proc_best_name(current_proc()),
7084 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7085 SOCK_DOM(so), SOCK_TYPE(so));
7086 if (net_uptime() - so->so_extended_bk_start >
7087 soextbkidlestat.so_xbkidle_time) {
7088 so_stop_extended_bk_idle(so);
7089
7090 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7091
7092 ret = 0;
7093 } else {
7094 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7095
7096 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7097 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7098 }
7099 }
7100
7101 return (ret);
7102 }
7103
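/*
 * Walk the process's open files and resume every socket that was
 * placed in extended background idle, then clear P_LXBKIDLEINPROG.
 */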
7104 void
7105 resume_proc_sockets(proc_t p)
7106 {
7107 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7108 struct filedesc *fdp;
7109 int i;
7110
7111 proc_fdlock(p);
7112 fdp = p->p_fd;
7113 for (i = 0; i < fdp->fd_nfiles; i++) {
7114 struct fileproc *fp;
7115 struct socket *so;
7116
7117 fp = fdp->fd_ofiles[i];
7118 if (fp == NULL ||
7119 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7120 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7121 continue;
7122
7123 so = (struct socket *)fp->f_fglob->fg_data;
7124 (void) soresume(p, so, 0);
7125 }
7126 proc_fdunlock(p);
7127
7128 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7129 }
7130 }
7131
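/*
 * Set or clear INP_RECV_ANYIF on the attached inpcb for IPv4/IPv6
 * sockets, controlling whether inbound packets are accepted
 * regardless of the interface they arrive on (SO_RECV_ANYIF).
 */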
7132 __private_extern__ int
7133 so_set_recv_anyif(struct socket *so, int optval)
7134 {
7135 int ret = 0;
7136
7137 #if INET6
7138 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7139 #else
7140 if (SOCK_DOM(so) == PF_INET) {
7141 #endif /* !INET6 */
7142 if (optval)
7143 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7144 else
7145 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7146 }
7147
7148 return (ret);
7149 }
7150
7151 __private_extern__ int
7152 so_get_recv_anyif(struct socket *so)
7153 {
7154 int ret = 0;
7155
7156 #if INET6
7157 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7158 #else
7159 if (SOCK_DOM(so) == PF_INET) {
7160 #endif /* !INET6 */
7161 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7162 }
7163
7164 return (ret);
7165 }
7166
7167 int
7168 so_set_restrictions(struct socket *so, uint32_t vals)
7169 {
7170 int nocell_old, nocell_new;
7171 int noexpensive_old, noexpensive_new;
7172
7173 /*
7174 * Deny-type restrictions are trapdoors; once set they cannot be
7175 * unset for the lifetime of the socket. This allows them to be
7176 * issued by a framework on behalf of the application without
7177 * having to worry that they can be undone.
7178 *
7179 * Note here that socket-level restrictions override any protocol-
7180 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7181 * restriction issued on the socket has a higher precedence
7182 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7183 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7184 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7185 */
7186 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7187 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7188 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7189 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7190 SO_RESTRICT_DENY_EXPENSIVE));
7191 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7192 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7193
7194 /* we can only set, not clear restrictions */
7195 if ((nocell_new - nocell_old) == 0 &&
7196 (noexpensive_new - noexpensive_old) == 0)
7197 return (0);
7198 #if INET6
7199 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7200 #else
7201 if (SOCK_DOM(so) == PF_INET) {
7202 #endif /* !INET6 */
7203 if (nocell_new - nocell_old != 0) {
7204 /*
7205 * if deny cellular is now set, do what's needed
7206 * for INPCB
7207 */
7208 inp_set_nocellular(sotoinpcb(so));
7209 }
7210 if (noexpensive_new - noexpensive_old != 0) {
7211 inp_set_noexpensive(sotoinpcb(so));
7212 }
7213 }
7214
7215 return (0);
7216 }
7217
7218 uint32_t
7219 so_get_restrictions(struct socket *so)
7220 {
7221 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7222 SO_RESTRICT_DENY_OUT |
7223 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7224 }
7225
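/*
 * Associate an effective (delegate) pid with the socket on behalf of
 * another process, subject to PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 * Delegating the socket to oneself clears the association.  The
 * socket's policy is re-evaluated on success.
 */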
7226 int
7227 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7228 {
7229 struct proc *ep = PROC_NULL;
7230 int error = 0;
7231
7232 /* pid 0 is reserved for kernel */
7233 if (epid == 0) {
7234 error = EINVAL;
7235 goto done;
7236 }
7237
7238 /*
7239 * If this is an in-kernel socket, prevent its delegate
7240 * association from changing unless the socket option is
7241 * coming from within the kernel itself.
7242 */
7243 if (so->last_pid == 0 && p != kernproc) {
7244 error = EACCES;
7245 goto done;
7246 }
7247
7248 /*
7249 * If this is issued by a process that's recorded as the
7250 * real owner of the socket, or if the pid is the same as
7251 * the process's own pid, then proceed. Otherwise ensure
7252 * that the issuing process has the necessary privileges.
7253 */
7254 if (epid != so->last_pid || epid != proc_pid(p)) {
7255 if ((error = priv_check_cred(kauth_cred_get(),
7256 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7257 error = EACCES;
7258 goto done;
7259 }
7260 }
7261
7262 /* Find the process that corresponds to the effective pid */
7263 if ((ep = proc_find(epid)) == PROC_NULL) {
7264 error = ESRCH;
7265 goto done;
7266 }
7267
7268 /*
7269 * If a process tries to delegate the socket to itself, then
7270 * there's really nothing to do; treat it as a way for the
7271 * delegate association to be cleared. Note that we check
7272 * the passed-in proc rather than calling proc_selfpid(),
7273 * as we need to check the process issuing the socket option
7274 * which could be kernproc. Given that we don't allow 0 for
7275 * effective pid, it means that a delegated in-kernel socket
7276 * stays delegated during its lifetime (which is probably OK.)
7277 */
7278 if (epid == proc_pid(p)) {
7279 so->so_flags &= ~SOF_DELEGATED;
7280 so->e_upid = 0;
7281 so->e_pid = 0;
7282 uuid_clear(so->e_uuid);
7283 } else {
7284 so->so_flags |= SOF_DELEGATED;
7285 so->e_upid = proc_uniqueid(ep);
7286 so->e_pid = proc_pid(ep);
7287 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7288 }
7289 done:
7290 if (error == 0 && net_io_policy_log) {
7291 uuid_string_t buf;
7292
7293 uuid_unparse(so->e_uuid, buf);
7294 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7295 "euuid %s%s\n", __func__, proc_name_address(p),
7296 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7297 SOCK_DOM(so), SOCK_TYPE(so),
7298 so->e_pid, proc_name_address(ep), buf,
7299 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7300 } else if (error != 0 && net_io_policy_log) {
7301 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7302 "ERROR (%d)\n", __func__, proc_name_address(p),
7303 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7304 SOCK_DOM(so), SOCK_TYPE(so),
7305 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7306 proc_name_address(ep), error);
7307 }
7308
7309 /* Update this socket's policy upon success */
7310 if (error == 0) {
7311 so->so_policy_gencnt *= -1;
7312 so_update_policy(so);
7313 #if NECP
7314 so_update_necp_policy(so, NULL, NULL);
7315 #endif /* NECP */
7316 }
7317
7318 if (ep != PROC_NULL)
7319 proc_rele(ep);
7320
7321 return (error);
7322 }
7323
7324 int
7325 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7326 {
7327 uuid_string_t buf;
7328 uuid_t uuid;
7329 int error = 0;
7330
7331 /* UUID must not be all-zeroes (reserved for kernel) */
7332 if (uuid_is_null(euuid)) {
7333 error = EINVAL;
7334 goto done;
7335 }
7336
7337 /*
7338 * If this is an in-kernel socket, prevent its delegate
7339 * association from changing unless the socket option is
7340 * coming from within the kernel itself.
7341 */
7342 if (so->last_pid == 0 && p != kernproc) {
7343 error = EACCES;
7344 goto done;
7345 }
7346
7347 /* Get the UUID of the issuing process */
7348 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7349
7350 /*
7351 * If this is issued by a process that's recorded as the
7352 * real owner of the socket, or if the uuid is the same as
7353 * the process's own uuid, then proceed. Otherwise ensure
7354 * that the issuing process has the necessary privileges.
7355 */
7356 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7357 uuid_compare(euuid, uuid) != 0) {
7358 if ((error = priv_check_cred(kauth_cred_get(),
7359 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7360 error = EACCES;
7361 goto done;
7362 }
7363 }
7364
7365 /*
7366 * If a process tries to delegate the socket to itself, then
7367 * there's really nothing to do; treat it as a way for the
7368 * delegate association to be cleared. Note that we check
7369 * the uuid of the passed-in proc rather than that of the
7370 * current process, as we need to check the process issuing
7371 * the socket option which could be kernproc itself. Given
7372 * that we don't allow 0 for effective uuid, it means that
7373 * a delegated in-kernel socket stays delegated during its
7374 * lifetime (which is okay.)
7375 */
7376 if (uuid_compare(euuid, uuid) == 0) {
7377 so->so_flags &= ~SOF_DELEGATED;
7378 so->e_upid = 0;
7379 so->e_pid = 0;
7380 uuid_clear(so->e_uuid);
7381 } else {
7382 so->so_flags |= SOF_DELEGATED;
7383 /*
7384 * Unlike so_set_effective_pid(), we only have the UUID
7385 * here and the process ID is not known. Inherit the
7386 * real {pid,upid} of the socket.
7387 */
7388 so->e_upid = so->last_upid;
7389 so->e_pid = so->last_pid;
7390 uuid_copy(so->e_uuid, euuid);
7391 }
7392
7393 done:
7394 if (error == 0 && net_io_policy_log) {
7395 uuid_unparse(so->e_uuid, buf);
7396 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7397 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7398 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7399 SOCK_TYPE(so), so->e_pid, buf,
7400 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7401 } else if (error != 0 && net_io_policy_log) {
7402 uuid_unparse(euuid, buf);
7403 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7404 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7405 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7406 SOCK_TYPE(so), buf, error);
7407 }
7408
7409 /* Update this socket's policy upon success */
7410 if (error == 0) {
7411 so->so_policy_gencnt *= -1;
7412 so_update_policy(so);
7413 #if NECP
7414 so_update_necp_policy(so, NULL, NULL);
7415 #endif /* NECP */
7416 }
7417
7418 return (error);
7419 }
7420
7421 void
7422 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7423 uint32_t ev_datalen)
7424 {
7425 struct kev_msg ev_msg;
7426
7427 /*
7428 * A netpolicy event always starts with a netpolicy_event_data
7429 * structure, but the caller can provide for a longer event
7430 * structure to post, depending on the event code.
7431 */
7432 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7433
7434 bzero(&ev_msg, sizeof (ev_msg));
7435 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7436 ev_msg.kev_class = KEV_NETWORK_CLASS;
7437 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7438 ev_msg.event_code = ev_code;
7439
7440 ev_msg.dv[0].data_ptr = ev_data;
7441 ev_msg.dv[0].data_length = ev_datalen;
7442
7443 kev_post_msg(&ev_msg);
7444 }
7445
7446 void
7447 socket_post_kev_msg(uint32_t ev_code,
7448 struct kev_socket_event_data *ev_data,
7449 uint32_t ev_datalen)
7450 {
7451 struct kev_msg ev_msg;
7452
7453 bzero(&ev_msg, sizeof(ev_msg));
7454 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7455 ev_msg.kev_class = KEV_NETWORK_CLASS;
7456 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7457 ev_msg.event_code = ev_code;
7458
7459 ev_msg.dv[0].data_ptr = ev_data;
7460 ev_msg.dv[0].data_length = ev_datalen;
7461
7462 kev_post_msg(&ev_msg);
7463 }
7464
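/*
 * Post a KEV_SOCKET_CLOSED kernel event carrying the local and peer
 * addresses of a closing socket; quietly skipped if either address
 * cannot be obtained.
 */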
7465 void
7466 socket_post_kev_msg_closed(struct socket *so)
7467 {
7468 struct kev_socket_closed ev;
7469 struct sockaddr *socksa = NULL, *peersa = NULL;
7470 int err;
7471 bzero(&ev, sizeof(ev));
7472 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7473 if (err == 0) {
7474 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7475 &peersa);
7476 if (err == 0) {
7477 memcpy(&ev.ev_data.kev_sockname, socksa,
7478 min(socksa->sa_len,
7479 sizeof (ev.ev_data.kev_sockname)));
7480 memcpy(&ev.ev_data.kev_peername, peersa,
7481 min(peersa->sa_len,
7482 sizeof (ev.ev_data.kev_peername)));
7483 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7484 &ev.ev_data, sizeof (ev));
7485 }
7486 }
7487 if (socksa != NULL)
7488 FREE(socksa, M_SONAME);
7489 if (peersa != NULL)
7490 FREE(peersa, M_SONAME);
7491 }