bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/filedesc.h>
  73 #include <sys/proc.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/kauth.h>
  76 #include <sys/file_internal.h>
  77 #include <sys/fcntl.h>
  78 #include <sys/malloc.h>
  79 #include <sys/mbuf.h>
  80 #include <sys/domain.h>
  81 #include <sys/kernel.h>
  82 #include <sys/event.h>
  83 #include <sys/poll.h>
  84 #include <sys/protosw.h>
  85 #include <sys/socket.h>
  86 #include <sys/socketvar.h>
  87 #include <sys/resourcevar.h>
  88 #include <sys/signalvar.h>
  89 #include <sys/sysctl.h>
  90 #include <sys/syslog.h>
  91 #include <sys/uio.h>
  92 #include <sys/uio_internal.h>
  93 #include <sys/ev.h>
  94 #include <sys/kdebug.h>
  95 #include <sys/un.h>
  96 #include <sys/user.h>
  97 #include <sys/priv.h>
  98 #include <sys/kern_event.h>
  99 #include <net/route.h>
 100 #include <net/init.h>
 101 #include <net/ntstat.h>
 102 #include <net/content_filter.h>
 103 #include <netinet/in.h>
 104 #include <netinet/in_pcb.h>
 105 #include <netinet/ip6.h>
 106 #include <netinet6/ip6_var.h>
 107 #include <netinet/flow_divert.h>
 108 #include <kern/zalloc.h>
 109 #include <kern/locks.h>
 110 #include <machine/limits.h>
 111 #include <libkern/OSAtomic.h>
 112 #include <pexpert/pexpert.h>
 113 #include <kern/assert.h>
 114 #include <kern/task.h>
 115 #include <sys/kpi_mbuf.h>
 116 #include <sys/mcache.h>
 117 #include <sys/unpcb.h>
 118
 119 #if CONFIG_MACF
 120 #include <security/mac.h>
 121 #include <security/mac_framework.h>
 122 #endif /* MAC */
 123
 124 #if MULTIPATH
 125 #include <netinet/mp_pcb.h>
 126 #include <netinet/mptcp_var.h>
 127 #endif /* MULTIPATH */
 128
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

/*
 * Socket cache state.  The list, counters and timestamp below are only
 * touched with so_cache_mtx held (see cached_sock_alloc(),
 * cached_sock_free() and so_cache_timer()).
 */
static u_int32_t        so_cache_hw;    /* highest cached_sock_count seen */
static u_int32_t        so_cache_timeouts;      /* so_cache_timer() runs */
/* number of so_cache_timer() runs that hit SO_CACHE_MAX_FREE_BATCH */
static u_int32_t        so_cache_max_freed;
/* number of sockets currently parked on so_cache_head */
static u_int32_t        cached_sock_count = 0;
/* idle cached sockets; appended at the tail, reused and reaped from the head */
STAILQ_HEAD(, socket)   so_cache_head;
/* cap on cached_sock_count; sockets freed beyond it go back to the zone */
int     max_cached_sock_count = MAX_CACHED_SOCKETS;
/* net_uptime() sampled by the last cached_sock_free()/so_cache_timer() */
static u_int32_t        so_cache_time;
/* non-zero once socketinit() has run */
static int              socketinit_done;
/* zone that cached socket elements are allocated from */
static struct zone      *so_cache_zone;

/* lock group, attributes and mutex for the socket cache; set up in socketinit() */
static lck_grp_t        *so_cache_mtx_grp;
static lck_attr_t       *so_cache_mtx_attr;
static lck_grp_attr_t   *so_cache_mtx_grp_attr;
static lck_mtx_t        *so_cache_mtx;
 146
 147 #include <machine/limits.h>
 148
/*
 * Forward declarations of the static knote handlers referenced by the
 * filterops tables below: one detach/event pair each for the read,
 * write and socket-event filters.
 */
static void     filt_sordetach(struct knote *kn);
static int      filt_soread(struct knote *kn, long hint);
static void     filt_sowdetach(struct knote *kn);
static int      filt_sowrite(struct knote *kn, long hint);
static void     filt_sockdetach(struct knote *kn);
static int      filt_sockev(struct knote *kn, long hint);

/* Helpers for moving a struct timeval in and out of a struct sockopt */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

/* Filter operations for the socket read filter */
static struct filterops soread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sordetach,
        .f_event = filt_soread,
};

/* Filter operations for the socket write filter */
static struct filterops sowrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sowdetach,
        .f_event = filt_sowrite,
};

/* Filter operations for the socket event filter */
static struct filterops sock_filtops = {
        .f_isfd = 1,
        .f_detach = filt_sockdetach,
        .f_event = filt_sockev,
};
 176
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

/*
 * Socket debug knob: exported read-write under the _kern_ipc sysctl node
 * and also settable through the "socket_debug" boot-arg parsed in
 * socketinit().
 */
int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
        CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

/* malloc type used by soalloc() for non-cached sockets; copied to so_zone */
static int socket_zone = M_SOCKET;
/* bumped atomically by soalloc() and sodealloc() */
so_gen_t        so_gencnt;      /* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* Trace codes in the DBG_NETSOCK class, built with NETDBG_CODE() */
#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 200
/* Size limit of 128 mbuf clusters (MCLBYTES each) */
#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

/*
 * Initialised to SOMAXCONN and exported read-write as KIPC_SOMAXCONN.
 * NOTE(review): presumably the cap applied to listen() backlogs --
 * confirm against solisten().
 */
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
        CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/*
 * Should we get a maximum also ???
 * (sosendmaxchain, unlike the two tunables that follow it, has no sysctl
 * and can therefore only be changed at build time.)
 */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy  = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
        CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
        CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
        CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
        CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * NOTE(review): by its name, the big-cluster counterpart of
 * sosendjcl_ignore_capab -- confirm where it is consumed.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
        CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

/* Logging switches, off by default, exported read-write under _kern_ipc */
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sothrottlelog, 0, "");

/* Interface restriction switches, on by default; see sysctl descriptions */
int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sorestrictsend, 0, "Enable outbound interface restrictions");
 258
extern struct inpcbinfo tcbinfo;

/*
 * TODO: these should be in header file
 * Their return values size the trailing areas of each cached socket
 * element (see socketinit() and cached_sock_alloc()).
 */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

/* Both zones below are created in socketinit() */
static unsigned int sl_zone_size;               /* size of sockaddr_list */
static struct zone *sl_zone;                    /* zone for sockaddr_list */

static unsigned int se_zone_size;               /* size of sockaddr_entry */
static struct zone *se_zone;                    /* zone for sockaddr_entry */

/* element size of so_cache_zone, computed in socketinit() */
vm_size_t       so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
        &sotcdb, 0, "");
 285
 286 void
 287 socketinit(void)
 288 {
 289         _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
 290         VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
 291
 292         if (socketinit_done) {
 293                 printf("socketinit: already called...\n");
 294                 return;
 295         }
 296         socketinit_done = 1;
 297
 298         PE_parse_boot_argn("socket_debug", &socket_debug,
 299             sizeof (socket_debug));
 300
 301         /*
 302          * allocate lock group attribute and group for socket cache mutex
 303          */
 304         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 305         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 306             so_cache_mtx_grp_attr);
 307
 308         /*
 309          * allocate the lock attribute for socket cache mutex
 310          */
 311         so_cache_mtx_attr = lck_attr_alloc_init();
 312
 313         /* cached sockets mutex */
 314         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 315         if (so_cache_mtx == NULL) {
 316                 panic("%s: unable to allocate so_cache_mtx\n", __func__);
 317                 /* NOTREACHED */
 318         }
 319         STAILQ_INIT(&so_cache_head);
 320
 321         so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
 322             + get_inpcb_str_size() + 4 + get_tcp_str_size());
 323
 324         so_cache_zone = zinit(so_cache_zone_element_size,
 325             (120000 * so_cache_zone_element_size), 8192, "socache zone");
 326         zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
 327         zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 328
 329         sl_zone_size = sizeof (struct sockaddr_list);
 330         if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
 331             "sockaddr_list")) == NULL) {
 332                 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
 333                 /* NOTREACHED */
 334         }
 335         zone_change(sl_zone, Z_CALLERACCT, FALSE);
 336         zone_change(sl_zone, Z_EXPAND, TRUE);
 337
 338         se_zone_size = sizeof (struct sockaddr_entry);
 339         if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
 340             "sockaddr_entry")) == NULL) {
 341                 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
 342                 /* NOTREACHED */
 343         }
 344         zone_change(se_zone, Z_CALLERACCT, FALSE);
 345         zone_change(se_zone, Z_EXPAND, TRUE);
 346
 347
 348         in_pcbinit();
 349         sflt_init();
 350         socket_tclass_init();
 351 #if MULTIPATH
 352         mp_pcbinit();
 353 #endif /* MULTIPATH */
 354 }
 355
 356 static void
 357 cached_sock_alloc(struct socket **so, int waitok)
 358 {
 359         caddr_t temp;
 360         uintptr_t offset;
 361
 362         lck_mtx_lock(so_cache_mtx);
 363
 364         if (!STAILQ_EMPTY(&so_cache_head)) {
 365                 VERIFY(cached_sock_count > 0);
 366
 367                 *so = STAILQ_FIRST(&so_cache_head);
 368                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 369                 STAILQ_NEXT((*so), so_cache_ent) = NULL;
 370
 371                 cached_sock_count--;
 372                 lck_mtx_unlock(so_cache_mtx);
 373
 374                 temp = (*so)->so_saved_pcb;
 375                 bzero((caddr_t)*so, sizeof (struct socket));
 376
 377                 (*so)->so_saved_pcb = temp;
 378         } else {
 379
 380                 lck_mtx_unlock(so_cache_mtx);
 381
 382                 if (waitok)
 383                         *so = (struct socket *)zalloc(so_cache_zone);
 384                 else
 385                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 386
 387                 if (*so == NULL)
 388                         return;
 389
 390                 bzero((caddr_t)*so, sizeof (struct socket));
 391
 392                 /*
 393                  * Define offsets for extra structures into our
 394                  * single block of memory. Align extra structures
 395                  * on longword boundaries.
 396                  */
 397
 398                 offset = (uintptr_t)*so;
 399                 offset += sizeof (struct socket);
 400
 401                 offset = ALIGN(offset);
 402
 403                 (*so)->so_saved_pcb = (caddr_t)offset;
 404                 offset += get_inpcb_str_size();
 405
 406                 offset = ALIGN(offset);
 407
 408                 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 409                     (caddr_t)offset;
 410         }
 411
 412         (*so)->cached_in_sock_layer = true;
 413 }
 414
 415 static void
 416 cached_sock_free(struct socket *so)
 417 {
 418
 419         lck_mtx_lock(so_cache_mtx);
 420
 421         so_cache_time = net_uptime();
 422         if (++cached_sock_count > max_cached_sock_count) {
 423                 --cached_sock_count;
 424                 lck_mtx_unlock(so_cache_mtx);
 425                 zfree(so_cache_zone, so);
 426         } else {
 427                 if (so_cache_hw < cached_sock_count)
 428                         so_cache_hw = cached_sock_count;
 429
 430                 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
 431
 432                 so->cache_timestamp = so_cache_time;
 433                 lck_mtx_unlock(so_cache_mtx);
 434         }
 435 }
 436
 437 void
 438 so_update_last_owner_locked(struct socket *so, proc_t self)
 439 {
 440         if (so->last_pid != 0) {
 441                 /*
 442                  * last_pid and last_upid should remain zero for sockets
 443                  * created using sock_socket. The check above achieves that
 444                  */
 445                 if (self == PROC_NULL)
 446                         self = current_proc();
 447
 448                 if (so->last_upid != proc_uniqueid(self) ||
 449                     so->last_pid != proc_pid(self)) {
 450                         so->last_upid = proc_uniqueid(self);
 451                         so->last_pid = proc_pid(self);
 452                         proc_getexecutableuuid(self, so->last_uuid,
 453                             sizeof (so->last_uuid));
 454                 }
 455                 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
 456         }
 457 }
 458
 459 void
 460 so_update_policy(struct socket *so)
 461 {
 462         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 463                 (void) inp_update_policy(sotoinpcb(so));
 464 }
 465
 466 #if NECP
 467 static void
 468 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr)
 469 {
 470         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
 471                 inp_update_necp_policy(sotoinpcb(so), override_local_addr, override_remote_addr, 0);
 472 }
 473 #endif /* NECP */
 474
 475 boolean_t
 476 so_cache_timer(void)
 477 {
 478         struct socket   *p;
 479         int             n_freed = 0;
 480         boolean_t rc = FALSE;
 481
 482         lck_mtx_lock(so_cache_mtx);
 483         so_cache_timeouts++;
 484         so_cache_time = net_uptime();
 485
 486         while (!STAILQ_EMPTY(&so_cache_head)) {
 487                 VERIFY(cached_sock_count > 0);
 488                 p = STAILQ_FIRST(&so_cache_head);
 489                 if ((so_cache_time - p->cache_timestamp) <
 490                         SO_CACHE_TIME_LIMIT)
 491                         break;
 492
 493                 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
 494                 --cached_sock_count;
 495
 496                 zfree(so_cache_zone, p);
 497
 498                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 499                         so_cache_max_freed++;
 500                         break;
 501                 }
 502         }
 503
 504         /* Schedule again if there is more to cleanup */
 505         if (!STAILQ_EMPTY(&so_cache_head))
 506                 rc = TRUE;
 507
 508         lck_mtx_unlock(so_cache_mtx);
 509         return (rc);
 510 }
 511
 512 /*
 513  * Get a socket structure from our zone, and initialize it.
 514  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 515  * Note that it would probably be better to allocate socket
 516  * and PCB at the same time, but I'm not convinced that all
 517  * the protocols can be easily modified to do this.
 518  */
 519 struct socket *
 520 soalloc(int waitok, int dom, int type)
 521 {
 522         struct socket *so;
 523
 524         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 525                 cached_sock_alloc(&so, waitok);
 526         } else {
 527                 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
 528                     M_WAITOK);
 529                 if (so != NULL)
 530                         bzero(so, sizeof (*so));
 531         }
 532         if (so != NULL) {
 533                 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 534                 so->so_zone = socket_zone;
 535 #if CONFIG_MACF_SOCKET
 536                 /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 537                 if (mac_socket_label_init(so, !waitok) != 0) {
 538                         sodealloc(so);
 539                         return (NULL);
 540                 }
 541 #endif /* MAC_SOCKET */
 542         }
 543
 544         return (so);
 545 }
 546
/*
 * Create a socket of the given type/protocol in domain `dom' on behalf
 * of process `p' and hand it back through *aso.
 *
 * Parameters:  dom     protocol family in which the protocol is looked up
 *              aso     out: receives the new socket; set to NULL on entry
 *                      and left NULL on every failure path
 *              type    socket type; must equal the protocol's pr_type
 *              proto   protocol number, or 0 to select by type alone
 *              p       process recorded as last owner of the socket and
 *                      passed to kauth_cred_proc_ref() for so_cred
 *              flags   SOCF_ASYNC sets SS_NBIO; under MULTIPATH,
 *                      SOCF_MP_SUBFLOW marks the socket as a subflow
 *              ep      optional delegate process; when it is neither
 *                      PROC_NULL nor p, its identity is stored in the
 *                      e_* fields and SOF_DELEGATED is set
 *
 * Returns:     0                       Success
 *              EAFNOSUPPORT            Domain not found
 *              EPROTOTYPE              Protocol found, but for another type
 *              EPROTONOSUPPORT         No usable protocol found
 *              ENOBUFS                 soalloc() failed
 *      <pru_attach>:???                Error from the protocol's attach
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
        struct protosw *prp;
        struct socket *so;
        int error = 0;

#if TCPDEBUG
        extern int tcpconsdebug;
#endif

        VERIFY(aso != NULL);
        *aso = NULL;

        /* Look the protocol up by number when one is given, else by type */
        if (proto != 0)
                prp = pffindproto(dom, proto, type);
        else
                prp = pffindtype(dom, type);

        /* No protocol, or one that cannot attach: pick the error to report */
        if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
                if (pffinddomain(dom) == NULL)
                        return (EAFNOSUPPORT);
                if (proto != 0) {
                        if (pffindprotonotype(dom, proto) != NULL)
                                return (EPROTOTYPE);
                }
                return (EPROTONOSUPPORT);
        }
        if (prp->pr_type != type)
                return (EPROTOTYPE);
        so = soalloc(1, dom, type);
        if (so == NULL)
                return (ENOBUFS);

        if (flags & SOCF_ASYNC)
                so->so_state |= SS_NBIO;
#if MULTIPATH
        if (flags & SOCF_MP_SUBFLOW) {
                /*
                 * A multipath subflow socket is used internally in the kernel,
                 * therefore it does not have a file descriptor associated by
                 * default.
                 */
                so->so_state |= SS_NOFDREF;
                so->so_flags |= SOF_MP_SUBFLOW;
        }
#endif /* MULTIPATH */

        TAILQ_INIT(&so->so_incomp);
        TAILQ_INIT(&so->so_comp);
        so->so_type = type;
        /* Record p as the socket's last owner */
        so->last_upid = proc_uniqueid(p);
        so->last_pid = proc_pid(p);
        proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
        proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

        /* Record the delegate's identity only if it differs from p */
        if (ep != PROC_NULL && ep != p) {
                so->e_upid = proc_uniqueid(ep);
                so->e_pid = proc_pid(ep);
                proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
                so->so_flags |= SOF_DELEGATED;
        }

        so->so_cred = kauth_cred_proc_ref(p);
        /*
         * NOTE(review): SS_PRIV is decided from kauth_cred_get(), not from
         * the credential of p taken just above -- confirm this is intended.
         */
        if (!suser(kauth_cred_get(), NULL))
                so->so_state |= SS_PRIV;

        so->so_proto = prp;
        so->so_rcv.sb_flags |= SB_RECV;
        so->so_rcv.sb_so = so->so_snd.sb_so = so;
        so->next_lock_lr = 0;
        so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
        mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

        /*
         * Attachment will create the per pcb lock if necessary and
         * increase refcount for creation, make sure it's done before
         * socket is inserted in lists.
         */
        so->so_usecount++;

        error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
        if (error != 0) {
                /*
                 * Warning:
                 * If so_pcb is not zero, the socket will be leaked,
                 * so protocol attachment handler must be coded carefully
                 */
                so->so_state |= SS_NOFDREF;
                so->so_usecount--;
                sofreelastref(so, 1);   /* will deallocate the socket */
                return (error);
        }

        /* Attach succeeded: account for the socket in its domain */
        atomic_add_32(&prp->pr_domain->dom_refs, 1);
        TAILQ_INIT(&so->so_evlist);

        /* Attach socket filters for this protocol */
        sflt_initsock(so);
#if TCPDEBUG
        if (tcpconsdebug == 2)
                so->so_options |= SO_DEBUG;
#endif
        so_set_default_traffic_class(so);

        /*
         * If this thread or task is marked to create backgrounded sockets,
         * mark the socket as background.
         */
        if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
                socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
                so->so_background_thread = current_thread();
        }

        switch (dom) {
        /*
         * Don't mark Unix domain, system or multipath sockets as
         * eligible for defunct by default.
         */
        case PF_LOCAL:
        case PF_SYSTEM:
        case PF_MULTIPATH:
                so->so_flags |= SOF_NODEFUNCT;
                break;
        default:
                break;
        }

        /*
         * Entitlements can't be checked at socket creation time except if the
         * application requested a feature guarded by a privilege (c.f., socket
         * delegation).
         * The priv(9) and the Sandboxing APIs are designed with the idea that
         * a privilege check should only be triggered by a userland request.
         * A privilege check at socket creation time is time consuming and
         * could trigger many authorisation error messages from the security
         * APIs.
         */

        /* Publish the socket to the caller only once it is fully set up */
        *aso = so;

        return (0);
}
 694
 695 /*
 696  * Returns:     0                       Success
 697  *              EAFNOSUPPORT
 698  *              EPROTOTYPE
 699  *              EPROTONOSUPPORT
 700  *              ENOBUFS
 701  *      <pru_attach>:ENOBUFS[AF_UNIX]
 702  *      <pru_attach>:ENOBUFS[TCP]
 703  *      <pru_attach>:ENOMEM[TCP]
 704  *      <pru_attach>:???                [other protocol families, IPSEC]
 705  */
 706 int
 707 socreate(int dom, struct socket **aso, int type, int proto)
 708 {
 709         return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
 710             PROC_NULL));
 711 }
 712
 713 int
 714 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
 715 {
 716         int error = 0;
 717         struct proc *ep = PROC_NULL;
 718
 719         if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
 720                 error = ESRCH;
 721                 goto done;
 722         }
 723
 724         error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
 725
 726         /*
 727          * It might not be wise to hold the proc reference when calling
 728          * socreate_internal since it calls soalloc with M_WAITOK
 729          */
 730 done:
 731         if (ep != PROC_NULL)
 732                 proc_rele(ep);
 733
 734         return (error);
 735 }
 736
 737 /*
 738  * Returns:     0                       Success
 739  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 740  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 741  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 742  *      <pru_bind>:EINVAL               Invalid argument
 743  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 744  *      <pru_bind>:EACCES               Permission denied
 745  *      <pru_bind>:EADDRINUSE           Address in use
 746  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 747  *      <pru_bind>:EPERM                Operation not permitted
 748  *      <pru_bind>:???
 749  *      <sf_bind>:???
 750  *
 751  * Notes:       It's not possible to fully enumerate the return codes above,
 752  *              since socket filter authors and protocol family authors may
 753  *              not choose to limit their error returns to those listed, even
 754  *              though this may result in some software operating incorrectly.
 755  *
 756  *              The error codes which are enumerated above are those known to
 757  *              be returned by the tcp_usr_bind function supplied.
 758  */
 759 int
 760 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
 761 {
 762         struct proc *p = current_proc();
 763         int error = 0;
 764
 765         if (dolock)
 766                 socket_lock(so, 1);
 767         VERIFY(so->so_usecount > 1);
 768
 769         so_update_last_owner_locked(so, p);
 770         so_update_policy(so);
 771
 772 #if NECP
 773         so_update_necp_policy(so, nam, NULL);
 774 #endif /* NECP */
 775
 776         /*
 777          * If this is a bind request on a socket that has been marked
 778          * as inactive, reject it now before we go any further.
 779          */
 780         if (so->so_flags & SOF_DEFUNCT) {
 781                 error = EINVAL;
 782                 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
 783                     __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
 784                     SOCK_DOM(so), SOCK_TYPE(so), error));
 785                 goto out;
 786         }
 787
 788         /* Socket filter */
 789         error = sflt_bind(so, nam);
 790
 791         if (error == 0)
 792                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 793 out:
 794         if (dolock)
 795                 socket_unlock(so, 1);
 796
 797         if (error == EJUSTRETURN)
 798                 error = 0;
 799
 800         return (error);
 801 }
 802
 803 void
 804 sodealloc(struct socket *so)
 805 {
 806         kauth_cred_unref(&so->so_cred);
 807
 808         /* Remove any filters */
 809         sflt_termsock(so);
 810
 811 #if CONTENT_FILTER
 812         cfil_sock_detach(so);
 813 #endif /* CONTENT_FILTER */
 814
 815         /* Delete the state allocated for msg queues on a socket */
 816         if (so->so_flags & SOF_ENABLE_MSGS) {
 817                 FREE(so->so_msg_state, M_TEMP);
 818                 so->so_msg_state = NULL;
 819         }
 820         VERIFY(so->so_msg_state == NULL);
 821
 822         so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
 823
 824 #if CONFIG_MACF_SOCKET
 825         mac_socket_label_destroy(so);
 826 #endif /* MAC_SOCKET */
 827
 828         if (so->cached_in_sock_layer) {
 829                 cached_sock_free(so);
 830         } else {
 831                 FREE_ZONE(so, sizeof (*so), so->so_zone);
 832         }
 833 }
 834
 835 /*
 836  * Returns:     0                       Success
 837  *              EINVAL
 838  *              EOPNOTSUPP
 839  *      <pru_listen>:EINVAL[AF_UNIX]
 840  *      <pru_listen>:EINVAL[TCP]
 841  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 842  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 843  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 844  *      <pru_listen>:EACCES[TCP]        Permission denied
 845  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 846  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 847  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 848  *      <sf_listen>:???
 849  *
 850  * Notes:       Other <pru_listen> returns depend on the protocol family; all
 851  *              <sf_listen> returns depend on what the filter author causes
 852  *              their filter to return.
 853  */
 854 int
 855 solisten(struct socket *so, int backlog)
 856 {
 857         struct proc *p = current_proc();
 858         int error = 0;
 859
 860         socket_lock(so, 1);
 861
 862         so_update_last_owner_locked(so, p);
 863         so_update_policy(so);
 864
 865 #if NECP
 866         so_update_necp_policy(so, NULL, NULL);
 867 #endif /* NECP */
 868
 869         if (so->so_proto == NULL) {
 870                 error = EINVAL;
 871                 goto out;
 872         }
 873         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 874                 error = EOPNOTSUPP;
 875                 goto out;
 876         }
 877
 878         /*
 879          * If the listen request is made on a socket that is not fully
 880          * disconnected, or on a socket that has been marked as inactive,
 881          * reject the request now.
 882          */
 883         if ((so->so_state &
 884             (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
 885             (so->so_flags & SOF_DEFUNCT)) {
 886                 error = EINVAL;
 887                 if (so->so_flags & SOF_DEFUNCT) {
 888                         SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
 889                             "(%d)\n", __func__, proc_pid(p),
 890                             (uint64_t)VM_KERNEL_ADDRPERM(so),
 891                             SOCK_DOM(so), SOCK_TYPE(so), error));
 892                 }
 893                 goto out;
 894         }
 895
 896         if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
 897                 error = EPERM;
 898                 goto out;
 899         }
 900
 901         error = sflt_listen(so);
 902         if (error == 0)
 903                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
 904
 905         if (error) {
 906                 if (error == EJUSTRETURN)
 907                         error = 0;
 908                 goto out;
 909         }
 910
 911         if (TAILQ_EMPTY(&so->so_comp))
 912                 so->so_options |= SO_ACCEPTCONN;
 913         /*
 914          * POSIX: The implementation may have an upper limit on the length of
 915          * the listen queue-either global or per accepting socket. If backlog
 916          * exceeds this limit, the length of the listen queue is set to the
 917          * limit.
 918          *
 919          * If listen() is called with a backlog argument value that is less
 920          * than 0, the function behaves as if it had been called with a backlog
 921          * argument value of 0.
 922          *
 923          * A backlog argument of 0 may allow the socket to accept connections,
 924          * in which case the length of the listen queue may be set to an
 925          * implementation-defined minimum value.
 926          */
 927         if (backlog <= 0 || backlog > somaxconn)
 928                 backlog = somaxconn;
 929
 930         so->so_qlimit = backlog;
 931 out:
 932         socket_unlock(so, 1);
 933         return (error);
 934 }
 935
/*
 * Final-reference teardown of a socket.
 *
 * so:          the socket being released; the caller is assumed to hold
 *              its lock (see the note below).
 * dealloc:     when non-zero, sodealloc() is called on the socket at the
 *              end of a full teardown.
 *
 * Nothing is torn down unless both SOF_PCBCLEARING and SS_NOFDREF are
 * set; in that case, and when the socket still sits on its head's
 * completed queue, only the select and upcall state is silenced and the
 * function returns without flushing or freeing anything.
 */
void
sofreelastref(struct socket *so, int dealloc)
{
        struct socket *head = so->so_head;

        /* Assume socket is locked */

        /*
         * Not ready to be freed yet: quiesce select/upcall state and
         * replace the event callback with sonullevent, then bail out.
         */
        if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
                selthreadclear(&so->so_snd.sb_sel);
                selthreadclear(&so->so_rcv.sb_sel);
                so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
                so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
                so->so_event = sonullevent;
                return;
        }
        if (head != NULL) {
                /* The head's queues are only touched with the head locked */
                socket_lock(head, 1);
                if (so->so_state & SS_INCOMP) {
                        TAILQ_REMOVE(&head->so_incomp, so, so_list);
                        head->so_incqlen--;
                } else if (so->so_state & SS_COMP) {
                        /*
                         * We must not decommission a socket that's
                         * on the accept(2) queue.  If we do, then
                         * accept(2) may hang after select(2) indicated
                         * that the listening socket was ready.
                         */
                        selthreadclear(&so->so_snd.sb_sel);
                        selthreadclear(&so->so_rcv.sb_sel);
                        so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
                        so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
                        so->so_event = sonullevent;
                        socket_unlock(head, 1);
                        return;
                } else {
                        /* A socket with a head must be on one of its queues */
                        panic("sofree: not queued");
                }
                /* Only reached for a socket taken off the incomplete queue */
                head->so_qlen--;
                so->so_state &= ~SS_INCOMP;
                so->so_head = NULL;
                socket_unlock(head, 1);
        }
        sowflush(so);
        sorflush(so);

#if FLOW_DIVERT
        if (so->so_flags & SOF_FLOW_DIVERT) {
                flow_divert_detach(so);
        }
#endif  /* FLOW_DIVERT */

        /* 3932268: disable upcall */
        so->so_rcv.sb_flags &= ~SB_UPCALL;
        so->so_snd.sb_flags &= ~SB_UPCALL;
        so->so_event = sonullevent;

        if (dealloc)
                sodealloc(so);
}
 995
 996 void
 997 soclose_wait_locked(struct socket *so)
 998 {
 999         lck_mtx_t *mutex_held;
1000
1001         if (so->so_proto->pr_getlock != NULL)
1002                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1003         else
1004                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1005         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1006
1007         /*
1008          * Double check here and return if there's no outstanding upcall;
1009          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1010          */
1011         if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1012                 return;
1013         so->so_rcv.sb_flags &= ~SB_UPCALL;
1014         so->so_snd.sb_flags &= ~SB_UPCALL;
1015         so->so_flags |= SOF_CLOSEWAIT;
1016         (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1017             "soclose_wait_locked", NULL);
1018         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1019         so->so_flags &= ~SOF_CLOSEWAIT;
1020 }
1021
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * The steps, in order: notify socket filters of the close, wait for
 * outstanding upcalls and content filters, abort every connection still
 * queued on a listening socket, disconnect (honoring SO_LINGER), detach
 * the protocol, and finally drop one use count and call sofree().
 *
 * Returns the first error seen from sodisconnectlocked(), the linger
 * sleep (a timeout is not treated as an error) or pru_detach; 0 otherwise.
 * Panics if the use count is already zero.
 */
int
soclose_locked(struct socket *so)
{
        int error = 0;
        lck_mtx_t *mutex_held;
        struct timespec ts;

        if (so->so_usecount == 0) {
                panic("soclose: so=%p refcount=0\n", so);
                /* NOTREACHED */
        }

        sflt_notify(so, sock_evt_closing, NULL);

        /* soclose_wait_locked() re-checks this condition itself */
        if (so->so_upcallusecount)
                soclose_wait_locked(so);

#if CONTENT_FILTER
        /*
         * We have to wait until the content filters are done
         */
        if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
                cfil_sock_close_wait(so);
                cfil_sock_is_closed(so);
                cfil_sock_detach(so);
        }
#endif /* CONTENT_FILTER */

        if ((so->so_options & SO_ACCEPTCONN)) {
                struct socket *sp, *sonext;
                int socklock = 0;

                /*
                 * We do not want new connections to be added
                 * to the connection queues
                 */
                so->so_options &= ~SO_ACCEPTCONN;

                /*
                 * First pass: the incomplete queue.
                 *
                 * NOTE(review): sonext is sampled before the head lock is
                 * dropped further down; if the queue can change while the
                 * head is unlocked, sonext may be stale - confirm.
                 */
                for (sp = TAILQ_FIRST(&so->so_incomp);
                    sp != NULL; sp = sonext) {
                        sonext = TAILQ_NEXT(sp, so_list);

                        /*
                         * Radar 5350314
                         * skip sockets thrown away by tcpdropdropblreq;
                         * they will get cleaned up by the garbage
                         * collection.  Otherwise, remove the incomp socket
                         * from the queue and let soabort trigger the
                         * appropriate cleanup.
                         */
                        if (sp->so_flags & SOF_OVERFLOW)
                                continue;

                        if (so->so_proto->pr_getlock != NULL) {
                                /*
                                 * Lock ordering for consistency with the
                                 * rest of the stack, we lock the socket
                                 * first and then grab the head.
                                 */
                                socket_unlock(so, 0);
                                socket_lock(sp, 1);
                                socket_lock(so, 0);
                                socklock = 1;
                        }

                        /* Dequeued regardless of the SS_INCOMP test below */
                        TAILQ_REMOVE(&so->so_incomp, sp, so_list);
                        so->so_incqlen--;

                        if (sp->so_state & SS_INCOMP) {
                                sp->so_state &= ~SS_INCOMP;
                                sp->so_head = NULL;

                                (void) soabort(sp);
                        }

                        if (socklock)
                                socket_unlock(sp, 1);
                }

                /* Second pass: drain the completed queue from its front */
                while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
                        /* Dequeue from so_comp since sofree() won't do it */
                        TAILQ_REMOVE(&so->so_comp, sp, so_list);
                        so->so_qlen--;

                        /*
                         * Unlike the first pass, the head lock is released
                         * while sp is locked and only retaken afterwards.
                         */
                        if (so->so_proto->pr_getlock != NULL) {
                                socket_unlock(so, 0);
                                socket_lock(sp, 1);
                        }

                        if (sp->so_state & SS_COMP) {
                                sp->so_state &= ~SS_COMP;
                                sp->so_head = NULL;

                                (void) soabort(sp);
                        }

                        if (so->so_proto->pr_getlock != NULL) {
                                socket_unlock(sp, 1);
                                socket_lock(so, 0);
                        }
                }
        }
        if (so->so_pcb == NULL) {
                /* 3915887: mark the socket as ready for dealloc */
                so->so_flags |= SOF_PCBCLEARING;
                goto discard;
        }
        if (so->so_state & SS_ISCONNECTED) {
                if ((so->so_state & SS_ISDISCONNECTING) == 0) {
                        error = sodisconnectlocked(so);
                        if (error)
                                goto drop;
                }
                if (so->so_options & SO_LINGER) {
                        /* A non-blocking socket does not wait out the linger */
                        if ((so->so_state & SS_ISDISCONNECTING) &&
                            (so->so_state & SS_NBIO))
                                goto drop;
                        if (so->so_proto->pr_getlock != NULL)
                                mutex_held = (*so->so_proto->pr_getlock)(so, 0);
                        else
                                mutex_held = so->so_proto->pr_domain->dom_mtx;
                        while (so->so_state & SS_ISCONNECTED) {
                                /*
                                 * so_linger is split into whole seconds
                                 * (/ 100) and a remainder scaled by
                                 * NSEC_PER_USEC * 1000 * 10 nanoseconds,
                                 * i.e. it is handled as a count of
                                 * 1/100 second units.
                                 */
                                ts.tv_sec = (so->so_linger/100);
                                ts.tv_nsec = (so->so_linger % 100) *
                                    NSEC_PER_USEC * 1000 * 10;
                                error = msleep((caddr_t)&so->so_timeo,
                                    mutex_held, PSOCK | PCATCH, "soclose", &ts);
                                if (error) {
                                        /*
                                         * It's OK when the timer fires,
                                         * don't report an error
                                         */
                                        if (error == EWOULDBLOCK)
                                                error = 0;
                                        break;
                                }
                        }
                }
        }
drop:
        if (so->so_usecount == 0) {
                panic("soclose: usecount is zero so=%p\n", so);
                /* NOTREACHED */
        }
        if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
                /* An earlier error takes precedence over the detach result */
                int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
                if (error == 0)
                        error = error2;
        }
        /* Checked again after the detach call above */
        if (so->so_usecount <= 0) {
                panic("soclose: usecount is zero so=%p\n", so);
                /* NOTREACHED */
        }
discard:
        if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
            (so->so_state & SS_NOFDREF)) {
                panic("soclose: NOFDREF");
                /* NOTREACHED */
        }
        so->so_state |= SS_NOFDREF;

        if (so->so_flags & SOF_MP_SUBFLOW)
                so->so_flags &= ~SOF_MP_SUBFLOW;

        if ((so->so_flags & SOF_KNOTE) != 0)
                KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

        atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
        evsofree(so);

        /* Drop one use count, then let sofree() decide what happens next */
        so->so_usecount--;
        sofree(so);
        return (error);
}
1200
1201 int
1202 soclose(struct socket *so)
1203 {
1204         int error = 0;
1205         socket_lock(so, 1);
1206
1207         if (so->so_retaincnt == 0) {
1208                 error = soclose_locked(so);
1209         } else {
1210                 /*
1211                  * if the FD is going away, but socket is
1212                  * retained in kernel remove its reference
1213                  */
1214                 so->so_usecount--;
1215                 if (so->so_usecount < 2)
1216                         panic("soclose: retaincnt non null and so=%p "
1217                             "usecount=%d\n", so, so->so_usecount);
1218         }
1219         socket_unlock(so, 1);
1220         return (error);
1221 }
1222
1223 /*
1224  * Must be called at splnet...
1225  */
1226 /* Should already be locked */
1227 int
1228 soabort(struct socket *so)
1229 {
1230         int error;
1231
1232 #ifdef MORE_LOCKING_DEBUG
1233         lck_mtx_t *mutex_held;
1234
1235         if (so->so_proto->pr_getlock != NULL)
1236                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1237         else
1238                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1239         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1240 #endif
1241
1242         if ((so->so_flags & SOF_ABORTED) == 0) {
1243                 so->so_flags |= SOF_ABORTED;
1244                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1245                 if (error) {
1246                         sofree(so);
1247                         return (error);
1248                 }
1249         }
1250         return (0);
1251 }
1252
1253 int
1254 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1255 {
1256         int error;
1257
1258         if (dolock)
1259                 socket_lock(so, 1);
1260
1261         so_update_last_owner_locked(so, PROC_NULL);
1262         so_update_policy(so);
1263 #if NECP
1264         so_update_necp_policy(so, NULL, NULL);
1265 #endif /* NECP */
1266
1267         if ((so->so_state & SS_NOFDREF) == 0)
1268                 panic("soaccept: !NOFDREF");
1269         so->so_state &= ~SS_NOFDREF;
1270         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1271
1272         if (dolock)
1273                 socket_unlock(so, 1);
1274         return (error);
1275 }
1276
/*
 * Same as soacceptlock() with dolock set, i.e. the socket lock is taken
 * and released around the accept.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
        int error;

        error = soacceptlock(so, nam, 1);
        return (error);
}
1282
1283 int
1284 soacceptfilter(struct socket *so)
1285 {
1286         struct sockaddr *local = NULL, *remote = NULL;
1287         int error = 0;
1288         struct socket *head = so->so_head;
1289
1290         /*
1291          * Hold the lock even if this socket has not been made visible
1292          * to the filter(s).  For sockets with global locks, this protects
1293          * against the head or peer going away
1294          */
1295         socket_lock(so, 1);
1296         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1297             sogetaddr_locked(so, &local, 0) != 0) {
1298                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1299                 so->so_head = NULL;
1300                 socket_unlock(so, 1);
1301                 soclose(so);
1302                 /* Out of resources; try it again next time */
1303                 error = ECONNABORTED;
1304                 goto done;
1305         }
1306
1307         error = sflt_accept(head, so, local, remote);
1308
1309         /*
1310          * If we get EJUSTRETURN from one of the filters, mark this socket
1311          * as inactive and return it anyway.  This newly accepted socket
1312          * will be disconnected later before we hand it off to the caller.
1313          */
1314         if (error == EJUSTRETURN) {
1315                 error = 0;
1316                 (void) sosetdefunct(current_proc(), so,
1317                     SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1318         }
1319
1320         if (error != 0) {
1321                 /*
1322                  * This may seem like a duplication to the above error
1323                  * handling part when we return ECONNABORTED, except
1324                  * the following is done while holding the lock since
1325                  * the socket has been exposed to the filter(s) earlier.
1326                  */
1327                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1328                 so->so_head = NULL;
1329                 socket_unlock(so, 1);
1330                 soclose(so);
1331                 /* Propagate socket filter's error code to the caller */
1332         } else {
1333                 socket_unlock(so, 1);
1334         }
1335 done:
1336         /* Callee checks for NULL pointer */
1337         sock_freeaddr(remote);
1338         sock_freeaddr(local);
1339         return (error);
1340 }
1341
1342 /*
1343  * Returns:     0                       Success
1344  *              EOPNOTSUPP              Operation not supported on socket
1345  *              EISCONN                 Socket is connected
1346  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1347  *      <pru_connect>:EINVAL            Invalid argument
1348  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1349  *      <pru_connect>:EACCES            Permission denied
1350  *      <pru_connect>:EADDRINUSE        Address in use
1351  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1352  *      <pru_connect>:EPERM             Operation not permitted
1353  *      <sf_connect_out>:???            [anything a filter writer might set]
1354  */
1355 int
1356 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1357 {
1358         int error;
1359         struct proc *p = current_proc();
1360
1361         if (dolock)
1362                 socket_lock(so, 1);
1363
1364         so_update_last_owner_locked(so, p);
1365         so_update_policy(so);
1366
1367 #if NECP
1368         so_update_necp_policy(so, NULL, nam);
1369 #endif /* NECP */
1370
1371         /*
1372          * If this is a listening socket or if this is a previously-accepted
1373          * socket that has been marked as inactive, reject the connect request.
1374          */
1375         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1376                 error = EOPNOTSUPP;
1377                 if (so->so_flags & SOF_DEFUNCT) {
1378                         SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1379                             "(%d)\n", __func__, proc_pid(p),
1380                             (uint64_t)VM_KERNEL_ADDRPERM(so),
1381                             SOCK_DOM(so), SOCK_TYPE(so), error));
1382                 }
1383                 if (dolock)
1384                         socket_unlock(so, 1);
1385                 return (error);
1386         }
1387
1388         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1389                 if (dolock)
1390                         socket_unlock(so, 1);
1391                 return (EPERM);
1392         }
1393
1394         /*
1395          * If protocol is connection-based, can only connect once.
1396          * Otherwise, if connected, try to disconnect first.
1397          * This allows user to disconnect by connecting to, e.g.,
1398          * a null address.
1399          */
1400         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1401             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1402             (error = sodisconnectlocked(so)))) {
1403                 error = EISCONN;
1404         } else {
1405                 /*
1406                  * Run connect filter before calling protocol:
1407                  *  - non-blocking connect returns before completion;
1408                  */
1409                 error = sflt_connectout(so, nam);
1410                 if (error != 0) {
1411                         if (error == EJUSTRETURN)
1412                                 error = 0;
1413                 } else {
1414                         error = (*so->so_proto->pr_usrreqs->pru_connect)
1415                             (so, nam, p);
1416                 }
1417         }
1418         if (dolock)
1419                 socket_unlock(so, 1);
1420         return (error);
1421 }
1422
/*
 * Same as soconnectlock() with dolock set, i.e. the socket lock is taken
 * and released around the connect.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
        int error;

        error = soconnectlock(so, nam, 1);
        return (error);
}
1428
1429 /*
1430  * Returns:     0                       Success
1431  *      <pru_connect2>:EINVAL[AF_UNIX]
1432  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1433  *      <pru_connect2>:???              [other protocol families]
1434  *
1435  * Notes:       <pru_connect2> is not supported by [TCP].
1436  */
1437 int
1438 soconnect2(struct socket *so1, struct socket *so2)
1439 {
1440         int error;
1441
1442         socket_lock(so1, 1);
1443         if (so2->so_proto->pr_lock)
1444                 socket_lock(so2, 1);
1445
1446         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1447
1448         socket_unlock(so1, 1);
1449         if (so2->so_proto->pr_lock)
1450                 socket_unlock(so2, 1);
1451         return (error);
1452 }
1453
1454 int
1455 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1456     struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1457     associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1458     uint32_t arglen)
1459 {
1460         int error;
1461
1462         so_update_last_owner_locked(so, p);
1463         so_update_policy(so);
1464
1465         /*
1466          * If this is a listening socket or if this is a previously-accepted
1467          * socket that has been marked as inactive, reject the connect request.
1468          */
1469         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1470                 error = EOPNOTSUPP;
1471                 if (so->so_flags & SOF_DEFUNCT) {
1472                         SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1473                             "(%d)\n", __func__, proc_pid(p),
1474                             (uint64_t)VM_KERNEL_ADDRPERM(so),
1475                             SOCK_DOM(so), SOCK_TYPE(so), error));
1476                 }
1477                 return (error);
1478         }
1479
1480         if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1481                 return (EPERM);
1482
1483         /*
1484          * If protocol is connection-based, can only connect once
1485          * unless PR_MULTICONN is set.  Otherwise, if connected,
1486          * try to disconnect first.  This allows user to disconnect
1487          * by connecting to, e.g., a null address.
1488          */
1489         if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1490             !(so->so_proto->pr_flags & PR_MULTICONN) &&
1491             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1492             (error = sodisconnectlocked(so)) != 0)) {
1493                 error = EISCONN;
1494         } else {
1495                 /*
1496                  * Run connect filter before calling protocol:
1497                  *  - non-blocking connect returns before completion;
1498                  */
1499                 error = sflt_connectxout(so, dst_sl);
1500                 if (error != 0) {
1501                         if (error == EJUSTRETURN)
1502                                 error = 0;
1503                 } else {
1504                         error = (*so->so_proto->pr_usrreqs->pru_connectx)
1505                             (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1506                             flags, arg, arglen);
1507                 }
1508         }
1509
1510         return (error);
1511 }
1512
1513 int
1514 sodisconnectlocked(struct socket *so)
1515 {
1516         int error;
1517
1518         if ((so->so_state & SS_ISCONNECTED) == 0) {
1519                 error = ENOTCONN;
1520                 goto bad;
1521         }
1522         if (so->so_state & SS_ISDISCONNECTING) {
1523                 error = EALREADY;
1524                 goto bad;
1525         }
1526
1527         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1528         if (error == 0)
1529                 sflt_notify(so, sock_evt_disconnected, NULL);
1530
1531 bad:
1532         return (error);
1533 }
1534
/*
 * Locking version: sodisconnectlocked() bracketed by the socket lock.
 */
int
sodisconnect(struct socket *so)
{
        int result;

        socket_lock(so, 1);
        result = sodisconnectlocked(so);
        socket_unlock(so, 1);

        return (result);
}
1546
1547 int
1548 sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1549 {
1550         int error;
1551
1552         /*
1553          * Call the protocol disconnectx handler; let it handle all
1554          * matters related to the connection state of this session.
1555          */
1556         error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1557         if (error == 0) {
1558                 /*
1559                  * The event applies only for the session, not for
1560                  * the disconnection of individual subflows.
1561                  */
1562                 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1563                         sflt_notify(so, sock_evt_disconnected, NULL);
1564         }
1565         return (error);
1566 }
1567
1568 int
1569 sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1570 {
1571         int error;
1572
1573         socket_lock(so, 1);
1574         error = sodisconnectxlocked(so, aid, cid);
1575         socket_unlock(so, 1);
1576         return (error);
1577 }
1578
1579 int
1580 sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1581 {
1582         return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1583 }
1584
/*
 * Map message flags to the wait argument handed to sblock(): 0 when
 * MSG_DONTWAIT is set in f, SBL_WAIT otherwise.
 */
#define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1586
1587 /*
1588  * sosendcheck will lock the socket buffer if it isn't locked and
1589  * verify that there is space for the data being inserted.
1590  *
1591  * Returns:     0                       Success
1592  *              EPIPE
1593  *      sblock:EWOULDBLOCK
1594  *      sblock:EINTR
1595  *      sbwait:EBADF
1596  *      sbwait:EINTR
1597  *      [so_error]:???
1598  */
1599 int
1600 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1601     int32_t clen, int32_t atomic, int flags, int *sblocked,
1602     struct mbuf *control)
1603 {
1604         int     error = 0;
1605         int32_t space;
1606         int     assumelock = 0;
1607
1608 restart:
1609         if (*sblocked == 0) {
1610                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1611                     so->so_send_filt_thread != 0 &&
1612                     so->so_send_filt_thread == current_thread()) {
1613                         /*
1614                          * We're being called recursively from a filter,
1615                          * allow this to continue. Radar 4150520.
1616                          * Don't set sblocked because we don't want
1617                          * to perform an unlock later.
1618                          */
1619                         assumelock = 1;
1620                 } else {
1621                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1622                         if (error) {
1623                                 if (so->so_flags & SOF_DEFUNCT)
1624                                         goto defunct;
1625                                 return (error);
1626                         }
1627                         *sblocked = 1;
1628                 }
1629         }
1630
1631         /*
1632          * If a send attempt is made on a socket that has been marked
1633          * as inactive (disconnected), reject the request.
1634          */
1635         if (so->so_flags & SOF_DEFUNCT) {
1636 defunct:
1637                 error = EPIPE;
1638                 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1639                     __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1640                     SOCK_DOM(so), SOCK_TYPE(so), error));
1641                 return (error);
1642         }
1643
1644         if (so->so_state & SS_CANTSENDMORE) {
1645 #if CONTENT_FILTER
1646                 /*
1647                  * Can re-inject data of half closed connections
1648                  */
1649                 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1650                         so->so_snd.sb_cfil_thread == current_thread() &&
1651                         cfil_sock_data_pending(&so->so_snd) != 0)
1652                         CFIL_LOG(LOG_INFO,
1653                                 "so %llx ignore SS_CANTSENDMORE",
1654                                 (uint64_t)VM_KERNEL_ADDRPERM(so));
1655                 else
1656 #endif /* CONTENT_FILTER */
1657                         return (EPIPE);
1658         }
1659         if (so->so_error) {
1660                 error = so->so_error;
1661                 so->so_error = 0;
1662                 return (error);
1663         }
1664
1665         if ((so->so_state & SS_ISCONNECTED) == 0) {
1666                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1667                         if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1668                             (resid != 0 || clen == 0)) {
1669 #if MPTCP
1670                                 /*
1671                                  * MPTCP Fast Join sends data before the
1672                                  * socket is truly connected.
1673                                  */
1674                                 if ((so->so_flags & (SOF_MP_SUBFLOW |
1675                                         SOF_MPTCP_FASTJOIN)) !=
1676                                     (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1677 #endif /* MPTCP */
1678                                 return (ENOTCONN);
1679                         }
1680                 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1681                         return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1682                             ENOTCONN : EDESTADDRREQ);
1683                 }
1684         }
1685         if (so->so_flags & SOF_ENABLE_MSGS)
1686                 space = msgq_sbspace(so, control);
1687         else
1688                 space = sbspace(&so->so_snd);
1689
1690         if (flags & MSG_OOB)
1691                 space += 1024;
1692         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1693             clen > so->so_snd.sb_hiwat)
1694                 return (EMSGSIZE);
1695
1696         if ((space < resid + clen &&
1697             (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1698             (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1699                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1700                     assumelock) {
1701                         return (EWOULDBLOCK);
1702                 }
1703                 sbunlock(&so->so_snd, TRUE);    /* keep socket locked */
1704                 *sblocked = 0;
1705                 error = sbwait(&so->so_snd);
1706                 if (error) {
1707                         if (so->so_flags & SOF_DEFUNCT)
1708                                 goto defunct;
1709                         return (error);
1710                 }
1711                 goto restart;
1712         }
1713         return (0);
1714 }
1715
1716 /*
1717  * Send on a socket.
1718  * If send must go all at once and message is larger than
1719  * send buffering, then hard error.
1720  * Lock against other senders.
1721  * If must go all at once and not enough room now, then
1722  * inform user that this would block and do nothing.
1723  * Otherwise, if nonblocking, send as much as possible.
1724  * The data to be sent is described by "uio" if nonzero,
1725  * otherwise by the mbuf chain "top" (which must be null
1726  * if uio is not).  Data provided in mbuf chain must be small
1727  * enough to send all at once.
1728  *
1729  * Returns nonzero on error, timeout or signal; callers
1730  * must check for short counts if EINTR/ERESTART are returned.
1731  * Data and control buffers are freed on return.
1732  * Experiment:
1733  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1734  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1735  *  point at the mbuf chain being constructed and go from there.
1736  *
1737  * Returns:     0                       Success
1738  *              EOPNOTSUPP
1739  *              EINVAL
1740  *              ENOBUFS
1741  *      uiomove:EFAULT
1742  *      sosendcheck:EPIPE
1743  *      sosendcheck:EWOULDBLOCK
1744  *      sosendcheck:EINTR
1745  *      sosendcheck:EBADF
1746  *      sosendcheck:EINTR
1747  *      sosendcheck:???                 [value from so_error]
1748  *      <pru_send>:ECONNRESET[TCP]
1749  *      <pru_send>:EINVAL[TCP]
1750  *      <pru_send>:ENOBUFS[TCP]
1751  *      <pru_send>:EADDRINUSE[TCP]
1752  *      <pru_send>:EADDRNOTAVAIL[TCP]
1753  *      <pru_send>:EAFNOSUPPORT[TCP]
1754  *      <pru_send>:EACCES[TCP]
1755  *      <pru_send>:EAGAIN[TCP]
1756  *      <pru_send>:EPERM[TCP]
1757  *      <pru_send>:EMSGSIZE[TCP]
1758  *      <pru_send>:EHOSTUNREACH[TCP]
1759  *      <pru_send>:ENETUNREACH[TCP]
1760  *      <pru_send>:ENETDOWN[TCP]
1761  *      <pru_send>:ENOMEM[TCP]
1762  *      <pru_send>:ENOBUFS[TCP]
1763  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
1764  *      <pru_send>:EINVAL[AF_UNIX]
1765  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
1766  *      <pru_send>:EPIPE[AF_UNIX]
1767  *      <pru_send>:ENOTCONN[AF_UNIX]
1768  *      <pru_send>:EISCONN[AF_UNIX]
1769  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
1770  *      <sf_data_out>:???               [whatever a filter author chooses]
1771  *
1772  * Notes:       Other <pru_send> returns depend on the protocol family; all
1773  *              <sf_data_out> returns depend on what the filter author causes
1774  *              their filter to return.
1775  */
1776 int
1777 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1778     struct mbuf *top, struct mbuf *control, int flags)
1779 {
1780         struct mbuf **mp;
1781         struct mbuf *m, *freelist = NULL;
1782         user_ssize_t space, len, resid;
1783         int clen = 0, error, dontroute, mlen, sendflags;
1784         int atomic = sosendallatonce(so) || top;
1785         int sblocked = 0;
1786         struct proc *p = current_proc();
1787         struct mbuf *control_copy = NULL;
1788
1789         if (uio != NULL)
1790                 resid = uio_resid(uio);
1791         else
1792                 resid = top->m_pkthdr.len;
1793
1794         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1795             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1796
1797         socket_lock(so, 1);
1798
1799         /*
1800          * Re-injection should not affect process accounting
1801          */
1802         if ((flags & MSG_SKIPCFIL) == 0) {
1803         so_update_last_owner_locked(so, p);
1804         so_update_policy(so);
1805
1806 #if NECP
1807         so_update_necp_policy(so, NULL, addr);
1808 #endif /* NECP */
1809         }
1810
1811         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1812                 error = EOPNOTSUPP;
1813                 socket_unlock(so, 1);
1814                 goto out;
1815         }
1816
1817         /*
1818          * In theory resid should be unsigned.
1819          * However, space must be signed, as it might be less than 0
1820          * if we over-committed, and we must use a signed comparison
1821          * of space and resid.  On the other hand, a negative resid
1822          * causes us to loop sending 0-length segments to the protocol.
1823          *
1824          * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1825          * But it will be used by sockets doing message delivery.
1826          *
1827          * Note: We limit resid to be a positive int value as we use
1828          * imin() to set bytes_to_copy -- radr://14558484
1829          */
1830         if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1831             !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1832                 error = EINVAL;
1833                 socket_unlock(so, 1);
1834                 goto out;
1835         }
1836
1837         dontroute = (flags & MSG_DONTROUTE) &&
1838             (so->so_options & SO_DONTROUTE) == 0 &&
1839             (so->so_proto->pr_flags & PR_ATOMIC);
1840         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1841
1842         if (control != NULL)
1843                 clen = control->m_len;
1844
1845         do {
1846                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1847                     &sblocked, control);
1848                 if (error)
1849                         goto release;
1850
1851                 mp = &top;
1852                 if (so->so_flags & SOF_ENABLE_MSGS)
1853                         space = msgq_sbspace(so, control);
1854                 else
1855                         space = sbspace(&so->so_snd) - clen;
1856                 space += ((flags & MSG_OOB) ? 1024 : 0);
1857
1858                 do {
1859                         if (uio == NULL) {
1860                                 /*
1861                                  * Data is prepackaged in "top".
1862                                  */
1863                                 resid = 0;
1864                                 if (flags & MSG_EOR)
1865                                         top->m_flags |= M_EOR;
1866                         } else {
1867                                 int chainlength;
1868                                 int bytes_to_copy;
1869                                 boolean_t jumbocl;
1870                                 boolean_t bigcl;
1871
1872                                 bytes_to_copy = imin(resid, space);
1873
1874                                 if (sosendminchain > 0)
1875                                         chainlength = 0;
1876                                 else
1877                                         chainlength = sosendmaxchain;
1878
1879                                 /*
1880                                  * Use big 4 KB cluster only when outgoing
1881                                  * interface does not want 2 LB clusters
1882                                  */
1883                                 bigcl =
1884                                     !(so->so_flags1 & SOF1_IF_2KCL) ||
1885                                     sosendbigcl_ignore_capab;
1886
1887                                 /*
1888                                  * Attempt to use larger than system page-size
1889                                  * clusters for large writes only if there is
1890                                  * a jumbo cluster pool and if the socket is
1891                                  * marked accordingly.
1892                                  */
1893                                 jumbocl = sosendjcl && njcl > 0 &&
1894                                     ((so->so_flags & SOF_MULTIPAGES) ||
1895                                     sosendjcl_ignore_capab) &&
1896                                     bigcl;
1897
1898                                 socket_unlock(so, 0);
1899
1900                                 do {
1901                                         int num_needed;
1902                                         int hdrs_needed = (top == NULL) ? 1 : 0;
1903
1904                                         /*
1905                                          * try to maintain a local cache of mbuf
1906                                          * clusters needed to complete this
1907                                          * write the list is further limited to
1908                                          * the number that are currently needed
1909                                          * to fill the socket this mechanism
1910                                          * allows a large number of mbufs/
1911                                          * clusters to be grabbed under a single
1912                                          * mbuf lock... if we can't get any
1913                                          * clusters, than fall back to trying
1914                                          * for mbufs if we fail early (or
1915                                          * miscalcluate the number needed) make
1916                                          * sure to release any clusters we
1917                                          * haven't yet consumed.
1918                                          */
1919                                         if (freelist == NULL &&
1920                                             bytes_to_copy > MBIGCLBYTES &&
1921                                             jumbocl) {
1922                                                 num_needed =
1923                                                     bytes_to_copy / M16KCLBYTES;
1924
1925                                                 if ((bytes_to_copy -
1926                                                     (num_needed * M16KCLBYTES))
1927                                                     >= MINCLSIZE)
1928                                                         num_needed++;
1929
1930                                                 freelist =
1931                                                     m_getpackets_internal(
1932                                                     (unsigned int *)&num_needed,
1933                                                     hdrs_needed, M_WAIT, 0,
1934                                                     M16KCLBYTES);
1935                                                 /*
1936                                                  * Fall back to 4K cluster size
1937                                                  * if allocation failed
1938                                                  */
1939                                         }
1940
1941                                         if (freelist == NULL &&
1942                                             bytes_to_copy > MCLBYTES &&
1943                                             bigcl) {
1944                                                 num_needed =
1945                                                     bytes_to_copy / MBIGCLBYTES;
1946
1947                                                 if ((bytes_to_copy -
1948                                                     (num_needed * MBIGCLBYTES)) >=
1949                                                     MINCLSIZE)
1950                                                         num_needed++;
1951
1952                                                 freelist =
1953                                                     m_getpackets_internal(
1954                                                     (unsigned int *)&num_needed,
1955                                                     hdrs_needed, M_WAIT, 0,
1956                                                     MBIGCLBYTES);
1957                                                 /*
1958                                                  * Fall back to cluster size
1959                                                  * if allocation failed
1960                                                  */
1961                                         }
1962
1963                                         if (freelist == NULL &&
1964                                             bytes_to_copy > MINCLSIZE) {
1965                                                 num_needed =
1966                                                     bytes_to_copy / MCLBYTES;
1967
1968                                                 if ((bytes_to_copy -
1969                                                     (num_needed * MCLBYTES)) >=
1970                                                     MINCLSIZE)
1971                                                         num_needed++;
1972
1973                                                 freelist =
1974                                                     m_getpackets_internal(
1975                                                     (unsigned int *)&num_needed,
1976                                                     hdrs_needed, M_WAIT, 0,
1977                                                     MCLBYTES);
1978                                                 /*
1979                                                  * Fall back to a single mbuf
1980                                                  * if allocation failed
1981                                                  */
1982                                         }
1983
1984                                         if (freelist == NULL) {
1985                                                 if (top == NULL)
1986                                                         MGETHDR(freelist,
1987                                                             M_WAIT, MT_DATA);
1988                                                 else
1989                                                         MGET(freelist,
1990                                                             M_WAIT, MT_DATA);
1991
1992                                                 if (freelist == NULL) {
1993                                                         error = ENOBUFS;
1994                                                         socket_lock(so, 0);
1995                                                         goto release;
1996                                                 }
1997                                                 /*
1998                                                  * For datagram protocols,
1999                                                  * leave room for protocol
2000                                                  * headers in first mbuf.
2001                                                  */
2002                                                 if (atomic && top == NULL &&
2003                                                     bytes_to_copy < MHLEN) {
2004                                                         MH_ALIGN(freelist,
2005                                                             bytes_to_copy);
2006                                                 }
2007                                         }
2008                                         m = freelist;
2009                                         freelist = m->m_next;
2010                                         m->m_next = NULL;
2011
2012                                         if ((m->m_flags & M_EXT))
2013                                                 mlen = m->m_ext.ext_size;
2014                                         else if ((m->m_flags & M_PKTHDR))
2015                                                 mlen =
2016                                                     MHLEN - m_leadingspace(m);
2017                                         else
2018                                                 mlen = MLEN;
2019                                         len = imin(mlen, bytes_to_copy);
2020
2021                                         chainlength += len;
2022
2023                                         space -= len;
2024
2025                                         error = uiomove(mtod(m, caddr_t),
2026                                             len, uio);
2027
2028                                         resid = uio_resid(uio);
2029
2030                                         m->m_len = len;
2031                                         *mp = m;
2032                                         top->m_pkthdr.len += len;
2033                                         if (error)
2034                                                 break;
2035                                         mp = &m->m_next;
2036                                         if (resid <= 0) {
2037                                                 if (flags & MSG_EOR)
2038                                                         top->m_flags |= M_EOR;
2039                                                 break;
2040                                         }
2041                                         bytes_to_copy = min(resid, space);
2042
2043                                 } while (space > 0 &&
2044                                     (chainlength < sosendmaxchain || atomic ||
2045                                     resid < MINCLSIZE));
2046
2047                                 socket_lock(so, 0);
2048
2049                                 if (error)
2050                                         goto release;
2051                         }
2052
2053                         if (flags & (MSG_HOLD|MSG_SEND)) {
2054                                 /* Enqueue for later, go away if HOLD */
2055                                 struct mbuf *mb1;
2056                                 if (so->so_temp && (flags & MSG_FLUSH)) {
2057                                         m_freem(so->so_temp);
2058                                         so->so_temp = NULL;
2059                                 }
2060                                 if (so->so_temp)
2061                                         so->so_tail->m_next = top;
2062                                 else
2063                                         so->so_temp = top;
2064                                 mb1 = top;
2065                                 while (mb1->m_next)
2066                                         mb1 = mb1->m_next;
2067                                 so->so_tail = mb1;
2068                                 if (flags & MSG_HOLD) {
2069                                         top = NULL;
2070                                         goto release;
2071                                 }
2072                                 top = so->so_temp;
2073                         }
2074                         if (dontroute)
2075                                 so->so_options |= SO_DONTROUTE;
2076
2077                         /* Compute flags here, for pru_send and NKEs */
2078                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2079                             /*
2080                              * If the user set MSG_EOF, the protocol
2081                              * understands this flag and nothing left to
2082                              * send then use PRU_SEND_EOF instead of PRU_SEND.
2083                              */
2084                             ((flags & MSG_EOF) &&
2085                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2086                              (resid <= 0)) ? PRUS_EOF :
2087                              /* If there is more to send set PRUS_MORETOCOME */
2088                              (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2089
2090                         if ((flags & MSG_SKIPCFIL) == 0) {
2091                                 /*
2092                                  * Socket filter processing
2093                                  */
2094                                 error = sflt_data_out(so, addr, &top,
2095                                     &control, (sendflags & MSG_OOB) ?
2096                                     sock_data_filt_flag_oob : 0);
2097                                 if (error) {
2098                                         if (error == EJUSTRETURN) {
2099                                                 error = 0;
2100                                                 clen = 0;
2101                                                 control = NULL;
2102                                                 top = NULL;
2103                                         }
2104                                         goto release;
2105                                 }
2106 #if CONTENT_FILTER
2107                                 /*
2108                                  * Content filter processing
2109                                  */
2110                                 error = cfil_sock_data_out(so, addr, top,
2111                                    control, (sendflags & MSG_OOB) ?
2112                                     sock_data_filt_flag_oob : 0);
2113                                 if (error) {
2114                                         if (error == EJUSTRETURN) {
2115                                                 error = 0;
2116                                                 clen = 0;
2117                                                 control = NULL;
2118                                                 top = NULL;
2119                                                 }
2120                                         goto release;
2121                                 }
2122 #endif /* CONTENT_FILTER */
2123                         }
2124                         if (so->so_flags & SOF_ENABLE_MSGS) {
2125                                 /*
2126                                  * Make a copy of control mbuf,
2127                                  * so that msg priority can be
2128                                  * passed to subsequent mbufs.
2129                                  */
2130                                 control_copy = m_dup(control, M_NOWAIT);
2131                         }
2132                         error = (*so->so_proto->pr_usrreqs->pru_send)
2133                             (so, sendflags, top, addr, control, p);
2134
2135                         if (flags & MSG_SEND)
2136                                 so->so_temp = NULL;
2137
2138                         if (dontroute)
2139                                 so->so_options &= ~SO_DONTROUTE;
2140
2141                         clen = 0;
2142                         control = control_copy;
2143                         control_copy = NULL;
2144                         top = NULL;
2145                         mp = &top;
2146                         if (error)
2147                                 goto release;
2148                 } while (resid && space > 0);
2149         } while (resid);
2150
2151 release:
2152         if (sblocked)
2153                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2154         else
2155                 socket_unlock(so, 1);
2156 out:
2157         if (top != NULL)
2158                 m_freem(top);
2159         if (control != NULL)
2160                 m_freem(control);
2161         if (freelist != NULL)
2162                 m_freem_list(freelist);
2163         if (control_copy != NULL)
2164                 m_freem(control_copy);
2165
2166         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2167             space, error);
2168
2169         return (error);
2170 }
2171
2172 int
2173 sosend_list(struct socket *so, struct sockaddr *addr, struct uio **uioarray,
2174      u_int uiocnt, struct mbuf *top, struct mbuf *control, int flags)
2175 {
2176         struct mbuf *m, *freelist = NULL;
2177         user_ssize_t len, resid;
2178         int clen = 0, error, dontroute, mlen;
2179         int atomic = sosendallatonce(so) || top;
2180         int sblocked = 0;
2181         struct proc *p = current_proc();
2182         u_int uiofirst = 0;
2183         u_int uiolast = 0;
2184
2185         KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2186             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2187
2188         if (so->so_type != SOCK_DGRAM) {
2189                 error = EINVAL;
2190                 goto out;
2191         }
2192         if (atomic == 0) {
2193                 error = EINVAL;
2194                 goto out;
2195         }
2196         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2197                 error = EPROTONOSUPPORT;
2198                 goto out;
2199         }
2200         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2201                 error = EINVAL;
2202                 goto out;
2203         }
2204         if (uioarray != NULL)
2205                 resid = uio_array_resid(uioarray, uiocnt);
2206         else
2207                 resid = mbuf_pkt_list_len(top);
2208
2209         /*
2210          * In theory resid should be unsigned.
2211          * However, space must be signed, as it might be less than 0
2212          * if we over-committed, and we must use a signed comparison
2213          * of space and resid.  On the other hand, a negative resid
2214          * causes us to loop sending 0-length segments to the protocol.
2215          *
2216          * Note: We limit resid to be a positive int value as we use
2217          * imin() to set bytes_to_copy -- radr://14558484
2218          */
2219         if (resid < 0 || resid > INT_MAX) {
2220                 error = EINVAL;
2221                 goto out;
2222         }
2223         /*
2224          * Disallow functionality not currently supported
2225          * Note: Will need to treat arrays of addresses and controls
2226          */
2227         if (addr != NULL) {
2228                 printf("%s addr not supported\n", __func__);
2229                 error = EOPNOTSUPP;
2230                 goto out;
2231         }
2232         if (control != NULL) {
2233                 printf("%s control not supported\n", __func__);
2234                 error = EOPNOTSUPP;
2235                 goto out;
2236         }
2237
2238         socket_lock(so, 1);
2239         so_update_last_owner_locked(so, p);
2240         so_update_policy(so);
2241
2242 #if NECP
2243         so_update_necp_policy(so, NULL, addr);
2244 #endif /* NECP */
2245
2246         dontroute = (flags & MSG_DONTROUTE) &&
2247             (so->so_options & SO_DONTROUTE) == 0 &&
2248             (so->so_proto->pr_flags & PR_ATOMIC);
2249         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2250
2251         if (control != NULL)
2252                 clen = control->m_len;
2253
2254         error = sosendcheck(so, addr, resid, clen, atomic, flags,
2255             &sblocked, control);
2256         if (error)
2257                 goto release;
2258
2259         do {
2260                 int i;
2261
2262                 if (uioarray == NULL) {
2263                         /*
2264                          * Data is prepackaged in "top".
2265                          */
2266                         resid = 0;
2267                 } else {
2268                         int num_needed = 0;
2269                         int chainlength;
2270                         size_t maxpktlen = 0;
2271
2272                         if (sosendminchain > 0)
2273                                 chainlength = 0;
2274                         else
2275                                 chainlength = sosendmaxchain;
2276
2277                         socket_unlock(so, 0);
2278
2279                         /*
2280                          * Find a set of uio that fit in a reasonable number
2281                          * of mbuf packets
2282                          */
2283                         for (i = uiofirst; i < uiocnt; i++) {
2284                                 struct uio *auio = uioarray[i];
2285
2286                                 len = uio_resid(auio);
2287
2288                                 /* Do nothing for empty messages */
2289                                 if (len == 0)
2290                                         continue;
2291
2292                                 num_needed += 1;
2293                                 uiolast += 1;
2294
2295                                 if (len > maxpktlen)
2296                                         maxpktlen = len;
2297
2298                                 chainlength += len;
2299                                 if (chainlength > sosendmaxchain)
2300                                         break;
2301                         }
2302                         /*
2303                          * Nothing left to send
2304                          */
2305                         if (num_needed == 0) {
2306                                 socket_lock(so, 0);
2307                                 break;
2308                         }
2309                         /*
2310                          * Allocate the mbuf packets at once
2311                          */
2312                         freelist = m_allocpacket_internal(
2313                             (unsigned int *)&num_needed,
2314                             maxpktlen, NULL, M_WAIT, 1, 0);
2315
2316                         if (freelist == NULL) {
2317                                 socket_lock(so, 0);
2318                                 error = ENOMEM;
2319                                 goto release;
2320                         }
2321                         /*
2322                          * Copy each uio of the set into its own mbuf packet
2323                          */
2324                         for (i = uiofirst, m = freelist;
2325                             i < uiolast && m != NULL;
2326                             i++) {
2327                                 int bytes_to_copy;
2328                                 struct mbuf *n;
2329                                 struct uio *auio = uioarray[i];
2330
2331                                 bytes_to_copy = uio_resid(auio);
2332
2333                                 /* Do nothing for empty messages */
2334                                 if (bytes_to_copy == 0)
2335                                         continue;
2336
2337                                 for (n = m; n != NULL; n = n->m_next) {
2338                                         mlen = mbuf_maxlen(n);
2339
2340                                         len = imin(mlen, bytes_to_copy);
2341
2342                                         /*
2343                                          * Note: uiomove() decrements the iovec
2344                                          * length
2345                                          */
2346                                         error = uiomove(mtod(n, caddr_t),
2347                                             len, auio);
2348                                         if (error != 0)
2349                                                 break;
2350                                         n->m_len = len;
2351                                         m->m_pkthdr.len += len;
2352
2353                                         VERIFY(m->m_pkthdr.len <= maxpktlen);
2354
2355                                         bytes_to_copy -= len;
2356                                         resid -= len;
2357                                 }
2358                                 if (m->m_pkthdr.len == 0) {
2359                                         printf("%s so %llx pkt %llx len null\n",
2360                                             __func__,
2361                                             (uint64_t)VM_KERNEL_ADDRPERM(so),
2362                                             (uint64_t)VM_KERNEL_ADDRPERM(m));
2363                                 }
2364                                 if (error != 0)
2365                                         break;
2366                                 m = m->m_nextpkt;
2367                         }
2368
2369                         socket_lock(so, 0);
2370
2371                         if (error)
2372                                 goto release;
2373                         top = freelist;
2374                         freelist = NULL;
2375                 }
2376
2377                 if (dontroute)
2378                         so->so_options |= SO_DONTROUTE;
2379
2380                 if ((flags & MSG_SKIPCFIL) == 0) {
2381                         struct mbuf **prevnextp = NULL;
2382
2383                         for (i = uiofirst, m = top;
2384                             i < uiolast && m != NULL;
2385                             i++) {
2386                                 struct mbuf *nextpkt = m->m_nextpkt;
2387
2388                                 /*
2389                                  * Socket filter processing
2390                                  */
2391                                 error = sflt_data_out(so, addr, &m,
2392                                     &control, 0);
2393                                 if (error != 0 && error != EJUSTRETURN)
2394                                         goto release;
2395
2396 #if CONTENT_FILTER
2397                                 if (error == 0) {
2398                                         /*
2399                                          * Content filter processing
2400                                          */
2401                                         error = cfil_sock_data_out(so, addr, m,
2402                                            control, 0);
2403                                         if (error != 0 && error != EJUSTRETURN)
2404                                                 goto release;
2405                                 }
2406 #endif /* CONTENT_FILTER */
2407                                 /*
2408                                  * Remove packet from the list when
2409                                  * swallowed by a filter
2410                                  */
2411                                 if (error == EJUSTRETURN) {
2412                                         error = 0;
2413                                         if (prevnextp != NULL)
2414                                                 *prevnextp = nextpkt;
2415                                         else
2416                                                 top = nextpkt;
2417                                 }
2418
2419                                 m = nextpkt;
2420                                 if (m != NULL)
2421                                         prevnextp = &m->m_nextpkt;
2422                         }
2423                 }
2424                 if (top != NULL)
2425                         error = (*so->so_proto->pr_usrreqs->pru_send_list)
2426                             (so, 0, top, addr, control, p);
2427
2428                 if (dontroute)
2429                         so->so_options &= ~SO_DONTROUTE;
2430
2431                 clen = 0;
2432                 top = NULL;
2433                 uiofirst = uiolast;
2434         } while (resid > 0 && error == 0);
2435 release:
2436         if (sblocked)
2437                 sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
2438         else
2439                 socket_unlock(so, 1);
2440 out:
2441         if (top != NULL)
2442                 m_freem(top);
2443         if (control != NULL)
2444                 m_freem(control);
2445         if (freelist != NULL)
2446                 m_freem_list(freelist);
2447
2448         KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2449             so->so_snd.sb_cc, 0, error);
2450
2451         return (error);
2452 }
2453
2454 /*
2455  * Implement receive operations on a socket.
2456  * We depend on the way that records are added to the sockbuf
2457  * by sbappend*.  In particular, each record (mbufs linked through m_next)
2458  * must begin with an address if the protocol so specifies,
2459  * followed by an optional mbuf or mbufs containing ancillary data,
2460  * and then zero or more mbufs of data.
2461  * In order to avoid blocking network interrupts for the entire time here,
2462  * we splx() while doing the actual copy to user space.
2463  * Although the sockbuf is locked, new data may still be appended,
2464  * and thus we must maintain consistency of the sockbuf during that time.
2465  *
2466  * The caller may receive the data as a single mbuf chain by supplying
2467  * an mbuf **mp0 for use in returning the chain.  The uio is then used
2468  * only for the count in uio_resid.
2469  *
2470  * Returns:     0                       Success
2471  *              ENOBUFS
2472  *              ENOTCONN
2473  *              EWOULDBLOCK
2474  *      uiomove:EFAULT
2475  *      sblock:EWOULDBLOCK
2476  *      sblock:EINTR
2477  *      sbwait:EBADF
2478  *      sbwait:EINTR
2479  *      sodelayed_copy:EFAULT
2480  *      <pru_rcvoob>:EINVAL[TCP]
2481  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
2482  *      <pru_rcvoob>:???
2483  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2484  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2485  *      <pr_domain->dom_externalize>:???
2486  *
2487  * Notes:       Additional return values from calls through <pru_rcvoob> and
2488  *              <pr_domain->dom_externalize> depend on protocols other than
2489  *              TCP or AF_UNIX, which are documented above.
2490  */
2491 int
2492 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2493     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2494 {
2495         struct mbuf *m, **mp, *ml = NULL;
2496         struct mbuf *nextrecord, *free_list;
2497         int flags, error, offset;
2498         user_ssize_t len;
2499         struct protosw *pr = so->so_proto;
2500         int moff, type =0;
2501         user_ssize_t orig_resid = uio_resid(uio);
2502         user_ssize_t delayed_copy_len;
2503         int can_delay;
2504         int need_event;
2505         struct proc *p = current_proc();
2506
2507         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2508             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2509
2510         /*
2511          * Sanity check on the length passed by caller as we are making 'int'
2512          * comparisons
2513          */
2514         if (orig_resid < 0 || orig_resid > INT_MAX)
2515                 return (EINVAL);
2516
2517         socket_lock(so, 1);
2518         so_update_last_owner_locked(so, p);
2519         so_update_policy(so);
2520
2521 #ifdef MORE_LOCKING_DEBUG
2522         if (so->so_usecount == 1) {
2523                 panic("%s: so=%x no other reference on socket\n", __func__, so);
2524                 /* NOTREACHED */
2525         }
2526 #endif
2527         mp = mp0;
2528         if (psa != NULL)
2529                 *psa = NULL;
2530         if (controlp != NULL)
2531                 *controlp = NULL;
2532         if (flagsp != NULL)
2533                 flags = *flagsp &~ MSG_EOR;
2534         else
2535                 flags = 0;
2536
2537         /*
2538          * If a recv attempt is made on a previously-accepted socket
2539          * that has been marked as inactive (disconnected), reject
2540          * the request.
2541          */
2542         if (so->so_flags & SOF_DEFUNCT) {
2543                 struct sockbuf *sb = &so->so_rcv;
2544
2545                 error = ENOTCONN;
2546                 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2547                     __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2548                     SOCK_DOM(so), SOCK_TYPE(so), error));
2549                 /*
2550                  * This socket should have been disconnected and flushed
2551                  * prior to being returned from sodefunct(); there should
2552                  * be no data on its receive list, so panic otherwise.
2553                  */
2554                 if (so->so_state & SS_DEFUNCT)
2555                         sb_empty_assert(sb, __func__);
2556                 socket_unlock(so, 1);
2557                 return (error);
2558         }
2559
2560         /*
2561          * When SO_WANTOOBFLAG is set we try to get out-of-band data
2562          * regardless of the flags argument. Here is the case where
2563          * out-of-band data is not inline.
2564          */
2565         if ((flags & MSG_OOB) ||
2566             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2567             (so->so_options & SO_OOBINLINE) == 0 &&
2568             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2569                 m = m_get(M_WAIT, MT_DATA);
2570                 if (m == NULL) {
2571                         socket_unlock(so, 1);
2572                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2573                             ENOBUFS, 0, 0, 0, 0);
2574                         return (ENOBUFS);
2575                 }
2576                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2577                 if (error)
2578                         goto bad;
2579                 socket_unlock(so, 0);
2580                 do {
2581                         error = uiomove(mtod(m, caddr_t),
2582                             imin(uio_resid(uio), m->m_len), uio);
2583                         m = m_free(m);
2584                 } while (uio_resid(uio) && error == 0 && m != NULL);
2585                 socket_lock(so, 0);
2586 bad:
2587                 if (m != NULL)
2588                         m_freem(m);
2589
2590                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2591                         if (error == EWOULDBLOCK || error == EINVAL) {
2592                                 /*
2593                                  * Let's try to get normal data:
2594                                  * EWOULDBLOCK: out-of-band data not
2595                                  * received yet. EINVAL: out-of-band data
2596                                  * already read.
2597                                  */
2598                                 error = 0;
2599                                 goto nooob;
2600                         } else if (error == 0 && flagsp != NULL) {
2601                                 *flagsp |= MSG_OOB;
2602                         }
2603                 }
2604                 socket_unlock(so, 1);
2605                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2606                     0, 0, 0, 0);
2607
2608                 return (error);
2609         }
2610 nooob:
2611         if (mp != NULL)
2612                 *mp = NULL;
2613
2614         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
2615                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2616         }
2617
2618         free_list = NULL;
2619         delayed_copy_len = 0;
2620 restart:
2621 #ifdef MORE_LOCKING_DEBUG
2622         if (so->so_usecount <= 1)
2623                 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
2624                     (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
2625 #endif
2626         /*
2627          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2628          * and if so just return to the caller.  This could happen when
2629          * soreceive() is called by a socket upcall function during the
2630          * time the socket is freed.  The socket buffer would have been
2631          * locked across the upcall, therefore we cannot put this thread
2632          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2633          * we may livelock), because the lock on the socket buffer will
2634          * only be released when the upcall routine returns to its caller.
2635          * Because the socket has been officially closed, there can be
2636          * no further read on it.
2637          *
2638          * A multipath subflow socket would have its SS_NOFDREF set by
2639          * default, so check for SOF_MP_SUBFLOW socket flag; when the
2640          * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2641          */
2642         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2643             (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2644                 socket_unlock(so, 1);
2645                 return (0);
2646         }
2647
2648         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2649         if (error) {
2650                 socket_unlock(so, 1);
2651                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2652                     0, 0, 0, 0);
2653                 return (error);
2654         }
2655
2656         m = so->so_rcv.sb_mb;
2657         /*
2658          * If we have less data than requested, block awaiting more
2659          * (subject to any timeout) if:
2660          *   1. the current count is less than the low water mark, or
2661          *   2. MSG_WAITALL is set, and it is possible to do the entire
2662          *      receive operation at once if we block (resid <= hiwat).
2663          *   3. MSG_DONTWAIT is not set
2664          * If MSG_WAITALL is set but resid is larger than the receive buffer,
2665          * we have to do the receive in sections, and thus risk returning
2666          * a short count if a timeout or signal occurs after we start.
2667          */
2668         if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2669             so->so_rcv.sb_cc < uio_resid(uio)) &&
2670             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2671             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2672             m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2673                 /*
2674                  * Panic if we notice inconsistencies in the socket's
2675                  * receive list; both sb_mb and sb_cc should correctly
2676                  * reflect the contents of the list, otherwise we may
2677                  * end up with false positives during select() or poll()
2678                  * which could put the application in a bad state.
2679                  */
2680                 SB_MB_CHECK(&so->so_rcv);
2681
2682                 if (so->so_error) {
2683                         if (m != NULL)
2684                                 goto dontblock;
2685                         error = so->so_error;
2686                         if ((flags & MSG_PEEK) == 0)
2687                                 so->so_error = 0;
2688                         goto release;
2689                 }
2690                 if (so->so_state & SS_CANTRCVMORE) {
2691 #if CONTENT_FILTER
2692                         /*
2693                          * Deal with half closed connections
2694                          */
2695                         if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2696                                 cfil_sock_data_pending(&so->so_rcv) != 0)
2697                                 CFIL_LOG(LOG_INFO,
2698                                         "so %llx ignore SS_CANTRCVMORE",
2699                                         (uint64_t)VM_KERNEL_ADDRPERM(so));
2700                         else
2701 #endif /* CONTENT_FILTER */
2702                         if (m != NULL)
2703                                 goto dontblock;
2704                         else
2705                                 goto release;
2706                 }
2707                 for (; m != NULL; m = m->m_next)
2708                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2709                                 m = so->so_rcv.sb_mb;
2710                                 goto dontblock;
2711                         }
2712                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2713                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2714                         error = ENOTCONN;
2715                         goto release;
2716                 }
2717                 if (uio_resid(uio) == 0)
2718                         goto release;
2719                 if ((so->so_state & SS_NBIO) ||
2720                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2721                         error = EWOULDBLOCK;
2722                         goto release;
2723                 }
2724                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2725                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2726                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
2727 #if EVEN_MORE_LOCKING_DEBUG
2728                 if (socket_debug)
2729                         printf("Waiting for socket data\n");
2730 #endif
2731
2732                 error = sbwait(&so->so_rcv);
2733 #if EVEN_MORE_LOCKING_DEBUG
2734                 if (socket_debug)
2735                         printf("SORECEIVE - sbwait returned %d\n", error);
2736 #endif
2737                 if (so->so_usecount < 1) {
2738                         panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2739                             __func__, so, so->so_usecount);
2740                         /* NOTREACHED */
2741                 }
2742                 if (error) {
2743                         socket_unlock(so, 1);
2744                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2745                             0, 0, 0, 0);
2746                         return (error);
2747                 }
2748                 goto restart;
2749         }
2750 dontblock:
2751         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2752         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2753         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2754         nextrecord = m->m_nextpkt;
2755         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2756                 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2757 #if CONFIG_MACF_SOCKET_SUBSET
2758                 /*
2759                  * Call the MAC framework for policy checking if we're in
2760                  * the user process context and the socket isn't connected.
2761                  */
2762                 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2763                         struct mbuf *m0 = m;
2764                         /*
2765                          * Dequeue this record (temporarily) from the receive
2766                          * list since we're about to drop the socket's lock
2767                          * where a new record may arrive and be appended to
2768                          * the list.  Upon MAC policy failure, the record
2769                          * will be freed.  Otherwise, we'll add it back to
2770                          * the head of the list.  We cannot rely on SB_LOCK
2771                          * because append operation uses the socket's lock.
2772                          */
2773                         do {
2774                                 m->m_nextpkt = NULL;
2775                                 sbfree(&so->so_rcv, m);
2776                                 m = m->m_next;
2777                         } while (m != NULL);
2778                         m = m0;
2779                         so->so_rcv.sb_mb = nextrecord;
2780                         SB_EMPTY_FIXUP(&so->so_rcv);
2781                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2782                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2783                         socket_unlock(so, 0);
2784
2785                         if (mac_socket_check_received(proc_ucred(p), so,
2786                             mtod(m, struct sockaddr *)) != 0) {
2787                                 /*
2788                                  * MAC policy failure; free this record and
2789                                  * process the next record (or block until
2790                                  * one is available).  We have adjusted sb_cc
2791                                  * and sb_mbcnt above so there is no need to
2792                                  * call sbfree() again.
2793                                  */
2794                                 do {
2795                                         m = m_free(m);
2796                                 } while (m != NULL);
2797                                 /*
2798                                  * Clear SB_LOCK but don't unlock the socket.
2799                                  * Process the next record or wait for one.
2800                                  */
2801                                 socket_lock(so, 0);
2802                                 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2803                                 goto restart;
2804                         }
2805                         socket_lock(so, 0);
2806                         /*
2807                          * If the socket has been defunct'd, drop it.
2808                          */
2809                         if (so->so_flags & SOF_DEFUNCT) {
2810                                 m_freem(m);
2811                                 error = ENOTCONN;
2812                                 goto release;
2813                         }
2814                         /*
2815                          * Re-adjust the socket receive list and re-enqueue
2816                          * the record in front of any packets which may have
2817                          * been appended while we dropped the lock.
2818                          */
2819                         for (m = m0; m->m_next != NULL; m = m->m_next)
2820                                 sballoc(&so->so_rcv, m);
2821                         sballoc(&so->so_rcv, m);
2822                         if (so->so_rcv.sb_mb == NULL) {
2823                                 so->so_rcv.sb_lastrecord = m0;
2824                                 so->so_rcv.sb_mbtail = m;
2825                         }
2826                         m = m0;
2827                         nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2828                         so->so_rcv.sb_mb = m;
2829                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2830                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2831                 }
2832 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2833                 orig_resid = 0;
2834                 if (psa != NULL) {
2835                         *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2836                             mp0 == NULL);
2837                         if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2838                                 error = EWOULDBLOCK;
2839                                 goto release;
2840                         }
2841                 }
2842                 if (flags & MSG_PEEK) {
2843                         m = m->m_next;
2844                 } else {
2845                         sbfree(&so->so_rcv, m);
2846                         if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2847                                 panic("%s: about to create invalid socketbuf",
2848                                     __func__);
2849                                 /* NOTREACHED */
2850                         }
2851                         MFREE(m, so->so_rcv.sb_mb);
2852                         m = so->so_rcv.sb_mb;
2853                         if (m != NULL) {
2854                                 m->m_nextpkt = nextrecord;
2855                         } else {
2856                                 so->so_rcv.sb_mb = nextrecord;
2857                                 SB_EMPTY_FIXUP(&so->so_rcv);
2858                         }
2859                 }
2860         }
2861
2862         /*
2863          * Process one or more MT_CONTROL mbufs present before any data mbufs
2864          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2865          * just copy the data; if !MSG_PEEK, we call into the protocol to
2866          * perform externalization.
2867          */
2868         if (m != NULL && m->m_type == MT_CONTROL) {
2869                 struct mbuf *cm = NULL, *cmn;
2870                 struct mbuf **cme = &cm;
2871                 struct sockbuf *sb_rcv = &so->so_rcv;
2872                 struct mbuf **msgpcm = NULL;
2873
2874                 /*
2875                  * Externalizing the control messages would require us to
2876                  * drop the socket's lock below.  Once we re-acquire the
2877                  * lock, the mbuf chain might change.  In order to preserve
2878                  * consistency, we unlink all control messages from the
2879                  * first mbuf chain in one shot and link them separately
2880                  * onto a different chain.
2881                  */
2882                 do {
2883                         if (flags & MSG_PEEK) {
2884                                 if (controlp != NULL) {
2885                                         if (*controlp == NULL) {
2886                                                 msgpcm = controlp;
2887                                         }
2888                                         *controlp = m_copy(m, 0, m->m_len);
2889
2890                                         /*
2891                                          * If we failed to allocate an mbuf,
2892                                          * release any previously allocated
2893                                          * mbufs for control data. Return
2894                                          * an error. Keep the mbufs in the
2895                                          * socket as this is using
2896                                          * MSG_PEEK flag.
2897                                          */
2898                                         if (*controlp == NULL) {
2899                                                 m_freem(*msgpcm);
2900                                                 error = ENOBUFS;
2901                                                 goto release;
2902                                         }
2903                                         controlp = &(*controlp)->m_next;
2904                                 }
2905                                 m = m->m_next;
2906                         } else {
2907                                 m->m_nextpkt = NULL;
2908                                 sbfree(sb_rcv, m);
2909                                 sb_rcv->sb_mb = m->m_next;
2910                                 m->m_next = NULL;
2911                                 *cme = m;
2912                                 cme = &(*cme)->m_next;
2913                                 m = sb_rcv->sb_mb;
2914                         }
2915                 } while (m != NULL && m->m_type == MT_CONTROL);
2916
2917                 if (!(flags & MSG_PEEK)) {
2918                         if (sb_rcv->sb_mb != NULL) {
2919                                 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2920                         } else {
2921                                 sb_rcv->sb_mb = nextrecord;
2922                                 SB_EMPTY_FIXUP(sb_rcv);
2923                         }
2924                         if (nextrecord == NULL)
2925                                 sb_rcv->sb_lastrecord = m;
2926                 }
2927
2928                 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2929                 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2930
2931                 while (cm != NULL) {
2932                         int cmsg_type;
2933
2934                         cmn = cm->m_next;
2935                         cm->m_next = NULL;
2936                         cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2937
2938                         /*
2939                          * Call the protocol to externalize SCM_RIGHTS message
2940                          * and return the modified message to the caller upon
2941                          * success.  Otherwise, all other control messages are
2942                          * returned unmodified to the caller.  Note that we
2943                          * only get into this loop if MSG_PEEK is not set.
2944                          */
2945                         if (pr->pr_domain->dom_externalize != NULL &&
2946                             cmsg_type == SCM_RIGHTS) {
2947                                 /*
2948                                  * Release socket lock: see 3903171.  This
2949                                  * would also allow more records to be appended
2950                                  * to the socket buffer.  We still have SB_LOCK
2951                                  * set on it, so we can be sure that the head
2952                                  * of the mbuf chain won't change.
2953                                  */
2954                                 socket_unlock(so, 0);
2955                                 error = (*pr->pr_domain->dom_externalize)(cm);
2956                                 socket_lock(so, 0);
2957                         } else {
2958                                 error = 0;
2959                         }
2960
2961                         if (controlp != NULL && error == 0) {
2962                                 *controlp = cm;
2963                                 controlp = &(*controlp)->m_next;
2964                                 orig_resid = 0;
2965                         } else {
2966                                 (void) m_free(cm);
2967                         }
2968                         cm = cmn;
2969                 }
2970                 /*
2971                  * Update the value of nextrecord in case we received new
2972                  * records when the socket was unlocked above for
2973                  * externalizing SCM_RIGHTS.
2974                  */
2975                 if (m != NULL)
2976                         nextrecord = sb_rcv->sb_mb->m_nextpkt;
2977                 else
2978                         nextrecord = sb_rcv->sb_mb;
2979                 orig_resid = 0;
2980         }
2981
2982         /*
2983          * If the socket is a TCP socket with message delivery
2984          * enabled, then create a control msg to deliver the
2985          * relative TCP sequence number for this data. Waiting
2986          * until this point will protect against failures to
2987          * allocate an mbuf for control msgs.
2988          */
2989         if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2990             (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2991                 struct mbuf *seq_cm;
2992
2993                 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2994                     sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2995                 if (seq_cm == NULL) {
2996                         /* unable to allocate a control mbuf */
2997                         error = ENOBUFS;
2998                         goto release;
2999                 }
3000                 *controlp = seq_cm;
3001                 controlp = &seq_cm->m_next;
3002         }
3003
3004         if (m != NULL) {
3005                 if (!(flags & MSG_PEEK)) {
3006                         /*
3007                          * We get here because m points to an mbuf following
3008                          * any MT_SONAME or MT_CONTROL mbufs which have been
3009                          * processed above.  In any case, m should be pointing
3010                          * to the head of the mbuf chain, and the nextrecord
3011                          * should be either NULL or equal to m->m_nextpkt.
3012                          * See comments above about SB_LOCK.
3013                          */
3014                         if (m != so->so_rcv.sb_mb ||
3015                             m->m_nextpkt != nextrecord) {
3016                                 panic("%s: post-control !sync so=%p m=%p "
3017                                     "nextrecord=%p\n", __func__, so, m,
3018                                     nextrecord);
3019                                 /* NOTREACHED */
3020                         }
3021                         if (nextrecord == NULL)
3022                                 so->so_rcv.sb_lastrecord = m;
3023                 }
3024                 type = m->m_type;
3025                 if (type == MT_OOBDATA)
3026                         flags |= MSG_OOB;
3027         } else {
3028                 if (!(flags & MSG_PEEK)) {
3029                         SB_EMPTY_FIXUP(&so->so_rcv);
3030                 }
3031         }
3032         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3033         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3034
3035         moff = 0;
3036         offset = 0;
3037
3038         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3039                 can_delay = 1;
3040         else
3041                 can_delay = 0;
3042
3043         need_event = 0;
3044
3045         while (m != NULL &&
3046             (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3047                 if (m->m_type == MT_OOBDATA) {
3048                         if (type != MT_OOBDATA)
3049                                 break;
3050                 } else if (type == MT_OOBDATA) {
3051                         break;
3052                 }
3053                 /*
3054                  * Make sure to always set MSG_OOB event when getting
3055                  * out of band data inline.
3056                  */
3057                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3058                     (so->so_options & SO_OOBINLINE) != 0 &&
3059                     (so->so_state & SS_RCVATMARK) != 0) {
3060                         flags |= MSG_OOB;
3061                 }
3062                 so->so_state &= ~SS_RCVATMARK;
3063                 len = uio_resid(uio) - delayed_copy_len;
3064                 if (so->so_oobmark && len > so->so_oobmark - offset)
3065                         len = so->so_oobmark - offset;
3066                 if (len > m->m_len - moff)
3067                         len = m->m_len - moff;
3068                 /*
3069                  * If mp is set, just pass back the mbufs.
3070                  * Otherwise copy them out via the uio, then free.
3071                  * Sockbuf must be consistent here (points to current mbuf,
3072                  * it points to next record) when we drop priority;
3073                  * we must note any additions to the sockbuf when we
3074                  * block interrupts again.
3075                  */
3076                 if (mp == NULL) {
3077                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3078                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3079                         if (can_delay && len == m->m_len) {
3080                                 /*
3081                                  * only delay the copy if we're consuming the
3082                                  * mbuf and we're NOT in MSG_PEEK mode
3083                                  * and we have enough data to make it worthwhile
3084                                  * to drop and retake the lock... can_delay
3085                                  * reflects the state of the 2 latter
3086                                  * constraints; moff should always be zero
3087                                  * in these cases
3088                                  */
3089                                 delayed_copy_len += len;
3090                         } else {
3091                                 if (delayed_copy_len) {
3092                                         error = sodelayed_copy(so, uio,
3093                                             &free_list, &delayed_copy_len);
3094
3095                                         if (error) {
3096                                                 goto release;
3097                                         }
3098                                         /*
3099                                          * can only get here if MSG_PEEK is not
3100                                          * set therefore, m should point at the
3101                                          * head of the rcv queue; if it doesn't,
3102                                          * it means something drastically
3103                                          * changed while we were out from behind
3104                                          * the lock in sodelayed_copy. perhaps
3105                                          * a RST on the stream. in any event,
3106                                          * the stream has been interrupted. it's
3107                                          * probably best just to return whatever
3108                                          * data we've moved and let the caller
3109                                          * sort it out...
3110                                          */
3111                                         if (m != so->so_rcv.sb_mb) {
3112                                                 break;
3113                                         }
3114                                 }
3115                                 socket_unlock(so, 0);
3116                                 error = uiomove(mtod(m, caddr_t) + moff,
3117                                     (int)len, uio);
3118                                 socket_lock(so, 0);
3119
3120                                 if (error)
3121                                         goto release;
3122                         }
3123                 } else {
3124                         uio_setresid(uio, (uio_resid(uio) - len));
3125                 }
3126                 if (len == m->m_len - moff) {
3127                         if (m->m_flags & M_EOR)
3128                                 flags |= MSG_EOR;
3129                         if (flags & MSG_PEEK) {
3130                                 m = m->m_next;
3131                                 moff = 0;
3132                         } else {
3133                                 nextrecord = m->m_nextpkt;
3134                                 sbfree(&so->so_rcv, m);
3135                                 m->m_nextpkt = NULL;
3136
3137                                 /*
3138                                  * If this packet is an unordered packet
3139                                  * (indicated by M_UNORDERED_DATA flag), remove
3140                                  * the additional bytes added to the
3141                                  * receive socket buffer size.
3142                                  */
3143                                 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3144                                     m->m_len &&
3145                                     (m->m_flags & M_UNORDERED_DATA) &&
3146                                     sbreserve(&so->so_rcv,
3147                                     so->so_rcv.sb_hiwat - m->m_len)) {
3148                                         if (so->so_msg_state->msg_uno_bytes >
3149                                             m->m_len) {
3150                                                 so->so_msg_state->
3151                                                     msg_uno_bytes -= m->m_len;
3152                                         } else {
3153                                                 so->so_msg_state->
3154                                                     msg_uno_bytes = 0;
3155                                         }
3156                                         m->m_flags &= ~M_UNORDERED_DATA;
3157                                 }
3158
3159                                 if (mp != NULL) {
3160                                         *mp = m;
3161                                         mp = &m->m_next;
3162                                         so->so_rcv.sb_mb = m = m->m_next;
3163                                         *mp = NULL;
3164                                 } else {
3165                                         if (free_list == NULL)
3166                                                 free_list = m;
3167                                         else
3168                                                 ml->m_next = m;
3169                                         ml = m;
3170                                         so->so_rcv.sb_mb = m = m->m_next;
3171                                         ml->m_next = NULL;
3172                                 }
3173                                 if (m != NULL) {
3174                                         m->m_nextpkt = nextrecord;
3175                                         if (nextrecord == NULL)
3176                                                 so->so_rcv.sb_lastrecord = m;
3177                                 } else {
3178                                         so->so_rcv.sb_mb = nextrecord;
3179                                         SB_EMPTY_FIXUP(&so->so_rcv);
3180                                 }
3181                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3182                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3183                         }
3184                 } else {
3185                         if (flags & MSG_PEEK) {
3186                                 moff += len;
3187                         } else {
3188                                 if (mp != NULL) {
3189                                         int copy_flag;
3190
3191                                         if (flags & MSG_DONTWAIT)
3192                                                 copy_flag = M_DONTWAIT;
3193                                         else
3194                                                 copy_flag = M_WAIT;
3195                                         *mp = m_copym(m, 0, len, copy_flag);
3196                                         /*
3197                                          * Failed to allocate an mbuf?
3198                                          * Adjust uio_resid back, it was
3199                                          * adjusted down by len bytes which
3200                                          * we didn't copy over.
3201                                          */
3202                                         if (*mp == NULL) {
3203                                                 uio_setresid(uio,
3204                                                     (uio_resid(uio) + len));
3205                                                 break;
3206                                         }
3207                                 }
3208                                 m->m_data += len;
3209                                 m->m_len -= len;
3210                                 so->so_rcv.sb_cc -= len;
3211                         }
3212                 }
3213                 if (so->so_oobmark) {
3214                         if ((flags & MSG_PEEK) == 0) {
3215                                 so->so_oobmark -= len;
3216                                 if (so->so_oobmark == 0) {
3217                                         so->so_state |= SS_RCVATMARK;
3218                                         /*
3219                                          * delay posting the actual event until
3220                                          * after any delayed copy processing
3221                                          * has finished
3222                                          */
3223                                         need_event = 1;
3224                                         break;
3225                                 }
3226                         } else {
3227                                 offset += len;
3228                                 if (offset == so->so_oobmark)
3229                                         break;
3230                         }
3231                 }
3232                 if (flags & MSG_EOR)
3233                         break;
3234                 /*
3235                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3236                  * (for non-atomic socket), we must not quit until
3237                  * "uio->uio_resid == 0" or an error termination.
3238                  * If a signal/timeout occurs, return with a short
3239                  * count but without error.  Keep sockbuf locked
3240                  * against other readers.
3241                  */
3242                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3243                     (uio_resid(uio) - delayed_copy_len) > 0 &&
3244                     !sosendallatonce(so) && !nextrecord) {
3245                         if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3246 #if CONTENT_FILTER
3247                             && cfil_sock_data_pending(&so->so_rcv) == 0
3248 #endif /* CONTENT_FILTER */
3249                             ))
3250                                 goto release;
3251
3252                         /*
3253                          * Depending on the protocol (e.g. TCP), the following
3254                          * might cause the socket lock to be dropped and later
3255                          * be reacquired, and more data could have arrived and
3256                          * have been appended to the receive socket buffer by
3257                          * the time it returns.  Therefore, we only sleep in
3258                          * sbwait() below if and only if the socket buffer is
3259                          * empty, in order to avoid a false sleep.
3260                          */
3261                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3262                             (((struct inpcb *)so->so_pcb)->inp_state !=
3263                             INPCB_STATE_DEAD))
3264                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3265
3266                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3267                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3268
3269                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3270                                 error = 0;
3271                                 goto release;
3272                         }
3273                         /*
3274                          * have to wait until after we get back from the sbwait
3275                          * to do the copy because we will drop the lock if we
3276                          * have enough data that has been delayed... by dropping
3277                          * the lock we open up a window allowing the netisr
3278                          * thread to process the incoming packets and to change
3279                          * the state of this socket... we're issuing the sbwait
3280                          * because the socket is empty and we're expecting the
3281                          * netisr thread to wake us up when more packets arrive;
3282                          * if we allow that processing to happen and then sbwait
3283                          * we could stall forever with packets sitting in the
3284                          * socket if no further packets arrive from the remote
3285                          * side.
3286                          *
3287                          * we want to copy before we've collected all the data
3288                          * to satisfy this request to allow the copy to overlap
3289                          * the incoming packet processing on an MP system
3290                          */
3291                         if (delayed_copy_len > sorecvmincopy &&
3292                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3293                                 error = sodelayed_copy(so, uio,
3294                                     &free_list, &delayed_copy_len);
3295
3296                                 if (error)
3297                                         goto release;
3298                         }
3299                         m = so->so_rcv.sb_mb;
3300                         if (m != NULL) {
3301                                 nextrecord = m->m_nextpkt;
3302                         }
3303                         SB_MB_CHECK(&so->so_rcv);
3304                 }
3305         }
3306 #ifdef MORE_LOCKING_DEBUG
3307         if (so->so_usecount <= 1) {
3308                 panic("%s: after big while so=%p ref=%d on socket\n",
3309                     __func__, so, so->so_usecount);
3310                 /* NOTREACHED */
3311         }
3312 #endif
3313
3314         if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3315                 if (so->so_options & SO_DONTTRUNC) {
3316                         flags |= MSG_RCVMORE;
3317                 } else {
3318                         flags |= MSG_TRUNC;
3319                         if ((flags & MSG_PEEK) == 0)
3320                                 (void) sbdroprecord(&so->so_rcv);
3321                 }
3322         }
3323
3324         /*
3325          * pru_rcvd below (for TCP) may cause more data to be received
3326          * if the socket lock is dropped prior to sending the ACK; some
3327          * legacy OpenTransport applications don't handle this well
3328          * (if it receives less data than requested while MSG_HAVEMORE
3329          * is set), and so we set the flag now based on what we know
3330          * prior to calling pru_rcvd.
3331          */
3332         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3333                 flags |= MSG_HAVEMORE;
3334
3335         if ((flags & MSG_PEEK) == 0) {
3336                 if (m == NULL) {
3337                         so->so_rcv.sb_mb = nextrecord;
3338                         /*
3339                          * First part is an inline SB_EMPTY_FIXUP().  Second
3340                          * part makes sure sb_lastrecord is up-to-date if
3341                          * there is still data in the socket buffer.
3342                          */
3343                         if (so->so_rcv.sb_mb == NULL) {
3344                                 so->so_rcv.sb_mbtail = NULL;
3345                                 so->so_rcv.sb_lastrecord = NULL;
3346                         } else if (nextrecord->m_nextpkt == NULL) {
3347                                 so->so_rcv.sb_lastrecord = nextrecord;
3348                         }
3349                         SB_MB_CHECK(&so->so_rcv);
3350                 }
3351                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3352                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3353                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3354                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3355         }
3356
3357         if (delayed_copy_len) {
3358                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3359                 if (error)
3360                         goto release;
3361         }
3362         if (free_list != NULL) {
3363                 m_freem_list(free_list);
3364                 free_list = NULL;
3365         }
3366         if (need_event)
3367                 postevent(so, 0, EV_OOB);
3368
3369         if (orig_resid == uio_resid(uio) && orig_resid &&
3370             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3371                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3372                 goto restart;
3373         }
3374
3375         if (flagsp != NULL)
3376                 *flagsp |= flags;
3377 release:
3378 #ifdef MORE_LOCKING_DEBUG
3379         if (so->so_usecount <= 1) {
3380                 panic("%s: release so=%p ref=%d on socket\n", __func__,
3381                     so, so->so_usecount);
3382                 /* NOTREACHED */
3383         }
3384 #endif
3385         if (delayed_copy_len)
3386                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3387
3388         if (free_list != NULL)
3389                 m_freem_list(free_list);
3390
3391         sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
3392
3393         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3394             so->so_rcv.sb_cc, 0, error);
3395
3396         return (error);
3397 }
3398
3399 /*
3400  * Returns:     0                       Success
3401  *      uiomove:EFAULT
3402  */
3403 static int
3404 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3405     user_ssize_t *resid)
3406 {
3407         int error = 0;
3408         struct mbuf *m;
3409
3410         m = *free_list;
3411
3412         socket_unlock(so, 0);
3413
3414         while (m != NULL && error == 0) {
3415                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3416                 m = m->m_next;
3417         }
3418         m_freem_list(*free_list);
3419
3420         *free_list = NULL;
3421         *resid = 0;
3422
3423         socket_lock(so, 0);
3424
3425         return (error);
3426 }
3427
3428 int
3429 soreceive_list(struct socket *so, struct sockaddr **psa, struct uio **uioarray,
3430         u_int uiocnt, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3431 {
3432         struct mbuf *m, **mp;
3433         struct mbuf *nextrecord;
3434         struct mbuf *ml = NULL, *free_list = NULL;
3435         int flags, error, offset;
3436         user_ssize_t len;
3437         struct protosw *pr = so->so_proto;
3438         user_ssize_t orig_resid, resid;
3439         struct proc *p = current_proc();
3440         struct uio *auio = NULL;
3441         int i = 0;
3442         int sblocked = 0;
3443
3444         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3445             so, uiocnt,
3446             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3447
3448         mp = mp0;
3449         if (psa != NULL)
3450                 *psa = NULL;
3451         if (controlp != NULL)
3452                 *controlp = NULL;
3453         if (flagsp != NULL)
3454                 flags = *flagsp &~ MSG_EOR;
3455         else
3456                 flags = 0;
3457         /*
3458          * Disallow functionality not currently supported
3459          */
3460         if (mp0 != NULL) {
3461                 printf("%s mp0 not supported\n", __func__);
3462                 error = EOPNOTSUPP;
3463                 goto out;
3464         }
3465         if (psa != NULL) {
3466                 printf("%s sockaddr not supported\n", __func__);
3467                 error = EOPNOTSUPP;
3468                 goto out;
3469         }
3470         if (controlp != NULL) {
3471                 printf("%s control not supported\n", __func__);
3472                 error = EOPNOTSUPP;
3473                 goto out;
3474         }
3475
3476         /*
3477          * Sanity checks:
3478          * - Only supports don't wait flags
3479          * - Only support datagram sockets (could be extended to raw)
3480          * - Must be atomic
3481          * - Protocol must support packet chains
3482          * - The uio array is NULL (should we panic?)
3483          */
3484         if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
3485                 printf("%s flags not supported\n", __func__);
3486                 error = EOPNOTSUPP;
3487                 goto out;
3488         }
3489         if (so->so_type != SOCK_DGRAM) {
3490                 error = EINVAL;
3491                 goto out;
3492         }
3493         if (sosendallatonce(so) == 0) {
3494                 error = EINVAL;
3495                 goto out;
3496         }
3497         if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3498                 error = EPROTONOSUPPORT;
3499                 goto out;
3500         }
3501         if (uioarray == NULL) {
3502                 printf("%s uioarray is NULL\n", __func__);
3503                 error = EINVAL;
3504                 goto out;
3505         }
3506         if (uiocnt == 0) {
3507                 printf("%s uiocnt is 0\n", __func__);
3508                 error = EINVAL;
3509                 goto out;
3510         }
3511         /*
3512          * Sanity check on the length passed by caller as we are making 'int'
3513          * comparisons
3514          */
3515         resid = orig_resid = uio_array_resid(uioarray, uiocnt);
3516         if (orig_resid < 0 || orig_resid > INT_MAX) {
3517                 error = EINVAL;
3518                 goto out;
3519         }
3520
3521         socket_lock(so, 1);
3522         so_update_last_owner_locked(so, p);
3523         so_update_policy(so);
3524
3525 #if NECP
3526         so_update_necp_policy(so, NULL, NULL);
3527 #endif /* NECP */
3528
3529         /*
3530          * If a recv attempt is made on a previously-accepted socket
3531          * that has been marked as inactive (disconnected), reject
3532          * the request.
3533          */
3534         if (so->so_flags & SOF_DEFUNCT) {
3535                 struct sockbuf *sb = &so->so_rcv;
3536
3537                 error = ENOTCONN;
3538                 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3539                     __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
3540                     SOCK_DOM(so), SOCK_TYPE(so), error));
3541                 /*
3542                  * This socket should have been disconnected and flushed
3543                  * prior to being returned from sodefunct(); there should
3544                  * be no data on its receive list, so panic otherwise.
3545                  */
3546                 if (so->so_state & SS_DEFUNCT)
3547                         sb_empty_assert(sb, __func__);
3548                 goto release;
3549         }
3550         if (mp != NULL)
3551                 *mp = NULL;
3552 restart:
3553         /*
3554          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3555          * and if so just return to the caller.  This could happen when
3556          * soreceive() is called by a socket upcall function during the
3557          * time the socket is freed.  The socket buffer would have been
3558          * locked across the upcall, therefore we cannot put this thread
3559          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3560          * we may livelock), because the lock on the socket buffer will
3561          * only be released when the upcall routine returns to its caller.
3562          * Because the socket has been officially closed, there can be
3563          * no further read on it.
3564          */
3565         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3566             (SS_NOFDREF | SS_CANTRCVMORE)) {
3567                 error = 0;
3568                 goto release;
3569         }
3570
3571         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3572         if (error) {
3573                 goto release;
3574         }
3575         sblocked = 1;
3576
3577         /*
3578          * Skip empty uio
3579          */
3580         auio = uioarray[i];
3581         while (uio_resid(auio) == 0) {
3582                 i++;
3583                 if (i >= uiocnt) {
3584                         error = 0;
3585                         goto release;
3586                 }
3587         }
3588
3589         m = so->so_rcv.sb_mb;
3590         /*
3591          * Block awaiting more datagram if needed
3592          */
3593         if (m == NULL) {
3594                 /*
3595                  * Panic if we notice inconsistencies in the socket's
3596                  * receive list; both sb_mb and sb_cc should correctly
3597                  * reflect the contents of the list, otherwise we may
3598                  * end up with false positives during select() or poll()
3599                  * which could put the application in a bad state.
3600                  */
3601                 SB_MB_CHECK(&so->so_rcv);
3602
3603                 if (so->so_error) {
3604                         error = so->so_error;
3605                         goto release;
3606                 }
3607                 if (so->so_state & SS_CANTRCVMORE) {
3608                         goto release;
3609                 }
3610                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3611                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3612                         error = ENOTCONN;
3613                         goto release;
3614                 }
3615                 if ((so->so_state & SS_NBIO) ||
3616                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3617                         error = EWOULDBLOCK;
3618                         goto release;
3619                 }
3620                 /*
3621                  * Do not block if we got some data
3622                  * Note: We could use MSG_WAITALL to wait
3623                  */
3624                 resid = uio_array_resid(uioarray, uiocnt);
3625                 if (resid != orig_resid) {
3626                         error = 0;
3627                         goto release;
3628                 }
3629
3630                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632
3633                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3634                 sblocked = 0;
3635
3636                 error = sbwait(&so->so_rcv);
3637                 if (error) {
3638                         goto release;
3639                 }
3640                 goto restart;
3641         }
3642
3643         if (m->m_pkthdr.len == 0) {
3644                 printf("%s so %llx pkt %llx len is null\n",
3645                         __func__,
3646                         (uint64_t)VM_KERNEL_ADDRPERM(so),
3647                         (uint64_t)VM_KERNEL_ADDRPERM(m));
3648                 goto restart;
3649         }
3650         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3651         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3652         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3653
3654         /*
3655          * Consume the current uio index as we have a datagram
3656          */
3657         i += 1;
3658         nextrecord = m->m_nextpkt;
3659
3660 #if SO_RECEIVE_LIST_SOCKADDR_NOT_YET
3661         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3662                 /*
3663                  * to be adapted from soreceive()
3664                  */
3665         }
3666 #endif /* SO_RECEIVE_LIST_SOCKADDR_NOT_YET */
3667
3668 #if SO_RECEIVE_LIST_CONTROL_NOT_YET
3669         /*
3670          * Process one or more MT_CONTROL mbufs present before any data mbufs
3671          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
3672          * just copy the data; if !MSG_PEEK, we call into the protocol to
3673          * perform externalization.
3674          */
3675         if (m != NULL && m->m_type == MT_CONTROL) {
3676                 /*
3677                  * to be adapted from soreceive()
3678                  */
3679         }
3680 #endif /* SO_RECEIVE_LIST_CONTROL_NOT_YET */
3681
3682         offset = 0;
3683
3684         /*
3685          * Loop to copy out the mbufs of the current record
3686          */
3687         while (m != NULL && uio_resid(auio) > 0 && error == 0) {
3688                 len = uio_resid(auio);
3689
3690                 if (m->m_len == 0)
3691                         printf("%s: so %llx m %llx m_len is 0\n",
3692                                 __func__,
3693                                 (uint64_t)VM_KERNEL_ADDRPERM(so),
3694                                 (uint64_t)VM_KERNEL_ADDRPERM(m));
3695
3696                 /*
3697                  * Clip to the residual length
3698                  */
3699                 if (len > m->m_len)
3700                         len = m->m_len;
3701                 /*
3702                  * If mp is set, just pass back the mbufs.
3703                  * Otherwise copy them out via the uio, then free.
3704                  * Sockbuf must be consistent here (points to current mbuf,
3705                  * it points to next record) when we drop priority;
3706                  * we must note any additions to the sockbuf when we
3707                  * block interrupts again.
3708                  */
3709                 if (mp != NULL) {
3710                         uio_setresid(auio, (uio_resid(auio) - len));
3711                 } else {
3712                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3713                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3714
3715                         socket_unlock(so, 0);
3716                         error = uiomove(mtod(m, caddr_t), (int)len, auio);
3717                         socket_lock(so, 0);
3718
3719                         if (error)
3720                                 goto release;
3721                 }
3722                 if (len == m->m_len) {
3723                         /*
3724                          * m was entirely copied
3725                          */
3726                         nextrecord = m->m_nextpkt;
3727                         sbfree(&so->so_rcv, m);
3728                         m->m_nextpkt = NULL;
3729
3730                         /*
3731                          * Move to m_next
3732                          */
3733                         if (mp != NULL) {
3734                                 *mp = m;
3735                                 mp = &m->m_next;
3736                                 so->so_rcv.sb_mb = m = m->m_next;
3737                                 *mp = NULL;
3738                         } else {
3739                                 if (free_list == NULL)
3740                                         free_list = m;
3741                                 else
3742                                         ml->m_next = m;
3743                                 ml = m;
3744                                 so->so_rcv.sb_mb = m = m->m_next;
3745                                 ml->m_next = NULL;
3746                                 ml->m_nextpkt = NULL;
3747                         }
3748                         if (m != NULL) {
3749                                 m->m_nextpkt = nextrecord;
3750                                 if (nextrecord == NULL)
3751                                         so->so_rcv.sb_lastrecord = m;
3752                         } else {
3753                                 so->so_rcv.sb_mb = nextrecord;
3754                                 SB_EMPTY_FIXUP(&so->so_rcv);
3755                         }
3756                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3757                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3758                 } else {
3759                         /*
3760                          * Stop the loop on partial copy
3761                          */
3762                         if (mp != NULL) {
3763                                 int copy_flag;
3764
3765                                 if (flags & MSG_DONTWAIT)
3766                                         copy_flag = M_DONTWAIT;
3767                                 else
3768                                         copy_flag = M_WAIT;
3769                                 *mp = m_copym(m, 0, len, copy_flag);
3770                                 /*
3771                                  * Failed to allocate an mbuf?
3772                                  * Adjust uio_resid back, it was
3773                                  * adjusted down by len bytes which
3774                                  * we didn't copy over.
3775                                  */
3776                                 if (*mp == NULL) {
3777                                         uio_setresid(auio,
3778                                             (uio_resid(auio) + len));
3779                                         error = ENOMEM;
3780                                         break;
3781                                 }
3782                         }
3783                         break;
3784                 }
3785         }
3786 #ifdef MORE_LOCKING_DEBUG
3787         if (so->so_usecount <= 1) {
3788                 panic("%s: after big while so=%llx ref=%d on socket\n",
3789                     __func__,
3790                     (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
3791                 /* NOTREACHED */
3792         }
3793 #endif
3794         /*
3795          * Tell the caller we made a partial copy
3796          */
3797         if (m != NULL) {
3798                 if (so->so_options & SO_DONTTRUNC) {
3799                         m->m_data += len;
3800                         m->m_len -= len;
3801                         so->so_rcv.sb_cc -= len;
3802                         flags |= MSG_RCVMORE;
3803                 } else {
3804                         (void) sbdroprecord(&so->so_rcv);
3805                         nextrecord = so->so_rcv.sb_mb;
3806                         m = NULL;
3807                         flags |= MSG_TRUNC;
3808                 }
3809         }
3810
3811         if (m == NULL) {
3812                 so->so_rcv.sb_mb = nextrecord;
3813                 /*
3814                  * First part is an inline SB_EMPTY_FIXUP().  Second
3815                  * part makes sure sb_lastrecord is up-to-date if
3816                  * there is still data in the socket buffer.
3817                  */
3818                 if (so->so_rcv.sb_mb == NULL) {
3819                         so->so_rcv.sb_mbtail = NULL;
3820                         so->so_rcv.sb_lastrecord = NULL;
3821                 } else if (nextrecord->m_nextpkt == NULL) {
3822                         so->so_rcv.sb_lastrecord = nextrecord;
3823                 }
3824                 SB_MB_CHECK(&so->so_rcv);
3825         }
3826         SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3827         SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3828
3829         /*
3830          * We can continue to the next packet as long as:
3831          * - We haven't exhausted the uio array
3832          * - There was no error
3833          * - A packet was not truncated
3834          * - We can still receive more data
3835          */
3836         if (i < uiocnt && error == 0 &&
3837             (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0
3838             && (so->so_state & SS_CANTRCVMORE) == 0) {
3839                 sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
3840                 sblocked = 0;
3841
3842                 goto restart;
3843         }
3844
3845 release:
3846         /*
3847          * pru_rcvd may cause more data to be received if the socket lock
3848          * is dropped so we set MSG_HAVEMORE now based on what we know.
3849          * That way the caller won't be surprised if it receives less data than requested.
3850          */
3851         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3852                 flags |= MSG_HAVEMORE;
3853
3854         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3855                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3856
3857         if (flagsp != NULL)
3858                 *flagsp |= flags;
3859         if (sblocked)
3860                 sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
3861         else
3862                 socket_unlock(so, 1);
3863 out:
3864         /*
3865          * Amortize the cost
3866          */
3867         if (free_list != NULL)
3868                 m_freem_list(free_list);
3869
3870         KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
3871             0, 0, 0, 0);
3872         return (error);
3873 }
3874
3875 /*
3876  * Returns:     0                       Success
3877  *              EINVAL
3878  *              ENOTCONN
3879  *      <pru_shutdown>:EINVAL
3880  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
3881  *      <pru_shutdown>:ENOBUFS[TCP]
3882  *      <pru_shutdown>:EMSGSIZE[TCP]
3883  *      <pru_shutdown>:EHOSTUNREACH[TCP]
3884  *      <pru_shutdown>:ENETUNREACH[TCP]
3885  *      <pru_shutdown>:ENETDOWN[TCP]
3886  *      <pru_shutdown>:ENOMEM[TCP]
3887  *      <pru_shutdown>:EACCES[TCP]
3888  *      <pru_shutdown>:EMSGSIZE[TCP]
3889  *      <pru_shutdown>:ENOBUFS[TCP]
3890  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
3891  *      <pru_shutdown>:???              [other protocol families]
3892  */
3893 int
3894 soshutdown(struct socket *so, int how)
3895 {
3896         int error;
3897
3898         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
3899
3900         switch (how) {
3901         case SHUT_RD:
3902         case SHUT_WR:
3903         case SHUT_RDWR:
3904                 socket_lock(so, 1);
3905                 if ((so->so_state &
3906                     (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3907                         error = ENOTCONN;
3908                 } else {
3909                         error = soshutdownlock(so, how);
3910                 }
3911                 socket_unlock(so, 1);
3912                 break;
3913         default:
3914                 error = EINVAL;
3915                 break;
3916         }
3917
3918         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
3919
3920         return (error);
3921 }
3922
3923 int
3924 soshutdownlock_final(struct socket *so, int how)
3925 {
3926         struct protosw *pr = so->so_proto;
3927         int error = 0;
3928
3929         sflt_notify(so, sock_evt_shutdown, &how);
3930
3931         if (how != SHUT_WR) {
3932                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3933                         /* read already shut down */
3934                         error = ENOTCONN;
3935                         goto done;
3936                 }
3937                 sorflush(so);
3938                 postevent(so, 0, EV_RCLOSED);
3939         }
3940         if (how != SHUT_RD) {
3941                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3942                         /* write already shut down */
3943                         error = ENOTCONN;
3944                         goto done;
3945                 }
3946                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3947                 postevent(so, 0, EV_WCLOSED);
3948         }
3949 done:
3950         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
3951         return (error);
3952 }
3953
/*
 * soshutdownlock
 *   Shut down a socket, giving an attached content filter (when built
 *   with CONTENT_FILTER) the first say.  The filter receives a pointer
 *   to "how"; if it returns EJUSTRETURN the request is reported as
 *   successful without running the final shutdown here, and any other
 *   non-zero value is returned to the caller as-is.  Otherwise the
 *   request is passed on to soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
        /*
         * A content filter may delay the actual shutdown until it
         * has processed the pending data
         */
        if (so->so_flags & SOF_CONTENT_FILTER) {
                int cfil_error = cfil_sock_shutdown(so, &how);

                if (cfil_error == EJUSTRETURN)
                        return (0);
                if (cfil_error != 0)
                        return (cfil_error);
        }
#endif /* CONTENT_FILTER */

        return (soshutdownlock_final(so, how));
}
3980
3981 void
3982 sowflush(struct socket *so)
3983 {
3984         struct sockbuf *sb = &so->so_snd;
3985 #ifdef notyet
3986         lck_mtx_t *mutex_held;
3987         /*
3988          * XXX: This code is currently commented out, because we may get here
3989          * as part of sofreelastref(), and at that time, pr_getlock() may no
3990          * longer be able to return us the lock; this will be fixed in future.
3991          */
3992         if (so->so_proto->pr_getlock != NULL)
3993                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3994         else
3995                 mutex_held = so->so_proto->pr_domain->dom_mtx;
3996
3997         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3998 #endif /* notyet */
3999
4000         /*
4001          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4002          * to prevent the socket buffer from being unexpectedly altered
4003          * while it is used by another thread in socket send/receive.
4004          *
4005          * sblock() must not fail here, hence the assertion.
4006          */
4007         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4008         VERIFY(sb->sb_flags & SB_LOCK);
4009
4010         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4011         sb->sb_flags            |= SB_DROP;
4012         sb->sb_upcall           = NULL;
4013         sb->sb_upcallarg        = NULL;
4014
4015         sbunlock(sb, TRUE);     /* keep socket locked */
4016
4017         selthreadclear(&sb->sb_sel);
4018         sbrelease(sb);
4019 }
4020
4021 void
4022 sorflush(struct socket *so)
4023 {
4024         struct sockbuf *sb = &so->so_rcv;
4025         struct protosw *pr = so->so_proto;
4026         struct sockbuf asb;
4027 #ifdef notyet
4028         lck_mtx_t *mutex_held;
4029         /*
4030          * XXX: This code is currently commented out, because we may get here
4031          * as part of sofreelastref(), and at that time, pr_getlock() may no
4032          * longer be able to return us the lock; this will be fixed in future.
4033          */
4034         if (so->so_proto->pr_getlock != NULL)
4035                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4036         else
4037                 mutex_held = so->so_proto->pr_domain->dom_mtx;
4038
4039         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4040 #endif /* notyet */
4041
4042         sflt_notify(so, sock_evt_flush_read, NULL);
4043
4044         socantrcvmore(so);
4045
4046         /*
4047          * Obtain lock on the socket buffer (SB_LOCK).  This is required
4048          * to prevent the socket buffer from being unexpectedly altered
4049          * while it is used by another thread in socket send/receive.
4050          *
4051          * sblock() must not fail here, hence the assertion.
4052          */
4053         (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4054         VERIFY(sb->sb_flags & SB_LOCK);
4055
4056         /*
4057          * Copy only the relevant fields from "sb" to "asb" which we
4058          * need for sbrelease() to function.  In particular, skip
4059          * sb_sel as it contains the wait queue linkage, which would
4060          * wreak havoc if we were to issue selthreadclear() on "asb".
4061          * Make sure to not carry over SB_LOCK in "asb", as we need
4062          * to acquire it later as part of sbrelease().
4063          */
4064         bzero(&asb, sizeof (asb));
4065         asb.sb_cc               = sb->sb_cc;
4066         asb.sb_hiwat            = sb->sb_hiwat;
4067         asb.sb_mbcnt            = sb->sb_mbcnt;
4068         asb.sb_mbmax            = sb->sb_mbmax;
4069         asb.sb_ctl              = sb->sb_ctl;
4070         asb.sb_lowat            = sb->sb_lowat;
4071         asb.sb_mb               = sb->sb_mb;
4072         asb.sb_mbtail           = sb->sb_mbtail;
4073         asb.sb_lastrecord       = sb->sb_lastrecord;
4074         asb.sb_so               = sb->sb_so;
4075         asb.sb_flags            = sb->sb_flags;
4076         asb.sb_flags            &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4077         asb.sb_flags            |= SB_DROP;
4078
4079         /*
4080          * Ideally we'd bzero() these and preserve the ones we need;
4081          * but to do that we'd need to shuffle things around in the
4082          * sockbuf, and we can't do it now because there are KEXTS
4083          * that are directly referring to the socket structure.
4084          *
4085          * Setting SB_DROP acts as a barrier to prevent further appends.
4086          * Clearing SB_SEL is done for selthreadclear() below.
4087          */
4088         sb->sb_cc               = 0;
4089         sb->sb_hiwat            = 0;
4090         sb->sb_mbcnt            = 0;
4091         sb->sb_mbmax            = 0;
4092         sb->sb_ctl              = 0;
4093         sb->sb_lowat            = 0;
4094         sb->sb_mb               = NULL;
4095         sb->sb_mbtail           = NULL;
4096         sb->sb_lastrecord       = NULL;
4097         sb->sb_timeo.tv_sec     = 0;
4098         sb->sb_timeo.tv_usec    = 0;
4099         sb->sb_upcall           = NULL;
4100         sb->sb_upcallarg        = NULL;
4101         sb->sb_flags            &= ~(SB_SEL|SB_UPCALL);
4102         sb->sb_flags            |= SB_DROP;
4103
4104         sbunlock(sb, TRUE);     /* keep socket locked */
4105
4106         /*
4107          * Note that selthreadclear() is called on the original "sb" and
4108          * not the local "asb" because of the way wait queue linkage is
4109          * implemented.  Given that selwakeup() may be triggered, SB_SEL
4110          * should no longer be set (cleared above.)
4111          */
4112         selthreadclear(&sb->sb_sel);
4113
4114         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4115                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4116
4117         sbrelease(&asb);
4118 }
4119
4120 /*
4121  * Perhaps this routine, and sooptcopyout(), below, ought to come in
4122  * an additional variant to handle the case where the option value needs
4123  * to be some kind of integer, but not a specific size.
4124  * In addition to their use here, these functions are also called by the
4125  * protocol-level pr_ctloutput() routines.
4126  *
4127  * Returns:     0                       Success
4128  *              EINVAL
4129  *      copyin:EFAULT
4130  */
4131 int
4132 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4133 {
4134         size_t  valsize;
4135
4136         /*
4137          * If the user gives us more than we wanted, we ignore it,
4138          * but if we don't get the minimum length the caller
4139          * wants, we return EINVAL.  On success, sopt->sopt_valsize
4140          * is set to however much we actually retrieved.
4141          */
4142         if ((valsize = sopt->sopt_valsize) < minlen)
4143                 return (EINVAL);
4144         if (valsize > len)
4145                 sopt->sopt_valsize = valsize = len;
4146
4147         if (sopt->sopt_p != kernproc)
4148                 return (copyin(sopt->sopt_val, buf, valsize));
4149
4150         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4151         return (0);
4152 }
4153
4154 /*
4155  * sooptcopyin_timeval
4156  *   Copy in a timeval value into tv_p, and take into account whether the
4157  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
4158  *   code here so that we can verify the 64-bit tv_sec value before we lose
4159  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4160  */
4161 static int
4162 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4163 {
4164         int                     error;
4165
4166         if (proc_is64bit(sopt->sopt_p)) {
4167                 struct user64_timeval   tv64;
4168
4169                 if (sopt->sopt_valsize < sizeof (tv64))
4170                         return (EINVAL);
4171
4172                 sopt->sopt_valsize = sizeof (tv64);
4173                 if (sopt->sopt_p != kernproc) {
4174                         error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4175                         if (error != 0)
4176                                 return (error);
4177                 } else {
4178                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4179                             sizeof (tv64));
4180                 }
4181                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4182                     tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4183                         return (EDOM);
4184
4185                 tv_p->tv_sec = tv64.tv_sec;
4186                 tv_p->tv_usec = tv64.tv_usec;
4187         } else {
4188                 struct user32_timeval   tv32;
4189
4190                 if (sopt->sopt_valsize < sizeof (tv32))
4191                         return (EINVAL);
4192
4193                 sopt->sopt_valsize = sizeof (tv32);
4194                 if (sopt->sopt_p != kernproc) {
4195                         error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4196                         if (error != 0) {
4197                                 return (error);
4198                         }
4199                 } else {
4200                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4201                             sizeof (tv32));
4202                 }
4203 #ifndef __LP64__
4204                 /*
4205                  * K64todo "comparison is always false due to
4206                  * limited range of data type"
4207                  */
4208                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4209                     tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4210                         return (EDOM);
4211 #endif
4212                 tv_p->tv_sec = tv32.tv_sec;
4213                 tv_p->tv_usec = tv32.tv_usec;
4214         }
4215         return (0);
4216 }
4217
4218 /*
4219  * Returns:     0                       Success
4220  *              EINVAL
4221  *              ENOPROTOOPT
4222  *              ENOBUFS
4223  *              EDOM
4224  *      sooptcopyin:EINVAL
4225  *      sooptcopyin:EFAULT
4226  *      sooptcopyin_timeval:EINVAL
4227  *      sooptcopyin_timeval:EFAULT
4228  *      sooptcopyin_timeval:EDOM
4229  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4230  *      <pr_ctloutput>:???
4231  *      sflt_attach_internal:???        [whatever a filter author chooses]
4232  *      <sf_setoption>:???              [whatever a filter author chooses]
4233  *
4234  * Notes:       Other <pr_ctloutput> returns depend on the protocol family;
4235  *              all <sf_setoption> returns depend on what the filter author
4236  *              causes their filter to return.
4237  */
4238 int
4239 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4240 {
4241         int     error, optval;
4242         struct  linger l;
4243         struct  timeval tv;
4244 #if CONFIG_MACF_SOCKET
4245         struct mac extmac;
4246 #endif /* MAC_SOCKET */
4247
4248         if (sopt->sopt_dir != SOPT_SET)
4249                 sopt->sopt_dir = SOPT_SET;
4250
4251         if (dolock)
4252                 socket_lock(so, 1);
4253
4254         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4255             (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4256             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4257                 /* the socket has been shutdown, no more sockopt's */
4258                 error = EINVAL;
4259                 goto out;
4260         }
4261
4262         error = sflt_setsockopt(so, sopt);
4263         if (error != 0) {
4264                 if (error == EJUSTRETURN)
4265                         error = 0;
4266                 goto out;
4267         }
4268
4269         if (sopt->sopt_level != SOL_SOCKET) {
4270                 if (so->so_proto != NULL &&
4271                     so->so_proto->pr_ctloutput != NULL) {
4272                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4273                         goto out;
4274                 }
4275                 error = ENOPROTOOPT;
4276         } else {
4277                 /*
4278                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4279                  * the protocol layer, if needed.  A zero value returned from
4280                  * the handler means use default socket-level processing as
4281                  * done by the rest of this routine.  Otherwise, any other
4282                  * return value indicates that the option is unsupported.
4283                  */
4284                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4285                     pru_socheckopt(so, sopt)) != 0)
4286                         goto out;
4287
4288                 error = 0;
4289                 switch (sopt->sopt_name) {
4290                 case SO_LINGER:
4291                 case SO_LINGER_SEC:
4292                         error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4293                         if (error != 0)
4294                                 goto out;
4295
4296                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4297                             l.l_linger : l.l_linger * hz;
4298                         if (l.l_onoff != 0)
4299                                 so->so_options |= SO_LINGER;
4300                         else
4301                                 so->so_options &= ~SO_LINGER;
4302                         break;
4303
4304                 case SO_DEBUG:
4305                 case SO_KEEPALIVE:
4306                 case SO_DONTROUTE:
4307                 case SO_USELOOPBACK:
4308                 case SO_BROADCAST:
4309                 case SO_REUSEADDR:
4310                 case SO_REUSEPORT:
4311                 case SO_OOBINLINE:
4312                 case SO_TIMESTAMP:
4313                 case SO_TIMESTAMP_MONOTONIC:
4314                 case SO_DONTTRUNC:
4315                 case SO_WANTMORE:
4316                 case SO_WANTOOBFLAG:
4317                 case SO_NOWAKEFROMSLEEP:
4318                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4319                             sizeof (optval));
4320                         if (error != 0)
4321                                 goto out;
4322                         if (optval)
4323                                 so->so_options |= sopt->sopt_name;
4324                         else
4325                                 so->so_options &= ~sopt->sopt_name;
4326                         break;
4327
4328                 case SO_SNDBUF:
4329                 case SO_RCVBUF:
4330                 case SO_SNDLOWAT:
4331                 case SO_RCVLOWAT:
4332                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4333                             sizeof (optval));
4334                         if (error != 0)
4335                                 goto out;
4336
4337                         /*
4338                          * Values < 1 make no sense for any of these
4339                          * options, so disallow them.
4340                          */
4341                         if (optval < 1) {
4342                                 error = EINVAL;
4343                                 goto out;
4344                         }
4345
4346                         switch (sopt->sopt_name) {
4347                         case SO_SNDBUF:
4348                         case SO_RCVBUF: {
4349                                 struct sockbuf *sb =
4350                                     (sopt->sopt_name == SO_SNDBUF) ?
4351                                     &so->so_snd : &so->so_rcv;
4352                                 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4353                                         error = ENOBUFS;
4354                                         goto out;
4355                                 }
4356                                 sb->sb_flags |= SB_USRSIZE;
4357                                 sb->sb_flags &= ~SB_AUTOSIZE;
4358                                 sb->sb_idealsize = (u_int32_t)optval;
4359                                 break;
4360                         }
4361                         /*
4362                          * Make sure the low-water is never greater than
4363                          * the high-water.
4364                          */
4365                         case SO_SNDLOWAT: {
4366                                 int space = sbspace(&so->so_snd);
4367                                 u_int32_t hiwat = so->so_snd.sb_hiwat;
4368
4369                                 if (so->so_snd.sb_flags & SB_UNIX) {
4370                                         struct unpcb *unp =
4371                                             (struct unpcb *)(so->so_pcb);
4372                                         if (unp != NULL && unp->unp_conn != NULL) {
4373                                                 hiwat += unp->unp_conn->unp_cc;
4374                                         }
4375                                 }
4376
4377                                 so->so_snd.sb_lowat =
4378                                     (optval > hiwat) ?
4379                                     hiwat : optval;
4380
4381                                 if (space >= so->so_snd.sb_lowat) {
4382                                         sowwakeup(so);
4383                                 }
4384                                 break;
4385                         }
4386                         case SO_RCVLOWAT: {
4387                                 int64_t data_len;
4388                                 so->so_rcv.sb_lowat =
4389                                     (optval > so->so_rcv.sb_hiwat) ?
4390                                     so->so_rcv.sb_hiwat : optval;
4391                                 data_len = so->so_rcv.sb_cc
4392                                     - so->so_rcv.sb_ctl;
4393                                 if (data_len >= so->so_rcv.sb_lowat)
4394                                     sorwakeup(so);
4395                                 break;
4396                         }
4397                         }
4398                         break;
4399
4400                 case SO_SNDTIMEO:
4401                 case SO_RCVTIMEO:
4402                         error = sooptcopyin_timeval(sopt, &tv);
4403                         if (error != 0)
4404                                 goto out;
4405
4406                         switch (sopt->sopt_name) {
4407                         case SO_SNDTIMEO:
4408                                 so->so_snd.sb_timeo = tv;
4409                                 break;
4410                         case SO_RCVTIMEO:
4411                                 so->so_rcv.sb_timeo = tv;
4412                                 break;
4413                         }
4414                         break;
4415
4416                 case SO_NKE: {
4417                         struct so_nke nke;
4418
4419                         error = sooptcopyin(sopt, &nke, sizeof (nke),
4420                             sizeof (nke));
4421                         if (error != 0)
4422                                 goto out;
4423
4424                         error = sflt_attach_internal(so, nke.nke_handle);
4425                         break;
4426                 }
4427
4428                 case SO_NOSIGPIPE:
4429                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4430                             sizeof (optval));
4431                         if (error != 0)
4432                                 goto out;
4433                         if (optval != 0)
4434                                 so->so_flags |= SOF_NOSIGPIPE;
4435                         else
4436                                 so->so_flags &= ~SOF_NOSIGPIPE;
4437                         break;
4438
4439                 case SO_NOADDRERR:
4440                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4441                             sizeof (optval));
4442                         if (error != 0)
4443                                 goto out;
4444                         if (optval != 0)
4445                                 so->so_flags |= SOF_NOADDRAVAIL;
4446                         else
4447                                 so->so_flags &= ~SOF_NOADDRAVAIL;
4448                         break;
4449
4450                 case SO_REUSESHAREUID:
4451                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4452                             sizeof (optval));
4453                         if (error != 0)
4454                                 goto out;
4455                         if (optval != 0)
4456                                 so->so_flags |= SOF_REUSESHAREUID;
4457                         else
4458                                 so->so_flags &= ~SOF_REUSESHAREUID;
4459                         break;
4460
4461                 case SO_NOTIFYCONFLICT:
4462                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4463                                 error = EPERM;
4464                                 goto out;
4465                         }
4466                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4467                             sizeof (optval));
4468                         if (error != 0)
4469                                 goto out;
4470                         if (optval != 0)
4471                                 so->so_flags |= SOF_NOTIFYCONFLICT;
4472                         else
4473                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4474                         break;
4475
4476                 case SO_RESTRICTIONS:
4477                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4478                             sizeof (optval));
4479                         if (error != 0)
4480                                 goto out;
4481
4482                         error = so_set_restrictions(so, optval);
4483                         break;
4484
4485                 case SO_AWDL_UNRESTRICTED:
4486                         if (SOCK_DOM(so) != PF_INET &&
4487                             SOCK_DOM(so) != PF_INET6) {
4488                                 error = EOPNOTSUPP;
4489                                 goto out;
4490                         }
4491                         error = sooptcopyin(sopt, &optval, sizeof(optval),
4492                             sizeof(optval));
4493                         if (error != 0)
4494                                 goto out;
4495                         if (optval != 0) {
4496                                 kauth_cred_t cred =  NULL;
4497                                 proc_t ep = PROC_NULL;
4498
4499                                 if (so->so_flags & SOF_DELEGATED) {
4500                                         ep = proc_find(so->e_pid);
4501                                         if (ep)
4502                                                 cred = kauth_cred_proc_ref(ep);
4503                                 }
4504                                 error = priv_check_cred(
4505                                     cred ? cred : so->so_cred,
4506                                     PRIV_NET_RESTRICTED_AWDL, 0);
4507                                 if (error == 0)
4508                                         inp_set_awdl_unrestricted(
4509                                             sotoinpcb(so));
4510                                 if (cred)
4511                                         kauth_cred_unref(&cred);
4512                                 if (ep != PROC_NULL)
4513                                         proc_rele(ep);
4514                         } else
4515                                 inp_clear_awdl_unrestricted(sotoinpcb(so));
4516                         break;
4517
4518                 case SO_LABEL:
4519 #if CONFIG_MACF_SOCKET
4520                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4521                             sizeof (extmac))) != 0)
4522                                 goto out;
4523
4524                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4525                             so, &extmac);
4526 #else
4527                         error = EOPNOTSUPP;
4528 #endif /* MAC_SOCKET */
4529                         break;
4530
4531                 case SO_UPCALLCLOSEWAIT:
4532                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4533                             sizeof (optval));
4534                         if (error != 0)
4535                                 goto out;
4536                         if (optval != 0)
4537                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4538                         else
4539                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4540                         break;
4541
4542                 case SO_RANDOMPORT:
4543                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4544                             sizeof (optval));
4545                         if (error != 0)
4546                                 goto out;
4547                         if (optval != 0)
4548                                 so->so_flags |= SOF_BINDRANDOMPORT;
4549                         else
4550                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
4551                         break;
4552
4553                 case SO_NP_EXTENSIONS: {
4554                         struct so_np_extensions sonpx;
4555
4556                         error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4557                             sizeof (sonpx));
4558                         if (error != 0)
4559                                 goto out;
4560                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4561                                 error = EINVAL;
4562                                 goto out;
4563                         }
4564                         /*
4565                          * Only one bit defined for now
4566                          */
4567                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4568                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4569                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
4570                                 else
4571                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4572                         }
4573                         break;
4574                 }
4575
4576                 case SO_TRAFFIC_CLASS: {
4577                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4578                             sizeof (optval));
4579                         if (error != 0)
4580                                 goto out;
4581                         error = so_set_traffic_class(so, optval);
4582                         if (error != 0)
4583                                 goto out;
4584                         break;
4585                 }
4586
4587                 case SO_RECV_TRAFFIC_CLASS: {
4588                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4589                             sizeof (optval));
4590                         if (error != 0)
4591                                 goto out;
4592                         if (optval == 0)
4593                                 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4594                         else
4595                                 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4596                         break;
4597                 }
4598
4599                 case SO_TRAFFIC_CLASS_DBG: {
4600                         struct so_tcdbg so_tcdbg;
4601
4602                         error = sooptcopyin(sopt, &so_tcdbg,
4603                             sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4604                         if (error != 0)
4605                                 goto out;
4606                         error = so_set_tcdbg(so, &so_tcdbg);
4607                         if (error != 0)
4608                                 goto out;
4609                         break;
4610                 }
4611
4612                 case SO_PRIVILEGED_TRAFFIC_CLASS:
4613                         error = priv_check_cred(kauth_cred_get(),
4614                             PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4615                         if (error != 0)
4616                                 goto out;
4617                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4618                             sizeof (optval));
4619                         if (error != 0)
4620                                 goto out;
4621                         if (optval == 0)
4622                                 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4623                         else
4624                                 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4625                         break;
4626
4627                 case SO_DEFUNCTOK:
4628                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4629                             sizeof (optval));
4630                         if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4631                                 if (error == 0)
4632                                         error = EBADF;
4633                                 goto out;
4634                         }
4635                         /*
4636                          * Any process can set SO_DEFUNCTOK (clear
4637                          * SOF_NODEFUNCT), but only root can clear
4638                          * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4639                          */
4640                         if (optval == 0 &&
4641                             kauth_cred_issuser(kauth_cred_get()) == 0) {
4642                                 error = EPERM;
4643                                 goto out;
4644                         }
4645                         if (optval)
4646                                 so->so_flags &= ~SOF_NODEFUNCT;
4647                         else
4648                                 so->so_flags |= SOF_NODEFUNCT;
4649
4650                         if (SOCK_DOM(so) == PF_INET ||
4651                             SOCK_DOM(so) == PF_INET6) {
4652                                 char s[MAX_IPv6_STR_LEN];
4653                                 char d[MAX_IPv6_STR_LEN];
4654                                 struct inpcb *inp = sotoinpcb(so);
4655
4656                                 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4657                                     "%s:%d] is now marked as %seligible for "
4658                                     "defunct\n", __func__, proc_selfpid(),
4659                                     (uint64_t)VM_KERNEL_ADDRPERM(so),
4660                                     (SOCK_TYPE(so) == SOCK_STREAM) ?
4661                                     "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4662                                     ((SOCK_DOM(so) == PF_INET) ?
4663                                     (void *)&inp->inp_laddr.s_addr :
4664                                     (void *)&inp->in6p_laddr), s, sizeof (s)),
4665                                     ntohs(inp->in6p_lport),
4666                                     inet_ntop(SOCK_DOM(so),
4667                                     (SOCK_DOM(so) == PF_INET) ?
4668                                     (void *)&inp->inp_faddr.s_addr :
4669                                     (void *)&inp->in6p_faddr, d, sizeof (d)),
4670                                     ntohs(inp->in6p_fport),
4671                                     (so->so_flags & SOF_NODEFUNCT) ?
4672                                     "not " : ""));
4673                         } else {
4674                                 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4675                                     "now marked as %seligible for defunct\n",
4676                                     __func__, proc_selfpid(),
4677                                     (uint64_t)VM_KERNEL_ADDRPERM(so),
4678                                     SOCK_DOM(so), SOCK_TYPE(so),
4679                                     (so->so_flags & SOF_NODEFUNCT) ?
4680                                     "not " : ""));
4681                         }
4682                         break;
4683
4684                 case SO_ISDEFUNCT:
4685                         /* This option is not settable */
4686                         error = EINVAL;
4687                         break;
4688
4689                 case SO_OPPORTUNISTIC:
4690                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4691                             sizeof (optval));
4692                         if (error == 0)
4693                                 error = so_set_opportunistic(so, optval);
4694                         break;
4695
4696                 case SO_FLUSH:
4697                         /* This option is handled by lower layer(s) */
4698                         error = 0;
4699                         break;
4700
4701                 case SO_RECV_ANYIF:
4702                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4703                             sizeof (optval));
4704                         if (error == 0)
4705                                 error = so_set_recv_anyif(so, optval);
4706                         break;
4707
4708                 case SO_TRAFFIC_MGT_BACKGROUND: {
4709                         /* This option is handled by lower layer(s) */
4710                         error = 0;
4711                         break;
4712                 }
4713
4714 #if FLOW_DIVERT
4715                 case SO_FLOW_DIVERT_TOKEN:
4716                         error = flow_divert_token_set(so, sopt);
4717                         break;
4718 #endif  /* FLOW_DIVERT */
4719
4720
4721                 case SO_DELEGATED:
4722                         if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
4723                             sizeof (optval))) != 0)
4724                                 break;
4725
4726                         error = so_set_effective_pid(so, optval, sopt->sopt_p);
4727                         break;
4728
4729                 case SO_DELEGATED_UUID: {
4730                         uuid_t euuid;
4731
4732                         if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
4733                             sizeof (euuid))) != 0)
4734                                 break;
4735
4736                         error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
4737                         break;
4738                 }
4739
4740 #if NECP
4741                 case SO_NECP_ATTRIBUTES:
4742                         error = necp_set_socket_attributes(so, sopt);
4743                         break;
4744 #endif /* NECP */
4745
4746 #if MPTCP
4747                 case SO_MPTCP_FASTJOIN:
4748                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
4749                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
4750                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
4751                                 error = ENOPROTOOPT;
4752                                 break;
4753                         }
4754
4755                         error = sooptcopyin(sopt, &optval, sizeof (optval),
4756                             sizeof (optval));
4757                         if (error != 0)
4758                                 goto out;
4759                         if (optval == 0)
4760                                 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
4761                         else
4762                                 so->so_flags |= SOF_MPTCP_FASTJOIN;
4763                         break;
4764 #endif /* MPTCP */
4765
4766                 default:
4767                         error = ENOPROTOOPT;
4768                         break;
4769                 }
4770                 if (error == 0 && so->so_proto != NULL &&
4771                     so->so_proto->pr_ctloutput != NULL) {
4772                         (void) so->so_proto->pr_ctloutput(so, sopt);
4773                 }
4774         }
4775 out:
4776         if (dolock)
4777                 socket_unlock(so, 1);
4778         return (error);
4779 }
4780
4781 /* Helper routines for getsockopt */
4782 int
4783 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
4784 {
4785         int     error;
4786         size_t  valsize;
4787
4788         error = 0;
4789
4790         /*
4791          * Documented get behavior is that we always return a value,
4792          * possibly truncated to fit in the user's buffer.
4793          * Traditional behavior is that we always tell the user
4794          * precisely how much we copied, rather than something useful
4795          * like the total amount we had available for her.
4796          * Note that this interface is not idempotent; the entire answer must
4797          * generated ahead of time.
4798          */
4799         valsize = min(len, sopt->sopt_valsize);
4800         sopt->sopt_valsize = valsize;
4801         if (sopt->sopt_val != USER_ADDR_NULL) {
4802                 if (sopt->sopt_p != kernproc)
4803                         error = copyout(buf, sopt->sopt_val, valsize);
4804                 else
4805                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4806         }
4807         return (error);
4808 }
4809
4810 static int
4811 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
4812 {
4813         int                     error;
4814         size_t                  len;
4815         struct user64_timeval   tv64;
4816         struct user32_timeval   tv32;
4817         const void *            val;
4818         size_t                  valsize;
4819
4820         error = 0;
4821         if (proc_is64bit(sopt->sopt_p)) {
4822                 len = sizeof (tv64);
4823                 tv64.tv_sec = tv_p->tv_sec;
4824                 tv64.tv_usec = tv_p->tv_usec;
4825                 val = &tv64;
4826         } else {
4827                 len = sizeof (tv32);
4828                 tv32.tv_sec = tv_p->tv_sec;
4829                 tv32.tv_usec = tv_p->tv_usec;
4830                 val = &tv32;
4831         }
4832         valsize = min(len, sopt->sopt_valsize);
4833         sopt->sopt_valsize = valsize;
4834         if (sopt->sopt_val != USER_ADDR_NULL) {
4835                 if (sopt->sopt_p != kernproc)
4836                         error = copyout(val, sopt->sopt_val, valsize);
4837                 else
4838                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4839         }
4840         return (error);
4841 }
4842
4843 /*
4844  * Return:      0                       Success
4845  *              ENOPROTOOPT
4846  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4847  *      <pr_ctloutput>:???
4848  *      <sf_getoption>:???
4849  */
4850 int
4851 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4852 {
4853         int     error, optval;
4854         struct  linger l;
4855         struct  timeval tv;
4856 #if CONFIG_MACF_SOCKET
4857         struct mac extmac;
4858 #endif /* MAC_SOCKET */
4859
4860         if (sopt->sopt_dir != SOPT_GET)
4861                 sopt->sopt_dir = SOPT_GET;
4862
4863         if (dolock)
4864                 socket_lock(so, 1);
4865
4866         error = sflt_getsockopt(so, sopt);
4867         if (error != 0) {
4868                 if (error == EJUSTRETURN)
4869                         error = 0;
4870                 goto out;
4871         }
4872
4873         if (sopt->sopt_level != SOL_SOCKET) {
4874                 if (so->so_proto != NULL &&
4875                     so->so_proto->pr_ctloutput != NULL) {
4876                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
4877                         goto out;
4878                 }
4879                 error = ENOPROTOOPT;
4880         } else {
4881                 /*
4882                  * Allow socket-level (SOL_SOCKET) options to be filtered by
4883                  * the protocol layer, if needed.  A zero value returned from
4884                  * the handler means use default socket-level processing as
4885                  * done by the rest of this routine.  Otherwise, any other
4886                  * return value indicates that the option is unsupported.
4887                  */
4888                 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4889                     pru_socheckopt(so, sopt)) != 0)
4890                         goto out;
4891
4892                 error = 0;
4893                 switch (sopt->sopt_name) {
4894                 case SO_LINGER:
4895                 case SO_LINGER_SEC:
4896                         l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
4897                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
4898                             so->so_linger : so->so_linger / hz;
4899                         error = sooptcopyout(sopt, &l, sizeof (l));
4900                         break;
4901
4902                 case SO_USELOOPBACK:
4903                 case SO_DONTROUTE:
4904                 case SO_DEBUG:
4905                 case SO_KEEPALIVE:
4906                 case SO_REUSEADDR:
4907                 case SO_REUSEPORT:
4908                 case SO_BROADCAST:
4909                 case SO_OOBINLINE:
4910                 case SO_TIMESTAMP:
4911                 case SO_TIMESTAMP_MONOTONIC:
4912                 case SO_DONTTRUNC:
4913                 case SO_WANTMORE:
4914                 case SO_WANTOOBFLAG:
4915                 case SO_NOWAKEFROMSLEEP:
4916                         optval = so->so_options & sopt->sopt_name;
4917 integer:
4918                         error = sooptcopyout(sopt, &optval, sizeof (optval));
4919                         break;
4920
4921                 case SO_TYPE:
4922                         optval = so->so_type;
4923                         goto integer;
4924
4925                 case SO_NREAD:
4926                         if (so->so_proto->pr_flags & PR_ATOMIC) {
4927                                 int pkt_total;
4928                                 struct mbuf *m1;
4929
4930                                 pkt_total = 0;
4931                                 m1 = so->so_rcv.sb_mb;
4932                                 while (m1 != NULL) {
4933                                         if (m1->m_type == MT_DATA ||
4934                                             m1->m_type == MT_HEADER ||
4935                                             m1->m_type == MT_OOBDATA)
4936                                                 pkt_total += m1->m_len;
4937                                         m1 = m1->m_next;
4938                                 }
4939                                 optval = pkt_total;
4940                         } else {
4941                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4942                         }
4943                         goto integer;
4944
4945                 case SO_NUMRCVPKT:
4946                         if (so->so_proto->pr_flags & PR_ATOMIC) {
4947                                 int cnt = 0;
4948                                 struct mbuf *m1;
4949
4950                                 m1 = so->so_rcv.sb_mb;
4951                                 while (m1 != NULL) {
4952                                         if (m1->m_type == MT_DATA ||
4953                                             m1->m_type == MT_HEADER ||
4954                                             m1->m_type == MT_OOBDATA)
4955                                                 cnt += 1;
4956                                         m1 = m1->m_nextpkt;
4957                                 }
4958                                 optval = cnt;
4959                                 goto integer;
4960                         } else {
4961                                 error = EINVAL;
4962                                 break;
4963                         }
4964
4965                 case SO_NWRITE:
4966                         optval = so->so_snd.sb_cc;
4967                         goto integer;
4968
4969                 case SO_ERROR:
4970                         optval = so->so_error;
4971                         so->so_error = 0;
4972                         goto integer;
4973
4974                 case SO_SNDBUF: {
4975                         u_int32_t hiwat = so->so_snd.sb_hiwat;
4976
4977                         if (so->so_snd.sb_flags & SB_UNIX) {
4978                                 struct unpcb *unp =
4979                                     (struct unpcb *)(so->so_pcb);
4980                                 if (unp != NULL && unp->unp_conn != NULL) {
4981                                         hiwat += unp->unp_conn->unp_cc;
4982                                 }
4983                         }
4984
4985                         optval = hiwat;
4986                         goto integer;
4987                 }
4988                 case SO_RCVBUF:
4989                         optval = so->so_rcv.sb_hiwat;
4990                         goto integer;
4991
4992                 case SO_SNDLOWAT:
4993                         optval = so->so_snd.sb_lowat;
4994                         goto integer;
4995
4996                 case SO_RCVLOWAT:
4997                         optval = so->so_rcv.sb_lowat;
4998                         goto integer;
4999
5000                 case SO_SNDTIMEO:
5001                 case SO_RCVTIMEO:
5002                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
5003                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5004
5005                         error = sooptcopyout_timeval(sopt, &tv);
5006                         break;
5007
5008                 case SO_NOSIGPIPE:
5009                         optval = (so->so_flags & SOF_NOSIGPIPE);
5010                         goto integer;
5011
5012                 case SO_NOADDRERR:
5013                         optval = (so->so_flags & SOF_NOADDRAVAIL);
5014                         goto integer;
5015
5016                 case SO_REUSESHAREUID:
5017                         optval = (so->so_flags & SOF_REUSESHAREUID);
5018                         goto integer;
5019
5020
5021                 case SO_NOTIFYCONFLICT:
5022                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5023                         goto integer;
5024
5025                 case SO_RESTRICTIONS:
5026                         optval = so_get_restrictions(so);
5027                         goto integer;
5028
5029                 case SO_AWDL_UNRESTRICTED:
5030                         if (SOCK_DOM(so) == PF_INET ||
5031                             SOCK_DOM(so) == PF_INET6) {
5032                                 optval = inp_get_awdl_unrestricted(
5033                                     sotoinpcb(so));
5034                                 goto integer;
5035                         } else
5036                                 error = EOPNOTSUPP;
5037                         break;
5038
5039                 case SO_LABEL:
5040 #if CONFIG_MACF_SOCKET
5041                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5042                             sizeof (extmac))) != 0 ||
5043                             (error = mac_socket_label_get(proc_ucred(
5044                             sopt->sopt_p), so, &extmac)) != 0)
5045                                 break;
5046
5047                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5048 #else
5049                         error = EOPNOTSUPP;
5050 #endif /* MAC_SOCKET */
5051                         break;
5052
5053                 case SO_PEERLABEL:
5054 #if CONFIG_MACF_SOCKET
5055                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5056                             sizeof (extmac))) != 0 ||
5057                             (error = mac_socketpeer_label_get(proc_ucred(
5058                             sopt->sopt_p), so, &extmac)) != 0)
5059                                 break;
5060
5061                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5062 #else
5063                         error = EOPNOTSUPP;
5064 #endif /* MAC_SOCKET */
5065                         break;
5066
5067 #ifdef __APPLE_API_PRIVATE
5068                 case SO_UPCALLCLOSEWAIT:
5069                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5070                         goto integer;
5071 #endif
5072                 case SO_RANDOMPORT:
5073                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
5074                         goto integer;
5075
5076                 case SO_NP_EXTENSIONS: {
5077                         struct so_np_extensions sonpx;
5078
5079                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5080                             SONPX_SETOPTSHUT : 0;
5081                         sonpx.npx_mask = SONPX_MASK_VALID;
5082
5083                         error = sooptcopyout(sopt, &sonpx,
5084                             sizeof (struct so_np_extensions));
5085                         break;
5086                 }
5087
5088                 case SO_TRAFFIC_CLASS:
5089                         optval = so->so_traffic_class;
5090                         goto integer;
5091
5092                 case SO_RECV_TRAFFIC_CLASS:
5093                         optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5094                         goto integer;
5095
5096                 case SO_TRAFFIC_CLASS_STATS:
5097                         error = sooptcopyout(sopt, &so->so_tc_stats,
5098                             sizeof (so->so_tc_stats));
5099                         break;
5100
5101                 case SO_TRAFFIC_CLASS_DBG:
5102                         error = sogetopt_tcdbg(so, sopt);
5103                         break;
5104
5105                 case SO_PRIVILEGED_TRAFFIC_CLASS:
5106                         optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5107                         goto integer;
5108
5109                 case SO_DEFUNCTOK:
5110                         optval = !(so->so_flags & SOF_NODEFUNCT);
5111                         goto integer;
5112
5113                 case SO_ISDEFUNCT:
5114                         optval = (so->so_flags & SOF_DEFUNCT);
5115                         goto integer;
5116
5117                 case SO_OPPORTUNISTIC:
5118                         optval = so_get_opportunistic(so);
5119                         goto integer;
5120
5121                 case SO_FLUSH:
5122                         /* This option is not gettable */
5123                         error = EINVAL;
5124                         break;
5125
5126                 case SO_RECV_ANYIF:
5127                         optval = so_get_recv_anyif(so);
5128                         goto integer;
5129
5130                 case SO_TRAFFIC_MGT_BACKGROUND:
5131                         /* This option is handled by lower layer(s) */
5132                         if (so->so_proto != NULL &&
5133                             so->so_proto->pr_ctloutput != NULL) {
5134                                 (void) so->so_proto->pr_ctloutput(so, sopt);
5135                         }
5136                         break;
5137
5138 #if FLOW_DIVERT
5139                 case SO_FLOW_DIVERT_TOKEN:
5140                         error = flow_divert_token_get(so, sopt);
5141                         break;
5142 #endif  /* FLOW_DIVERT */
5143
5144 #if NECP
5145                 case SO_NECP_ATTRIBUTES:
5146                         error = necp_get_socket_attributes(so, sopt);
5147                         break;
5148 #endif /* NECP */
5149
5150 #if CONTENT_FILTER
5151                 case SO_CFIL_SOCK_ID: {
5152                         cfil_sock_id_t sock_id;
5153
5154                         sock_id = cfil_sock_id_from_socket(so);
5155
5156                         error = sooptcopyout(sopt, &sock_id,
5157                                 sizeof(cfil_sock_id_t));
5158                         break;
5159                 }
5160 #endif  /* CONTENT_FILTER */
5161
5162 #if MPTCP
5163                 case SO_MPTCP_FASTJOIN:
5164                         if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5165                             ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5166                             (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5167                                 error = ENOPROTOOPT;
5168                                 break;
5169                         }
5170                         optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5171                         break;
5172 #endif /* MPTCP */
5173
5174                 default:
5175                         error = ENOPROTOOPT;
5176                         break;
5177                 }
5178         }
5179 out:
5180         if (dolock)
5181                 socket_unlock(so, 1);
5182         return (error);
5183 }
5184
5185 /*
5186  * The size limits on our soopt_getm is different from that on FreeBSD.
5187  * We limit the size of options to MCLBYTES. This will have to change
5188  * if we need to define options that need more space than MCLBYTES.
5189  */
5190 int
5191 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5192 {
5193         struct mbuf *m, *m_prev;
5194         int sopt_size = sopt->sopt_valsize;
5195         int how;
5196
5197         if (sopt_size <= 0 || sopt_size > MCLBYTES)
5198                 return (EMSGSIZE);
5199
5200         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5201         MGET(m, how, MT_DATA);
5202         if (m == NULL)
5203                 return (ENOBUFS);
5204         if (sopt_size > MLEN) {
5205                 MCLGET(m, how);
5206                 if ((m->m_flags & M_EXT) == 0) {
5207                         m_free(m);
5208                         return (ENOBUFS);
5209                 }
5210                 m->m_len = min(MCLBYTES, sopt_size);
5211         } else {
5212                 m->m_len = min(MLEN, sopt_size);
5213         }
5214         sopt_size -= m->m_len;
5215         *mp = m;
5216         m_prev = m;
5217
5218         while (sopt_size > 0) {
5219                 MGET(m, how, MT_DATA);
5220                 if (m == NULL) {
5221                         m_freem(*mp);
5222                         return (ENOBUFS);
5223                 }
5224                 if (sopt_size > MLEN) {
5225                         MCLGET(m, how);
5226                         if ((m->m_flags & M_EXT) == 0) {
5227                                 m_freem(*mp);
5228                                 m_freem(m);
5229                                 return (ENOBUFS);
5230                         }
5231                         m->m_len = min(MCLBYTES, sopt_size);
5232                 } else {
5233                         m->m_len = min(MLEN, sopt_size);
5234                 }
5235                 sopt_size -= m->m_len;
5236                 m_prev->m_next = m;
5237                 m_prev = m;
5238         }
5239         return (0);
5240 }
5241
5242 /* copyin sopt data into mbuf chain */
5243 int
5244 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5245 {
5246         struct mbuf *m0 = m;
5247
5248         if (sopt->sopt_val == USER_ADDR_NULL)
5249                 return (0);
5250         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5251                 if (sopt->sopt_p != kernproc) {
5252                         int error;
5253
5254                         error = copyin(sopt->sopt_val, mtod(m, char *),
5255                             m->m_len);
5256                         if (error != 0) {
5257                                 m_freem(m0);
5258                                 return (error);
5259                         }
5260                 } else {
5261                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5262                             mtod(m, char *), m->m_len);
5263                 }
5264                 sopt->sopt_valsize -= m->m_len;
5265                 sopt->sopt_val += m->m_len;
5266                 m = m->m_next;
5267         }
5268         /* should be allocated enoughly at ip6_sooptmcopyin() */
5269         if (m != NULL) {
5270                 panic("soopt_mcopyin");
5271                 /* NOTREACHED */
5272         }
5273         return (0);
5274 }
5275
5276 /* copyout mbuf chain data into soopt */
5277 int
5278 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5279 {
5280         struct mbuf *m0 = m;
5281         size_t valsize = 0;
5282
5283         if (sopt->sopt_val == USER_ADDR_NULL)
5284                 return (0);
5285         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5286                 if (sopt->sopt_p != kernproc) {
5287                         int error;
5288
5289                         error = copyout(mtod(m, char *), sopt->sopt_val,
5290                             m->m_len);
5291                         if (error != 0) {
5292                                 m_freem(m0);
5293                                 return (error);
5294                         }
5295                 } else {
5296                         bcopy(mtod(m, char *),
5297                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5298                 }
5299                 sopt->sopt_valsize -= m->m_len;
5300                 sopt->sopt_val += m->m_len;
5301                 valsize += m->m_len;
5302                 m = m->m_next;
5303         }
5304         if (m != NULL) {
5305                 /* enough soopt buffer should be given from user-land */
5306                 m_freem(m0);
5307                 return (EINVAL);
5308         }
5309         sopt->sopt_valsize = valsize;
5310         return (0);
5311 }
5312
5313 void
5314 sohasoutofband(struct socket *so)
5315 {
5316         if (so->so_pgid < 0)
5317                 gsignal(-so->so_pgid, SIGURG);
5318         else if (so->so_pgid > 0)
5319                 proc_signal(so->so_pgid, SIGURG);
5320         selwakeup(&so->so_rcv.sb_sel);
5321 }
5322
5323 int
5324 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5325 {
5326 #pragma unused(cred)
5327         struct proc *p = current_proc();
5328         int revents = 0;
5329
5330         socket_lock(so, 1);
5331         so_update_last_owner_locked(so, PROC_NULL);
5332         so_update_policy(so);
5333
5334         if (events & (POLLIN | POLLRDNORM))
5335                 if (soreadable(so))
5336                         revents |= events & (POLLIN | POLLRDNORM);
5337
5338         if (events & (POLLOUT | POLLWRNORM))
5339                 if (sowriteable(so))
5340                         revents |= events & (POLLOUT | POLLWRNORM);
5341
5342         if (events & (POLLPRI | POLLRDBAND))
5343                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5344                         revents |= events & (POLLPRI | POLLRDBAND);
5345
5346         if (revents == 0) {
5347                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5348                         /*
5349                          * Darwin sets the flag first,
5350                          * BSD calls selrecord first
5351                          */
5352                         so->so_rcv.sb_flags |= SB_SEL;
5353                         selrecord(p, &so->so_rcv.sb_sel, wql);
5354                 }
5355
5356                 if (events & (POLLOUT | POLLWRNORM)) {
5357                         /*
5358                          * Darwin sets the flag first,
5359                          * BSD calls selrecord first
5360                          */
5361                         so->so_snd.sb_flags |= SB_SEL;
5362                         selrecord(p, &so->so_snd.sb_sel, wql);
5363                 }
5364         }
5365
5366         socket_unlock(so, 1);
5367         return (revents);
5368 }
5369
5370 int
5371 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5372 {
5373 #pragma unused(fp)
5374 #if !CONFIG_MACF_SOCKET
5375 #pragma unused(ctx)
5376 #endif /* MAC_SOCKET */
5377         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5378         struct klist *skl;
5379
5380         socket_lock(so, 1);
5381         so_update_last_owner_locked(so, PROC_NULL);
5382         so_update_policy(so);
5383
5384 #if CONFIG_MACF_SOCKET
5385         if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5386             kn, so) != 0) {
5387                 socket_unlock(so, 1);
5388                 return (1);
5389         }
5390 #endif /* MAC_SOCKET */
5391
5392         switch (kn->kn_filter) {
5393         case EVFILT_READ:
5394                 kn->kn_fop = &soread_filtops;
5395                 skl = &so->so_rcv.sb_sel.si_note;
5396                 break;
5397         case EVFILT_WRITE:
5398                 kn->kn_fop = &sowrite_filtops;
5399                 skl = &so->so_snd.sb_sel.si_note;
5400                 break;
5401         case EVFILT_SOCK:
5402                 kn->kn_fop = &sock_filtops;
5403                 skl = &so->so_klist;
5404                 break;
5405         default:
5406                 socket_unlock(so, 1);
5407                 return (1);
5408         }
5409
5410         if (KNOTE_ATTACH(skl, kn)) {
5411                 switch (kn->kn_filter) {
5412                 case EVFILT_READ:
5413                         so->so_rcv.sb_flags |= SB_KNOTE;
5414                         break;
5415                 case EVFILT_WRITE:
5416                         so->so_snd.sb_flags |= SB_KNOTE;
5417                         break;
5418                 case EVFILT_SOCK:
5419                         so->so_flags |= SOF_KNOTE;
5420                         break;
5421                 default:
5422                         socket_unlock(so, 1);
5423                         return (1);
5424                 }
5425         }
5426         socket_unlock(so, 1);
5427         return (0);
5428 }
5429
5430 static void
5431 filt_sordetach(struct knote *kn)
5432 {
5433         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5434
5435         socket_lock(so, 1);
5436         if (so->so_rcv.sb_flags & SB_KNOTE)
5437                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5438                         so->so_rcv.sb_flags &= ~SB_KNOTE;
5439         socket_unlock(so, 1);
5440 }
5441
5442 /*ARGSUSED*/
5443 static int
5444 filt_soread(struct knote *kn, long hint)
5445 {
5446         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5447
5448         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5449                 socket_lock(so, 1);
5450
5451         if (so->so_options & SO_ACCEPTCONN) {
5452                 int isempty;
5453
5454                 /*
5455                  * Radar 6615193 handle the listen case dynamically
5456                  * for kqueue read filter. This allows to call listen()
5457                  * after registering the kqueue EVFILT_READ.
5458                  */
5459
5460                 kn->kn_data = so->so_qlen;
5461                 isempty = ! TAILQ_EMPTY(&so->so_comp);
5462
5463                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5464                         socket_unlock(so, 1);
5465
5466                 return (isempty);
5467         }
5468
5469         /* socket isn't a listener */
5470
5471         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5472
5473         if (so->so_oobmark) {
5474                 if (kn->kn_flags & EV_OOBAND) {
5475                         kn->kn_data -= so->so_oobmark;
5476                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5477                                 socket_unlock(so, 1);
5478                         return (1);
5479                 }
5480                 kn->kn_data = so->so_oobmark;
5481                 kn->kn_flags |= EV_OOBAND;
5482         } else {
5483                 if ((so->so_state & SS_CANTRCVMORE)
5484 #if CONTENT_FILTER
5485                 && cfil_sock_data_pending(&so->so_rcv) == 0
5486 #endif /* CONTENT_FILTER */
5487                 ) {
5488                         kn->kn_flags |= EV_EOF;
5489                         kn->kn_fflags = so->so_error;
5490                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5491                                 socket_unlock(so, 1);
5492                         return (1);
5493                 }
5494         }
5495
5496         if (so->so_state & SS_RCVATMARK) {
5497                 if (kn->kn_flags & EV_OOBAND) {
5498                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5499                                 socket_unlock(so, 1);
5500                         return (1);
5501                 }
5502                 kn->kn_flags |= EV_OOBAND;
5503         } else if (kn->kn_flags & EV_OOBAND) {
5504                 kn->kn_data = 0;
5505                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5506                         socket_unlock(so, 1);
5507                 return (0);
5508         }
5509
5510         if (so->so_error) {     /* temporary udp error */
5511                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5512                         socket_unlock(so, 1);
5513                 return (1);
5514         }
5515
5516         int64_t lowwat = so->so_rcv.sb_lowat;
5517         if (kn->kn_sfflags & NOTE_LOWAT) {
5518                 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5519                         lowwat = so->so_rcv.sb_hiwat;
5520                 else if (kn->kn_sdata > lowwat)
5521                         lowwat = kn->kn_sdata;
5522         }
5523
5524         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5525                 socket_unlock(so, 1);
5526
5527         return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
5528 }
5529
5530 static void
5531 filt_sowdetach(struct knote *kn)
5532 {
5533         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5534         socket_lock(so, 1);
5535
5536         if (so->so_snd.sb_flags & SB_KNOTE)
5537                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5538                         so->so_snd.sb_flags &= ~SB_KNOTE;
5539         socket_unlock(so, 1);
5540 }
5541
5542 int
5543 so_wait_for_if_feedback(struct socket *so)
5544 {
5545         if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5546             (so->so_state & SS_ISCONNECTED)) {
5547                 struct inpcb *inp = sotoinpcb(so);
5548                 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5549                         return (1);
5550         }
5551         return (0);
5552 }
5553
5554 /*ARGSUSED*/
5555 static int
5556 filt_sowrite(struct knote *kn, long hint)
5557 {
5558         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5559         int ret = 0;
5560
5561         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5562                 socket_lock(so, 1);
5563
5564         kn->kn_data = sbspace(&so->so_snd);
5565         if (so->so_state & SS_CANTSENDMORE) {
5566                 kn->kn_flags |= EV_EOF;
5567                 kn->kn_fflags = so->so_error;
5568                 ret = 1;
5569                 goto out;
5570         }
5571         if (so->so_error) {     /* temporary udp error */
5572                 ret = 1;
5573                 goto out;
5574         }
5575         if (((so->so_state & SS_ISCONNECTED) == 0) &&
5576             (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
5577                 ret = 0;
5578                 goto out;
5579         }
5580         int64_t lowwat = so->so_snd.sb_lowat;
5581         if (kn->kn_sfflags & NOTE_LOWAT) {
5582                 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5583                         lowwat = so->so_snd.sb_hiwat;
5584                 else if (kn->kn_sdata > lowwat)
5585                         lowwat = kn->kn_sdata;
5586         }
5587         if (kn->kn_data >= lowwat) {
5588                 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5589                         if ((SOCK_DOM(so) == PF_INET
5590                             || SOCK_DOM(so) == PF_INET6)
5591                             && so->so_type == SOCK_STREAM) {
5592                                 ret = tcp_notsent_lowat_check(so);
5593                         }
5594 #if MPTCP
5595                         else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5596                             (SOCK_PROTO(so) == IPPROTO_TCP)) {
5597                                 ret = mptcp_notsent_lowat_check(so);
5598                         }
5599 #endif
5600                         else {
5601                                 return (1);
5602                         }
5603                 } else {
5604                         ret = 1;
5605                 }
5606         }
5607         if (so_wait_for_if_feedback(so))
5608                 ret = 0;
5609 out:
5610         if ((hint & SO_FILT_HINT_LOCKED) == 0)
5611                 socket_unlock(so, 1);
5612         return (ret);
5613 }
5614
5615 static void
5616 filt_sockdetach(struct knote *kn)
5617 {
5618         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5619         socket_lock(so, 1);
5620
5621         if ((so->so_flags & SOF_KNOTE) != 0)
5622                 if (KNOTE_DETACH(&so->so_klist, kn))
5623                         so->so_flags &= ~SOF_KNOTE;
5624         socket_unlock(so, 1);
5625 }
5626
5627 static int
5628 filt_sockev(struct knote *kn, long hint)
5629 {
5630         int ret = 0, locked = 0;
5631         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5632         long ev_hint = (hint & SO_FILT_HINT_EV);
5633
5634         if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5635                 socket_lock(so, 1);
5636                 locked = 1;
5637         }
5638
5639         if (ev_hint & SO_FILT_HINT_CONNRESET) {
5640                 if (kn->kn_sfflags & NOTE_CONNRESET)
5641                         kn->kn_fflags |= NOTE_CONNRESET;
5642         }
5643         if (ev_hint & SO_FILT_HINT_TIMEOUT) {
5644                 if (kn->kn_sfflags & NOTE_TIMEOUT)
5645                         kn->kn_fflags |= NOTE_TIMEOUT;
5646         }
5647         if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
5648                 if (kn->kn_sfflags & NOTE_NOSRCADDR)
5649                         kn->kn_fflags |= NOTE_NOSRCADDR;
5650         }
5651         if (ev_hint & SO_FILT_HINT_IFDENIED) {
5652                 if ((kn->kn_sfflags & NOTE_IFDENIED))
5653                         kn->kn_fflags |= NOTE_IFDENIED;
5654         }
5655         if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
5656                 if (kn->kn_sfflags & NOTE_KEEPALIVE)
5657                         kn->kn_fflags |= NOTE_KEEPALIVE;
5658         }
5659         if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
5660                 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
5661                         kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
5662         }
5663         if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
5664                 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
5665                         kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
5666         }
5667         if (ev_hint & SO_FILT_HINT_CONNECTED) {
5668                 if (kn->kn_sfflags & NOTE_CONNECTED)
5669                         kn->kn_fflags |= NOTE_CONNECTED;
5670         }
5671         if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
5672                 if (kn->kn_sfflags & NOTE_DISCONNECTED)
5673                         kn->kn_fflags |= NOTE_DISCONNECTED;
5674         }
5675         if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
5676                 if (so->so_proto != NULL &&
5677                     (so->so_proto->pr_flags & PR_EVCONNINFO) &&
5678                     (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
5679                         kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
5680         }
5681
5682         if ((kn->kn_sfflags & NOTE_READCLOSED) &&
5683             (so->so_state & SS_CANTRCVMORE)
5684 #if CONTENT_FILTER
5685                 && cfil_sock_data_pending(&so->so_rcv) == 0
5686 #endif /* CONTENT_FILTER */
5687                 )
5688                 kn->kn_fflags |= NOTE_READCLOSED;
5689
5690         if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
5691             (so->so_state & SS_CANTSENDMORE))
5692                 kn->kn_fflags |= NOTE_WRITECLOSED;
5693
5694         if ((kn->kn_sfflags & NOTE_SUSPEND) &&
5695             ((ev_hint & SO_FILT_HINT_SUSPEND) ||
5696             (so->so_flags & SOF_SUSPENDED))) {
5697                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5698                 kn->kn_fflags |= NOTE_SUSPEND;
5699         }
5700
5701         if ((kn->kn_sfflags & NOTE_RESUME) &&
5702             ((ev_hint & SO_FILT_HINT_RESUME) ||
5703             (so->so_flags & SOF_SUSPENDED) == 0)) {
5704                 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5705                 kn->kn_fflags |= NOTE_RESUME;
5706         }
5707
5708         if (so->so_error != 0) {
5709                 ret = 1;
5710                 kn->kn_data = so->so_error;
5711                 kn->kn_flags |= EV_EOF;
5712         } else {
5713                 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
5714         }
5715
5716         if (kn->kn_fflags != 0)
5717                 ret = 1;
5718
5719         if (locked)
5720                 socket_unlock(so, 1);
5721
5722         return (ret);
5723 }
5724
5725 void
5726 get_sockev_state(struct socket *so, u_int32_t *statep)
5727 {
5728         u_int32_t state = *(statep);
5729
5730         if (so->so_state & SS_ISCONNECTED)
5731                 state |= SOCKEV_CONNECTED;
5732         else
5733                 state &= ~(SOCKEV_CONNECTED);
5734         state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
5735         *(statep) = state;
5736 }
5737
5738 #define SO_LOCK_HISTORY_STR_LEN \
5739         (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
5740
5741 __private_extern__ const char *
5742 solockhistory_nr(struct socket *so)
5743 {
5744         size_t n = 0;
5745         int i;
5746         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
5747
5748         bzero(lock_history_str, sizeof (lock_history_str));
5749         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
5750                 n += snprintf(lock_history_str + n,
5751                     SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
5752                     so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
5753                     so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
5754         }
5755         return (lock_history_str);
5756 }
5757
5758 int
5759 socket_lock(struct socket *so, int refcount)
5760 {
5761         int error = 0;
5762         void *lr_saved;
5763
5764         lr_saved = __builtin_return_address(0);
5765
5766         if (so->so_proto->pr_lock) {
5767                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
5768         } else {
5769 #ifdef MORE_LOCKING_DEBUG
5770                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
5771                     LCK_MTX_ASSERT_NOTOWNED);
5772 #endif
5773                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
5774                 if (refcount)
5775                         so->so_usecount++;
5776                 so->lock_lr[so->next_lock_lr] = lr_saved;
5777                 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
5778         }
5779
5780         return (error);
5781 }
5782
5783 int
5784 socket_unlock(struct socket *so, int refcount)
5785 {
5786         int error = 0;
5787         void *lr_saved;
5788         lck_mtx_t *mutex_held;
5789
5790         lr_saved = __builtin_return_address(0);
5791
5792         if (so->so_proto == NULL) {
5793                 panic("%s: null so_proto so=%p\n", __func__, so);
5794                 /* NOTREACHED */
5795         }
5796
5797         if (so && so->so_proto->pr_unlock) {
5798                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
5799         } else {
5800                 mutex_held = so->so_proto->pr_domain->dom_mtx;
5801 #ifdef MORE_LOCKING_DEBUG
5802                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5803 #endif
5804                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
5805                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5806
5807                 if (refcount) {
5808                         if (so->so_usecount <= 0) {
5809                                 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
5810                                     "lrh=%s", __func__, so->so_usecount, so,
5811                                     SOCK_DOM(so), so->so_type,
5812                                     SOCK_PROTO(so), solockhistory_nr(so));
5813                                 /* NOTREACHED */
5814                         }
5815
5816                         so->so_usecount--;
5817                         if (so->so_usecount == 0)
5818                                 sofreelastref(so, 1);
5819                 }
5820                 lck_mtx_unlock(mutex_held);
5821         }
5822
5823         return (error);
5824 }
5825
5826 /* Called with socket locked, will unlock socket */
5827 void
5828 sofree(struct socket *so)
5829 {
5830         lck_mtx_t *mutex_held;
5831
5832         if (so->so_proto->pr_getlock != NULL)
5833                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
5834         else
5835                 mutex_held = so->so_proto->pr_domain->dom_mtx;
5836         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5837
5838         sofreelastref(so, 0);
5839 }
5840
/*
 * Take one reference on the socket: lock it with a reference, then unlock
 * it without giving the reference back.
 */
void
soreference(struct socket *so)
{
	/* lock and take one reference on the socket */
	socket_lock(so, 1);
	/* unlock only; the reference is kept */
	socket_unlock(so, 0);
}
5847
/*
 * Drop one reference on the socket: lock it without taking a reference,
 * then unlock it while releasing one.
 */
void
sodereference(struct socket *so)
{
	/* lock only */
	socket_lock(so, 0);
	/* unlock and release one reference */
	socket_unlock(so, 1);
}
5854
5855 /*
5856  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
5857  * possibility of using jumbo clusters.  Caller must ensure to hold
5858  * the socket lock.
5859  */
5860 void
5861 somultipages(struct socket *so, boolean_t set)
5862 {
5863         if (set)
5864                 so->so_flags |= SOF_MULTIPAGES;
5865         else
5866                 so->so_flags &= ~SOF_MULTIPAGES;
5867 }
5868
5869 void
5870 soif2kcl(struct socket *so, boolean_t set)
5871 {
5872         if (set)
5873                 so->so_flags1 |= SOF1_IF_2KCL;
5874         else
5875                 so->so_flags1 &= ~SOF1_IF_2KCL;
5876 }
5877
5878 int
5879 so_isdstlocal(struct socket *so) {
5880
5881         struct inpcb *inp = (struct inpcb *)so->so_pcb;
5882
5883         if (SOCK_DOM(so) == PF_INET)
5884                 return (inaddr_local(inp->inp_faddr));
5885         else if (SOCK_DOM(so) == PF_INET6)
5886                 return (in6addr_local(&inp->in6p_faddr));
5887
5888         return (0);
5889 }
5890
5891 int
5892 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
5893 {
5894         struct sockbuf *rcv, *snd;
5895         int err = 0, defunct;
5896
5897         rcv = &so->so_rcv;
5898         snd = &so->so_snd;
5899
5900         defunct = (so->so_flags & SOF_DEFUNCT);
5901         if (defunct) {
5902                 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
5903                         panic("%s: SB_DROP not set", __func__);
5904                         /* NOTREACHED */
5905                 }
5906                 goto done;
5907         }
5908
5909         if (so->so_flags & SOF_NODEFUNCT) {
5910                 if (noforce) {
5911                         err = EOPNOTSUPP;
5912                         SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
5913                             "so 0x%llx [%d,%d] is not eligible for defunct "
5914                             "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
5915                             level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5916                             SOCK_DOM(so), SOCK_TYPE(so), err));
5917                         return (err);
5918                 }
5919                 so->so_flags &= ~SOF_NODEFUNCT;
5920                 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5921                     "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
5922                     proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5923                     SOCK_DOM(so), SOCK_TYPE(so)));
5924         }
5925
5926         so->so_flags |= SOF_DEFUNCT;
5927
5928         /* Prevent further data from being appended to the socket buffers */
5929         snd->sb_flags |= SB_DROP;
5930         rcv->sb_flags |= SB_DROP;
5931
5932         /* Flush any existing data in the socket buffers */
5933         if (rcv->sb_cc != 0) {
5934                 rcv->sb_flags &= ~SB_SEL;
5935                 selthreadclear(&rcv->sb_sel);
5936                 sbrelease(rcv);
5937         }
5938         if (snd->sb_cc != 0) {
5939                 snd->sb_flags &= ~SB_SEL;
5940                 selthreadclear(&snd->sb_sel);
5941                 sbrelease(snd);
5942         }
5943
5944 done:
5945         SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
5946             "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
5947             (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
5948             defunct ? "is already" : "marked as"));
5949
5950         return (err);
5951 }
5952
5953 int
5954 sodefunct(struct proc *p, struct socket *so, int level)
5955 {
5956         struct sockbuf *rcv, *snd;
5957
5958         if (!(so->so_flags & SOF_DEFUNCT)) {
5959                 panic("%s improperly called", __func__);
5960                 /* NOTREACHED */
5961         }
5962         if (so->so_state & SS_DEFUNCT)
5963                 goto done;
5964
5965         rcv = &so->so_rcv;
5966         snd = &so->so_snd;
5967
5968         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5969                 char s[MAX_IPv6_STR_LEN];
5970                 char d[MAX_IPv6_STR_LEN];
5971                 struct inpcb *inp = sotoinpcb(so);
5972
5973                 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
5974                     "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5975                     "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5976                     proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5977                     (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
5978                     inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
5979                     (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
5980                     s, sizeof (s)), ntohs(inp->in6p_lport),
5981                     inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
5982                     (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
5983                     d, sizeof (d)), ntohs(inp->in6p_fport),
5984                     (uint32_t)rcv->sb_sel.si_flags,
5985                     (uint32_t)snd->sb_sel.si_flags,
5986                     rcv->sb_flags, snd->sb_flags));
5987         } else {
5988                 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5989                     "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5990                     "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5991                     proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5992                     SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
5993                     (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
5994                     snd->sb_flags));
5995         }
5996
5997         /*
5998          * Unwedge threads blocked on sbwait() and sb_lock().
5999          */
6000         sbwakeup(rcv);
6001         sbwakeup(snd);
6002
6003         so->so_flags1 |= SOF1_DEFUNCTINPROG;
6004         if (rcv->sb_flags & SB_LOCK)
6005                 sbunlock(rcv, TRUE);    /* keep socket locked */
6006         if (snd->sb_flags & SB_LOCK)
6007                 sbunlock(snd, TRUE);    /* keep socket locked */
6008
6009         /*
6010          * Flush the buffers and disconnect.  We explicitly call shutdown
6011          * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6012          * states are set for the socket.  This would also flush out data
6013          * hanging off the receive list of this socket.
6014          */
6015         (void) soshutdownlock_final(so, SHUT_RD);
6016         (void) soshutdownlock_final(so, SHUT_WR);
6017         (void) sodisconnectlocked(so);
6018
6019         /*
6020          * Explicitly handle connectionless-protocol disconnection
6021          * and release any remaining data in the socket buffers.
6022          */
6023         if (!(so->so_flags & SS_ISDISCONNECTED))
6024                 (void) soisdisconnected(so);
6025
6026         if (so->so_error == 0)
6027                 so->so_error = EBADF;
6028
6029         if (rcv->sb_cc != 0) {
6030                 rcv->sb_flags &= ~SB_SEL;
6031                 selthreadclear(&rcv->sb_sel);
6032                 sbrelease(rcv);
6033         }
6034         if (snd->sb_cc != 0) {
6035                 snd->sb_flags &= ~SB_SEL;
6036                 selthreadclear(&snd->sb_sel);
6037                 sbrelease(snd);
6038         }
6039         so->so_state |= SS_DEFUNCT;
6040
6041 done:
6042         return (0);
6043 }
6044
6045 __private_extern__ int
6046 so_set_recv_anyif(struct socket *so, int optval)
6047 {
6048         int ret = 0;
6049
6050 #if INET6
6051         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6052 #else
6053         if (SOCK_DOM(so) == PF_INET) {
6054 #endif /* !INET6 */
6055                 if (optval)
6056                         sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6057                 else
6058                         sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6059         }
6060
6061         return (ret);
6062 }
6063
6064 __private_extern__ int
6065 so_get_recv_anyif(struct socket *so)
6066 {
6067         int ret = 0;
6068
6069 #if INET6
6070         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6071 #else
6072         if (SOCK_DOM(so) == PF_INET) {
6073 #endif /* !INET6 */
6074                 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6075         }
6076
6077         return (ret);
6078 }
6079
6080 int
6081 so_set_restrictions(struct socket *so, uint32_t vals)
6082 {
6083         int nocell_old, nocell_new;
6084         int noexpensive_old, noexpensive_new;
6085
6086         /*
6087          * Deny-type restrictions are trapdoors; once set they cannot be
6088          * unset for the lifetime of the socket.  This allows them to be
6089          * issued by a framework on behalf of the application without
6090          * having to worry that they can be undone.
6091          *
6092          * Note here that socket-level restrictions overrides any protocol
6093          * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
6094          * socket restriction issued on the socket has a higher precendence
6095          * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
6096          * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6097          * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6098          */
6099         nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6100         noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6101         so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6102             SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6103             SO_RESTRICT_DENY_EXPENSIVE));
6104         nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6105         noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6106
6107         /* we can only set, not clear restrictions */
6108         if ((nocell_new - nocell_old) == 0 &&
6109             (noexpensive_new - noexpensive_old) == 0)
6110                 return (0);
6111 #if INET6
6112         if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6113 #else
6114         if (SOCK_DOM(so) == PF_INET) {
6115 #endif /* !INET6 */
6116                 if (nocell_new - nocell_old != 0) {
6117                         /* if deny cellular is now set, do what's needed for INPCB */
6118                         inp_set_nocellular(sotoinpcb(so));
6119                 }
6120                 if (noexpensive_new - noexpensive_old != 0) {
6121                         inp_set_noexpensive(sotoinpcb(so));
6122                 }
6123         }
6124
6125         return (0);
6126 }
6127
6128 uint32_t
6129 so_get_restrictions(struct socket *so)
6130 {
6131         return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6132             SO_RESTRICT_DENY_OUT |
6133             SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6134 }
6135
6136 struct sockaddr_entry *
6137 sockaddrentry_alloc(int how)
6138 {
6139         struct sockaddr_entry *se;
6140
6141         se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6142         if (se != NULL)
6143                 bzero(se, se_zone_size);
6144
6145         return (se);
6146 }
6147
6148 void
6149 sockaddrentry_free(struct sockaddr_entry *se)
6150 {
6151         if (se->se_addr != NULL) {
6152                 FREE(se->se_addr, M_SONAME);
6153                 se->se_addr = NULL;
6154         }
6155         zfree(se_zone, se);
6156 }
6157
6158 struct sockaddr_entry *
6159 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6160 {
6161         struct sockaddr_entry *dst_se;
6162
6163         dst_se = sockaddrentry_alloc(how);
6164         if (dst_se != NULL) {
6165                 int len = src_se->se_addr->sa_len;
6166
6167                 MALLOC(dst_se->se_addr, struct sockaddr *,
6168                     len, M_SONAME, how | M_ZERO);
6169                 if (dst_se->se_addr != NULL) {
6170                         bcopy(src_se->se_addr, dst_se->se_addr, len);
6171                 } else {
6172                         sockaddrentry_free(dst_se);
6173                         dst_se = NULL;
6174                 }
6175         }
6176
6177         return (dst_se);
6178 }
6179
6180 struct sockaddr_list *
6181 sockaddrlist_alloc(int how)
6182 {
6183         struct sockaddr_list *sl;
6184
6185         sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6186         if (sl != NULL) {
6187                 bzero(sl, sl_zone_size);
6188                 TAILQ_INIT(&sl->sl_head);
6189         }
6190         return (sl);
6191 }
6192
6193 void
6194 sockaddrlist_free(struct sockaddr_list *sl)
6195 {
6196         struct sockaddr_entry *se, *tse;
6197
6198         TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6199                 sockaddrlist_remove(sl, se);
6200                 sockaddrentry_free(se);
6201         }
6202         VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6203         zfree(sl_zone, sl);
6204 }
6205
6206 void
6207 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6208 {
6209         VERIFY(!(se->se_flags & SEF_ATTACHED));
6210         se->se_flags |= SEF_ATTACHED;
6211         TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6212         sl->sl_cnt++;
6213         VERIFY(sl->sl_cnt != 0);
6214 }
6215
6216 void
6217 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6218 {
6219         VERIFY(se->se_flags & SEF_ATTACHED);
6220         se->se_flags &= ~SEF_ATTACHED;
6221         VERIFY(sl->sl_cnt != 0);
6222         sl->sl_cnt--;
6223         TAILQ_REMOVE(&sl->sl_head, se, se_link);
6224 }
6225
6226 struct sockaddr_list *
6227 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6228 {
6229         struct sockaddr_entry *src_se, *tse;
6230         struct sockaddr_list *dst_sl;
6231
6232         dst_sl = sockaddrlist_alloc(how);
6233         if (dst_sl == NULL)
6234                 return (NULL);
6235
6236         TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6237                 struct sockaddr_entry *dst_se;
6238
6239                 if (src_se->se_addr == NULL)
6240                         continue;
6241
6242                 dst_se = sockaddrentry_dup(src_se, how);
6243                 if (dst_se == NULL) {
6244                         sockaddrlist_free(dst_sl);
6245                         return (NULL);
6246                 }
6247
6248                 sockaddrlist_insert(dst_sl, dst_se);
6249         }
6250         VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6251
6252         return (dst_sl);
6253 }
6254
6255 int
6256 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6257 {
6258         struct proc *ep = PROC_NULL;
6259         int error = 0;
6260
6261         /* pid 0 is reserved for kernel */
6262         if (epid == 0) {
6263                 error = EINVAL;
6264                 goto done;
6265         }
6266
6267         /*
6268          * If this is an in-kernel socket, prevent its delegate
6269          * association from changing unless the socket option is
6270          * coming from within the kernel itself.
6271          */
6272         if (so->last_pid == 0 && p != kernproc) {
6273                 error = EACCES;
6274                 goto done;
6275         }
6276
6277         /*
6278          * If this is issued by a process that's recorded as the
6279          * real owner of the socket, or if the pid is the same as
6280          * the process's own pid, then proceed.  Otherwise ensure
6281          * that the issuing process has the necessary privileges.
6282          */
6283         if (epid != so->last_pid || epid != proc_pid(p)) {
6284                 if ((error = priv_check_cred(kauth_cred_get(),
6285                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6286                         error = EACCES;
6287                         goto done;
6288                 }
6289         }
6290
6291         /* Find the process that corresponds to the effective pid */
6292         if ((ep = proc_find(epid)) == PROC_NULL) {
6293                 error = ESRCH;
6294                 goto done;
6295         }
6296
6297         /*
6298          * If a process tries to delegate the socket to itself, then
6299          * there's really nothing to do; treat it as a way for the
6300          * delegate association to be cleared.  Note that we check
6301          * the passed-in proc rather than calling proc_selfpid(),
6302          * as we need to check the process issuing the socket option
6303          * which could be kernproc.  Given that we don't allow 0 for
6304          * effective pid, it means that a delegated in-kernel socket
6305          * stays delegated during its lifetime (which is probably OK.)
6306          */
6307         if (epid == proc_pid(p)) {
6308                 so->so_flags &= ~SOF_DELEGATED;
6309                 so->e_upid = 0;
6310                 so->e_pid = 0;
6311                 uuid_clear(so->e_uuid);
6312         } else {
6313                 so->so_flags |= SOF_DELEGATED;
6314                 so->e_upid = proc_uniqueid(ep);
6315                 so->e_pid = proc_pid(ep);
6316                 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6317         }
6318 done:
6319         if (error == 0 && net_io_policy_log) {
6320                 uuid_string_t buf;
6321
6322                 uuid_unparse(so->e_uuid, buf);
6323                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6324                     "euuid %s%s\n", __func__, proc_name_address(p),
6325                     proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6326                     SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
6327                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6328         } else if (error != 0 && net_io_policy_log) {
6329                 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6330                     "ERROR (%d)\n", __func__, proc_name_address(p),
6331                     proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6332                     SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
6333                     proc_name_address(ep), error);
6334         }
6335
6336         /* Update this socket's policy upon success */
6337         if (error == 0) {
6338                 so->so_policy_gencnt *= -1;
6339                 so_update_policy(so);
6340 #if NECP
6341                 so_update_necp_policy(so, NULL, NULL);
6342 #endif /* NECP */
6343         }
6344
6345         if (ep != PROC_NULL)
6346                 proc_rele(ep);
6347
6348         return (error);
6349 }
6350
6351 int
6352 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
6353 {
6354         uuid_string_t buf;
6355         uuid_t uuid;
6356         int error = 0;
6357
6358         /* UUID must not be all-zeroes (reserved for kernel) */
6359         if (uuid_is_null(euuid)) {
6360                 error = EINVAL;
6361                 goto done;;
6362         }
6363
6364         /*
6365          * If this is an in-kernel socket, prevent its delegate
6366          * association from changing unless the socket option is
6367          * coming from within the kernel itself.
6368          */
6369         if (so->last_pid == 0 && p != kernproc) {
6370                 error = EACCES;
6371                 goto done;
6372         }
6373
6374         /* Get the UUID of the issuing process */
6375         proc_getexecutableuuid(p, uuid, sizeof (uuid));
6376
6377         /*
6378          * If this is issued by a process that's recorded as the
6379          * real owner of the socket, or if the uuid is the same as
6380          * the process's own uuid, then proceed.  Otherwise ensure
6381          * that the issuing process has the necessary privileges.
6382          */
6383         if (uuid_compare(euuid, so->last_uuid) != 0 ||
6384             uuid_compare(euuid, uuid) != 0) {
6385                 if ((error = priv_check_cred(kauth_cred_get(),
6386                     PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6387                         error = EACCES;
6388                         goto done;
6389                 }
6390         }
6391
6392         /*
6393          * If a process tries to delegate the socket to itself, then
6394          * there's really nothing to do; treat it as a way for the
6395          * delegate association to be cleared.  Note that we check
6396          * the uuid of the passed-in proc rather than that of the
6397          * current process, as we need to check the process issuing
6398          * the socket option which could be kernproc itself.  Given
6399          * that we don't allow 0 for effective uuid, it means that
6400          * a delegated in-kernel socket stays delegated during its
6401          * lifetime (which is okay.)
6402          */
6403         if (uuid_compare(euuid, uuid) == 0) {
6404                 so->so_flags &= ~SOF_DELEGATED;
6405                 so->e_upid = 0;
6406                 so->e_pid = 0;
6407                 uuid_clear(so->e_uuid);
6408         } else {
6409                 so->so_flags |= SOF_DELEGATED;
6410                 /*
6411                  * Unlike so_set_effective_pid(), we only have the UUID
6412                  * here and the process ID is not known.  Inherit the
6413                  * real {pid,upid} of the socket.
6414                  */
6415                 so->e_upid = so->last_upid;
6416                 so->e_pid = so->last_pid;
6417                 uuid_copy(so->e_uuid, euuid);
6418         }
6419
6420 done:
6421         if (error == 0 && net_io_policy_log) {
6422                 uuid_unparse(so->e_uuid, buf);
6423                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
6424                     "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
6425                     (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6426                     SOCK_TYPE(so), so->e_pid, buf,
6427                     ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6428         } else if (error != 0 && net_io_policy_log) {
6429                 uuid_unparse(euuid, buf);
6430                 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
6431                     "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
6432                     (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6433                     SOCK_TYPE(so), buf, error);
6434         }
6435
6436         /* Update this socket's policy upon success */
6437         if (error == 0) {
6438                 so->so_policy_gencnt *= -1;
6439                 so_update_policy(so);
6440 #if NECP
6441                 so_update_necp_policy(so, NULL, NULL);
6442 #endif /* NECP */
6443         }
6444
6445         return (error);
6446 }
6447
6448 void
6449 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
6450     uint32_t ev_datalen)
6451 {
6452         struct kev_msg ev_msg;
6453
6454         /*
6455          * A netpolicy event always starts with a netpolicy_event_data
6456          * structure, but the caller can provide for a longer event
6457          * structure to post, depending on the event code.
6458          */
6459         VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
6460
6461         bzero(&ev_msg, sizeof (ev_msg));
6462         ev_msg.vendor_code      = KEV_VENDOR_APPLE;
6463         ev_msg.kev_class        = KEV_NETWORK_CLASS;
6464         ev_msg.kev_subclass     = KEV_NETPOLICY_SUBCLASS;
6465         ev_msg.event_code       = ev_code;
6466
6467         ev_msg.dv[0].data_ptr   = ev_data;
6468         ev_msg.dv[0].data_length = ev_datalen;
6469
6470         kev_post_msg(&ev_msg);
6471 }
6472
6473 void
6474 socket_post_kev_msg(uint32_t ev_code,
6475     struct kev_socket_event_data *ev_data,
6476     uint32_t ev_datalen)
6477 {
6478         struct kev_msg ev_msg;
6479
6480         bzero(&ev_msg, sizeof(ev_msg));
6481         ev_msg.vendor_code = KEV_VENDOR_APPLE;
6482         ev_msg.kev_class = KEV_NETWORK_CLASS;
6483         ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
6484         ev_msg.event_code = ev_code;
6485
6486         ev_msg.dv[0].data_ptr = ev_data;
6487         ev_msg.dv[0]. data_length = ev_datalen;
6488
6489         kev_post_msg(&ev_msg);
6490 }
6491
6492 void
6493 socket_post_kev_msg_closed(struct socket *so)
6494 {
6495         struct kev_socket_closed ev;
6496         struct sockaddr *socksa = NULL, *peersa = NULL;
6497         int err;
6498         bzero(&ev, sizeof(ev));
6499         err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
6500         if (err == 0) {
6501                 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
6502                     &peersa);
6503                 if (err == 0) {
6504                         memcpy(&ev.ev_data.kev_sockname, socksa,
6505                             min(socksa->sa_len,
6506                             sizeof (ev.ev_data.kev_sockname)));
6507                         memcpy(&ev.ev_data.kev_peername, peersa,
6508                             min(peersa->sa_len,
6509                             sizeof (ev.ev_data.kev_peername)));
6510                         socket_post_kev_msg(KEV_SOCKET_CLOSED,
6511                             &ev.ev_data, sizeof (ev));
6512                 }
6513         }
6514         if (socksa != NULL)
6515                 FREE(socksa, M_SONAME);
6516         if (peersa != NULL)
6517                 FREE(peersa, M_SONAME);
6518 }