2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
93 #include <sys/kdebug.h>
97 #include <sys/kern_event.h>
98 #include <net/route.h>
100 #include <net/ntstat.h>
101 #include <netinet/in.h>
102 #include <netinet/in_pcb.h>
103 #include <netinet/ip6.h>
104 #include <netinet6/ip6_var.h>
105 #include <netinet/flow_divert.h>
106 #include <kern/zalloc.h>
107 #include <kern/locks.h>
108 #include <machine/limits.h>
109 #include <libkern/OSAtomic.h>
110 #include <pexpert/pexpert.h>
111 #include <kern/assert.h>
112 #include <kern/task.h>
113 #include <sys/kpi_mbuf.h>
114 #include <sys/mcache.h>
117 #include <security/mac.h>
118 #include <security/mac_framework.h>
122 #include <netinet/mp_pcb.h>
123 #endif /* MULTIPATH */
125 /* TODO: this should be in a header file somewhere */
126 extern char *proc_name_address(void *p
);
128 static u_int32_t so_cache_hw
; /* High water mark for socache */
129 static u_int32_t so_cache_timeouts
; /* number of timeouts */
130 static u_int32_t so_cache_max_freed
; /* max freed per timeout */
131 static u_int32_t cached_sock_count
= 0;
132 STAILQ_HEAD(, socket
) so_cache_head
;
133 int max_cached_sock_count
= MAX_CACHED_SOCKETS
;
134 static u_int32_t so_cache_time
;
135 static int socketinit_done
;
136 static struct zone
*so_cache_zone
;
138 static lck_grp_t
*so_cache_mtx_grp
;
139 static lck_attr_t
*so_cache_mtx_attr
;
140 static lck_grp_attr_t
*so_cache_mtx_grp_attr
;
141 static lck_mtx_t
*so_cache_mtx
;
143 #include <machine/limits.h>
145 static void filt_sordetach(struct knote
*kn
);
146 static int filt_soread(struct knote
*kn
, long hint
);
147 static void filt_sowdetach(struct knote
*kn
);
148 static int filt_sowrite(struct knote
*kn
, long hint
);
149 static void filt_sockdetach(struct knote
*kn
);
150 static int filt_sockev(struct knote
*kn
, long hint
);
152 static int sooptcopyin_timeval(struct sockopt
*, struct timeval
*);
153 static int sooptcopyout_timeval(struct sockopt
*, const struct timeval
*);
155 static struct filterops soread_filtops
= {
157 .f_detach
= filt_sordetach
,
158 .f_event
= filt_soread
,
161 static struct filterops sowrite_filtops
= {
163 .f_detach
= filt_sowdetach
,
164 .f_event
= filt_sowrite
,
167 static struct filterops sock_filtops
= {
169 .f_detach
= filt_sockdetach
,
170 .f_event
= filt_sockev
,
173 #define EVEN_MORE_LOCKING_DEBUG 0
174 int socket_debug
= 0;
175 static int socket_zone
= M_SOCKET
;
176 so_gen_t so_gencnt
; /* generation count for sockets */
178 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
179 MALLOC_DEFINE(M_PCB
, "pcb", "protocol control block");
181 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
182 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
183 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
184 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
185 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
186 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
187 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
189 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
191 SYSCTL_DECL(_kern_ipc
);
193 int somaxconn
= SOMAXCONN
;
194 SYSCTL_INT(_kern_ipc
, KIPC_SOMAXCONN
, somaxconn
,
195 CTLFLAG_RW
| CTLFLAG_LOCKED
, &somaxconn
, 0, "");
197 /* Should we get a maximum also ??? */
198 static int sosendmaxchain
= 65536;
199 static int sosendminchain
= 16384;
200 static int sorecvmincopy
= 16384;
201 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendminchain
,
202 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendminchain
, 0, "");
203 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorecvmincopy
,
204 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sorecvmincopy
, 0, "");
207 * Set to enable jumbo clusters (if available) for large writes when
208 * the socket is marked with SOF_MULTIPAGES; see below.
211 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl
,
212 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl
, 0, "");
215 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
216 * writes on the socket for all protocols on any network interfaces,
217 * depending upon sosendjcl above. Be extra careful when setting this
218 * to 1, because sending down packets that cross physical pages down to
219 * broken drivers (those that falsely assume that the physical pages
220 * are contiguous) might lead to system panics or silent data corruption.
221 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
222 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
223 * capable. Set this to 1 only for testing/debugging purposes.
225 int sosendjcl_ignore_capab
= 0;
226 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl_ignore_capab
,
227 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl_ignore_capab
, 0, "");
229 int sodefunctlog
= 0;
230 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sodefunctlog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
231 &sodefunctlog
, 0, "");
233 int sothrottlelog
= 0;
234 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sothrottlelog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
235 &sothrottlelog
, 0, "");
237 int sorestrictrecv
= 1;
238 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorestrictrecv
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
239 &sorestrictrecv
, 0, "Enable inbound interface restrictions");
242 * Socket operation routines.
243 * These routines are called by the routines in
244 * sys_socket.c or from a system process, and
245 * implement the semantics of socket operations by
246 * switching out to the protocol specific routines.
250 extern void postevent(struct socket
*, struct sockbuf
*, int);
251 extern void evsofree(struct socket
*);
252 extern int tcp_notsent_lowat_check(struct socket
*so
);
253 extern struct inpcbinfo tcbinfo
;
255 /* TODO: these should be in header file */
256 extern int get_inpcb_str_size(void);
257 extern int get_tcp_str_size(void);
259 static unsigned int sl_zone_size
; /* size of sockaddr_list */
260 static struct zone
*sl_zone
; /* zone for sockaddr_list */
262 static unsigned int se_zone_size
; /* size of sockaddr_entry */
263 static struct zone
*se_zone
; /* zone for sockaddr_entry */
265 vm_size_t so_cache_zone_element_size
;
267 static int sodelayed_copy(struct socket
*, struct uio
*, struct mbuf
**, user_ssize_t
*);
268 static void cached_sock_alloc(struct socket
**, int);
269 static void cached_sock_free(struct socket
*);
272 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
273 * setting the DSCP code on the packet based on the service class; see
274 * <rdar://problem/11277343> for details.
276 __private_extern__ u_int32_t sotcdb
= SOTCDB_NO_DSCP
;
277 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sotcdb
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
283 if (socketinit_done
) {
284 printf("socketinit: already called...\n");
289 PE_parse_boot_argn("socket_debug", &socket_debug
,
290 sizeof (socket_debug
));
293 * allocate lock group attribute and group for socket cache mutex
295 so_cache_mtx_grp_attr
= lck_grp_attr_alloc_init();
296 so_cache_mtx_grp
= lck_grp_alloc_init("so_cache",
297 so_cache_mtx_grp_attr
);
300 * allocate the lock attribute for socket cache mutex
302 so_cache_mtx_attr
= lck_attr_alloc_init();
304 /* cached sockets mutex */
305 so_cache_mtx
= lck_mtx_alloc_init(so_cache_mtx_grp
, so_cache_mtx_attr
);
306 if (so_cache_mtx
== NULL
) {
307 panic("%s: unable to allocate so_cache_mtx\n", __func__
);
310 STAILQ_INIT(&so_cache_head
);
312 so_cache_zone_element_size
= (vm_size_t
)(sizeof (struct socket
) + 4
313 + get_inpcb_str_size() + 4 + get_tcp_str_size());
315 so_cache_zone
= zinit(so_cache_zone_element_size
,
316 (120000 * so_cache_zone_element_size
), 8192, "socache zone");
317 zone_change(so_cache_zone
, Z_CALLERACCT
, FALSE
);
318 zone_change(so_cache_zone
, Z_NOENCRYPT
, TRUE
);
320 sl_zone_size
= sizeof (struct sockaddr_list
);
321 if ((sl_zone
= zinit(sl_zone_size
, 1024 * sl_zone_size
, 1024,
322 "sockaddr_list")) == NULL
) {
323 panic("%s: unable to allocate sockaddr_list zone\n", __func__
);
326 zone_change(sl_zone
, Z_CALLERACCT
, FALSE
);
327 zone_change(sl_zone
, Z_EXPAND
, TRUE
);
329 se_zone_size
= sizeof (struct sockaddr_entry
);
330 if ((se_zone
= zinit(se_zone_size
, 1024 * se_zone_size
, 1024,
331 "sockaddr_entry")) == NULL
) {
332 panic("%s: unable to allocate sockaddr_entry zone\n", __func__
);
335 zone_change(se_zone
, Z_CALLERACCT
, FALSE
);
336 zone_change(se_zone
, Z_EXPAND
, TRUE
);
341 socket_tclass_init();
344 #endif /* MULTIPATH */
348 cached_sock_alloc(struct socket
**so
, int waitok
)
353 lck_mtx_lock(so_cache_mtx
);
355 if (!STAILQ_EMPTY(&so_cache_head
)) {
356 VERIFY(cached_sock_count
> 0);
358 *so
= STAILQ_FIRST(&so_cache_head
);
359 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
360 STAILQ_NEXT((*so
), so_cache_ent
) = NULL
;
363 lck_mtx_unlock(so_cache_mtx
);
365 temp
= (*so
)->so_saved_pcb
;
366 bzero((caddr_t
)*so
, sizeof (struct socket
));
368 (*so
)->so_saved_pcb
= temp
;
371 lck_mtx_unlock(so_cache_mtx
);
374 *so
= (struct socket
*)zalloc(so_cache_zone
);
376 *so
= (struct socket
*)zalloc_noblock(so_cache_zone
);
381 bzero((caddr_t
)*so
, sizeof (struct socket
));
384 * Define offsets for extra structures into our
385 * single block of memory. Align extra structures
386 * on longword boundaries.
389 offset
= (uintptr_t)*so
;
390 offset
+= sizeof (struct socket
);
392 offset
= ALIGN(offset
);
394 (*so
)->so_saved_pcb
= (caddr_t
)offset
;
395 offset
+= get_inpcb_str_size();
397 offset
= ALIGN(offset
);
399 ((struct inpcb
*)(void *)(*so
)->so_saved_pcb
)->inp_saved_ppcb
=
403 (*so
)->cached_in_sock_layer
= true;
407 cached_sock_free(struct socket
*so
)
410 lck_mtx_lock(so_cache_mtx
);
412 so_cache_time
= net_uptime();
413 if (++cached_sock_count
> max_cached_sock_count
) {
415 lck_mtx_unlock(so_cache_mtx
);
416 zfree(so_cache_zone
, so
);
418 if (so_cache_hw
< cached_sock_count
)
419 so_cache_hw
= cached_sock_count
;
421 STAILQ_INSERT_TAIL(&so_cache_head
, so
, so_cache_ent
);
423 so
->cache_timestamp
= so_cache_time
;
424 lck_mtx_unlock(so_cache_mtx
);
429 so_update_last_owner_locked(struct socket
*so
, proc_t self
)
431 if (so
->last_pid
!= 0) {
433 * last_pid and last_upid should remain zero for sockets
434 * created using sock_socket. The check above achieves that
436 if (self
== PROC_NULL
)
437 self
= current_proc();
439 if (so
->last_upid
!= proc_uniqueid(self
) ||
440 so
->last_pid
!= proc_pid(self
)) {
441 so
->last_upid
= proc_uniqueid(self
);
442 so
->last_pid
= proc_pid(self
);
443 proc_getexecutableuuid(self
, so
->last_uuid
,
444 sizeof (so
->last_uuid
));
450 so_update_policy(struct socket
*so
)
452 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)
453 (void) inp_update_policy(sotoinpcb(so
));
461 boolean_t rc
= FALSE
;
463 lck_mtx_lock(so_cache_mtx
);
465 so_cache_time
= net_uptime();
467 while (!STAILQ_EMPTY(&so_cache_head
)) {
468 VERIFY(cached_sock_count
> 0);
469 p
= STAILQ_FIRST(&so_cache_head
);
470 if ((so_cache_time
- p
->cache_timestamp
) <
474 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
477 zfree(so_cache_zone
, p
);
479 if (++n_freed
>= SO_CACHE_MAX_FREE_BATCH
) {
480 so_cache_max_freed
++;
485 /* Schedule again if there is more to cleanup */
486 if (!STAILQ_EMPTY(&so_cache_head
))
489 lck_mtx_unlock(so_cache_mtx
);
494 * Get a socket structure from our zone, and initialize it.
495 * We don't implement `waitok' yet (see comments in uipc_domain.c).
496 * Note that it would probably be better to allocate socket
497 * and PCB at the same time, but I'm not convinced that all
498 * the protocols can be easily modified to do this.
501 soalloc(int waitok
, int dom
, int type
)
505 if ((dom
== PF_INET
) && (type
== SOCK_STREAM
)) {
506 cached_sock_alloc(&so
, waitok
);
508 MALLOC_ZONE(so
, struct socket
*, sizeof (*so
), socket_zone
,
511 bzero(so
, sizeof (*so
));
514 so
->so_gencnt
= ++so_gencnt
;
515 so
->so_zone
= socket_zone
;
516 #if CONFIG_MACF_SOCKET
517 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
518 if (mac_socket_label_init(so
, !waitok
) != 0) {
522 #endif /* MAC_SOCKET */
529 socreate_internal(int dom
, struct socket
**aso
, int type
, int proto
,
530 struct proc
*p
, uint32_t flags
, struct proc
*ep
)
537 extern int tcpconsdebug
;
544 prp
= pffindproto(dom
, proto
, type
);
546 prp
= pffindtype(dom
, type
);
548 if (prp
== NULL
|| prp
->pr_usrreqs
->pru_attach
== NULL
) {
549 if (pffinddomain(dom
) == NULL
)
550 return (EAFNOSUPPORT
);
552 if (pffindprotonotype(dom
, proto
) != NULL
)
555 return (EPROTONOSUPPORT
);
557 if (prp
->pr_type
!= type
)
559 so
= soalloc(1, dom
, type
);
563 if (flags
& SOCF_ASYNC
)
564 so
->so_state
|= SS_NBIO
;
566 if (flags
& SOCF_MP_SUBFLOW
) {
568 * A multipath subflow socket is used internally in the kernel,
569 * therefore it does not have a file desciptor associated by
572 so
->so_state
|= SS_NOFDREF
;
573 so
->so_flags
|= SOF_MP_SUBFLOW
;
575 #endif /* MULTIPATH */
577 TAILQ_INIT(&so
->so_incomp
);
578 TAILQ_INIT(&so
->so_comp
);
580 so
->last_upid
= proc_uniqueid(p
);
581 so
->last_pid
= proc_pid(p
);
582 proc_getexecutableuuid(p
, so
->last_uuid
, sizeof (so
->last_uuid
));
584 if (ep
!= PROC_NULL
&& ep
!= p
) {
585 so
->e_upid
= proc_uniqueid(ep
);
586 so
->e_pid
= proc_pid(ep
);
587 proc_getexecutableuuid(ep
, so
->e_uuid
, sizeof (so
->e_uuid
));
588 so
->so_flags
|= SOF_DELEGATED
;
591 so
->so_cred
= kauth_cred_proc_ref(p
);
592 if (!suser(kauth_cred_get(), NULL
))
593 so
->so_state
|= SS_PRIV
;
596 so
->so_rcv
.sb_flags
|= SB_RECV
;
597 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
598 so
->next_lock_lr
= 0;
599 so
->next_unlock_lr
= 0;
601 #if CONFIG_MACF_SOCKET
602 mac_socket_label_associate(kauth_cred_get(), so
);
603 #endif /* MAC_SOCKET */
606 * Attachment will create the per pcb lock if necessary and
607 * increase refcount for creation, make sure it's done before
608 * socket is inserted in lists.
612 error
= (*prp
->pr_usrreqs
->pru_attach
)(so
, proto
, p
);
616 * If so_pcb is not zero, the socket will be leaked,
617 * so protocol attachment handler must be coded carefuly
619 so
->so_state
|= SS_NOFDREF
;
621 sofreelastref(so
, 1); /* will deallocate the socket */
625 atomic_add_32(&prp
->pr_domain
->dom_refs
, 1);
626 TAILQ_INIT(&so
->so_evlist
);
628 /* Attach socket filters for this protocol */
631 if (tcpconsdebug
== 2)
632 so
->so_options
|= SO_DEBUG
;
634 so_set_default_traffic_class(so
);
637 * If this thread or task is marked to create backgrounded sockets,
638 * mark the socket as background.
640 if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG
)) {
641 socket_set_traffic_mgt_flags(so
, TRAFFIC_MGT_SO_BACKGROUND
);
642 so
->so_background_thread
= current_thread();
647 * Don't mark Unix domain, system or multipath sockets as
648 * eligible for defunct by default.
653 so
->so_flags
|= SOF_NODEFUNCT
;
670 * <pru_attach>:ENOBUFS[AF_UNIX]
671 * <pru_attach>:ENOBUFS[TCP]
672 * <pru_attach>:ENOMEM[TCP]
673 * <pru_attach>:??? [other protocol families, IPSEC]
676 socreate(int dom
, struct socket
**aso
, int type
, int proto
)
678 return (socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0,
683 socreate_delegate(int dom
, struct socket
**aso
, int type
, int proto
, pid_t epid
)
686 struct proc
*ep
= PROC_NULL
;
688 if ((proc_selfpid() != epid
) && ((ep
= proc_find(epid
)) == PROC_NULL
)) {
693 error
= socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0, ep
);
696 * It might not be wise to hold the proc reference when calling
697 * socreate_internal since it calls soalloc with M_WAITOK
708 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
709 * <pru_bind>:EAFNOSUPPORT Address family not supported
710 * <pru_bind>:EADDRNOTAVAIL Address not available.
711 * <pru_bind>:EINVAL Invalid argument
712 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
713 * <pru_bind>:EACCES Permission denied
714 * <pru_bind>:EADDRINUSE Address in use
715 * <pru_bind>:EAGAIN Resource unavailable, try again
716 * <pru_bind>:EPERM Operation not permitted
720 * Notes: It's not possible to fully enumerate the return codes above,
721 * since socket filter authors and protocol family authors may
722 * not choose to limit their error returns to those listed, even
723 * though this may result in some software operating incorrectly.
725 * The error codes which are enumerated above are those known to
726 * be returned by the tcp_usr_bind function supplied.
729 sobindlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
731 struct proc
*p
= current_proc();
736 VERIFY(so
->so_usecount
> 1);
738 so_update_last_owner_locked(so
, p
);
739 so_update_policy(so
);
742 * If this is a bind request on a socket that has been marked
743 * as inactive, reject it now before we go any further.
745 if (so
->so_flags
& SOF_DEFUNCT
) {
747 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
748 __func__
, proc_pid(p
), (uint64_t)VM_KERNEL_ADDRPERM(so
),
749 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
754 error
= sflt_bind(so
, nam
);
757 error
= (*so
->so_proto
->pr_usrreqs
->pru_bind
)(so
, nam
, p
);
760 socket_unlock(so
, 1);
762 if (error
== EJUSTRETURN
)
769 sodealloc(struct socket
*so
)
771 kauth_cred_unref(&so
->so_cred
);
773 /* Remove any filters */
776 /* Delete the state allocated for msg queues on a socket */
777 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
778 FREE(so
->so_msg_state
, M_TEMP
);
779 so
->so_msg_state
= NULL
;
781 VERIFY(so
->so_msg_state
== NULL
);
783 so
->so_gencnt
= ++so_gencnt
;
785 #if CONFIG_MACF_SOCKET
786 mac_socket_label_destroy(so
);
787 #endif /* MAC_SOCKET */
789 if (so
->cached_in_sock_layer
) {
790 cached_sock_free(so
);
792 FREE_ZONE(so
, sizeof (*so
), so
->so_zone
);
800 * <pru_listen>:EINVAL[AF_UNIX]
801 * <pru_listen>:EINVAL[TCP]
802 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
803 * <pru_listen>:EINVAL[TCP] Invalid argument
804 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
805 * <pru_listen>:EACCES[TCP] Permission denied
806 * <pru_listen>:EADDRINUSE[TCP] Address in use
807 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
808 * <pru_listen>:EPERM[TCP] Operation not permitted
811 * Notes: Other <pru_listen> returns depend on the protocol family; all
812 * <sf_listen> returns depend on what the filter author causes
813 * their filter to return.
816 solisten(struct socket
*so
, int backlog
)
818 struct proc
*p
= current_proc();
823 so_update_last_owner_locked(so
, p
);
824 so_update_policy(so
);
826 if (so
->so_proto
== NULL
) {
830 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) == 0) {
836 * If the listen request is made on a socket that is not fully
837 * disconnected, or on a socket that has been marked as inactive,
838 * reject the request now.
841 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) ||
842 (so
->so_flags
& SOF_DEFUNCT
)) {
844 if (so
->so_flags
& SOF_DEFUNCT
) {
845 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
846 "(%d)\n", __func__
, proc_pid(p
),
847 (uint64_t)VM_KERNEL_ADDRPERM(so
),
848 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
853 if ((so
->so_restrictions
& SO_RESTRICT_DENY_IN
) != 0) {
858 error
= sflt_listen(so
);
860 error
= (*so
->so_proto
->pr_usrreqs
->pru_listen
)(so
, p
);
863 if (error
== EJUSTRETURN
)
868 if (TAILQ_EMPTY(&so
->so_comp
))
869 so
->so_options
|= SO_ACCEPTCONN
;
871 * POSIX: The implementation may have an upper limit on the length of
872 * the listen queue-either global or per accepting socket. If backlog
873 * exceeds this limit, the length of the listen queue is set to the
876 * If listen() is called with a backlog argument value that is less
877 * than 0, the function behaves as if it had been called with a backlog
878 * argument value of 0.
880 * A backlog argument of 0 may allow the socket to accept connections,
881 * in which case the length of the listen queue may be set to an
882 * implementation-defined minimum value.
884 if (backlog
<= 0 || backlog
> somaxconn
)
887 so
->so_qlimit
= backlog
;
889 socket_unlock(so
, 1);
894 sofreelastref(struct socket
*so
, int dealloc
)
896 struct socket
*head
= so
->so_head
;
898 /* Assume socket is locked */
900 if (!(so
->so_flags
& SOF_PCBCLEARING
) || !(so
->so_state
& SS_NOFDREF
)) {
901 selthreadclear(&so
->so_snd
.sb_sel
);
902 selthreadclear(&so
->so_rcv
.sb_sel
);
903 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
904 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
909 socket_lock(head
, 1);
910 if (so
->so_state
& SS_INCOMP
) {
911 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
913 } else if (so
->so_state
& SS_COMP
) {
915 * We must not decommission a socket that's
916 * on the accept(2) queue. If we do, then
917 * accept(2) may hang after select(2) indicated
918 * that the listening socket was ready.
920 selthreadclear(&so
->so_snd
.sb_sel
);
921 selthreadclear(&so
->so_rcv
.sb_sel
);
922 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
923 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
925 socket_unlock(head
, 1);
928 panic("sofree: not queued");
931 so
->so_state
&= ~SS_INCOMP
;
933 socket_unlock(head
, 1);
939 if (so
->so_flags
& SOF_FLOW_DIVERT
) {
940 flow_divert_detach(so
);
942 #endif /* FLOW_DIVERT */
944 /* 3932268: disable upcall */
945 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
946 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
954 soclose_wait_locked(struct socket
*so
)
956 lck_mtx_t
*mutex_held
;
958 if (so
->so_proto
->pr_getlock
!= NULL
)
959 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
961 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
962 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
965 * Double check here and return if there's no outstanding upcall;
966 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
968 if (!so
->so_upcallusecount
|| !(so
->so_flags
& SOF_UPCALLCLOSEWAIT
))
970 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
971 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
972 so
->so_flags
|= SOF_CLOSEWAIT
;
973 (void) msleep((caddr_t
)&so
->so_upcallusecount
, mutex_held
, (PZERO
- 1),
974 "soclose_wait_locked", NULL
);
975 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
976 so
->so_flags
&= ~SOF_CLOSEWAIT
;
980 * Close a socket on last file table reference removal.
981 * Initiate disconnect if connected.
982 * Free socket when disconnect complete.
985 soclose_locked(struct socket
*so
)
988 lck_mtx_t
*mutex_held
;
991 if (so
->so_usecount
== 0) {
992 panic("soclose: so=%p refcount=0\n", so
);
996 sflt_notify(so
, sock_evt_closing
, NULL
);
998 if (so
->so_upcallusecount
)
999 soclose_wait_locked(so
);
1001 if ((so
->so_options
& SO_ACCEPTCONN
)) {
1002 struct socket
*sp
, *sonext
;
1006 * We do not want new connection to be added
1007 * to the connection queues
1009 so
->so_options
&= ~SO_ACCEPTCONN
;
1011 for (sp
= TAILQ_FIRST(&so
->so_incomp
);
1012 sp
!= NULL
; sp
= sonext
) {
1013 sonext
= TAILQ_NEXT(sp
, so_list
);
1017 * skip sockets thrown away by tcpdropdropblreq
1018 * they will get cleanup by the garbage collection.
1019 * otherwise, remove the incomp socket from the queue
1020 * and let soabort trigger the appropriate cleanup.
1022 if (sp
->so_flags
& SOF_OVERFLOW
)
1025 if (so
->so_proto
->pr_getlock
!= NULL
) {
1027 * Lock ordering for consistency with the
1028 * rest of the stack, we lock the socket
1029 * first and then grabb the head.
1031 socket_unlock(so
, 0);
1037 TAILQ_REMOVE(&so
->so_incomp
, sp
, so_list
);
1040 if (sp
->so_state
& SS_INCOMP
) {
1041 sp
->so_state
&= ~SS_INCOMP
;
1048 socket_unlock(sp
, 1);
1051 while ((sp
= TAILQ_FIRST(&so
->so_comp
)) != NULL
) {
1052 /* Dequeue from so_comp since sofree() won't do it */
1053 TAILQ_REMOVE(&so
->so_comp
, sp
, so_list
);
1056 if (so
->so_proto
->pr_getlock
!= NULL
) {
1057 socket_unlock(so
, 0);
1061 if (sp
->so_state
& SS_COMP
) {
1062 sp
->so_state
&= ~SS_COMP
;
1068 if (so
->so_proto
->pr_getlock
!= NULL
) {
1069 socket_unlock(sp
, 1);
1074 if (so
->so_pcb
== NULL
) {
1075 /* 3915887: mark the socket as ready for dealloc */
1076 so
->so_flags
|= SOF_PCBCLEARING
;
1079 if (so
->so_state
& SS_ISCONNECTED
) {
1080 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
1081 error
= sodisconnectlocked(so
);
1085 if (so
->so_options
& SO_LINGER
) {
1086 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
1087 (so
->so_state
& SS_NBIO
))
1089 if (so
->so_proto
->pr_getlock
!= NULL
)
1090 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1092 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1093 while (so
->so_state
& SS_ISCONNECTED
) {
1094 ts
.tv_sec
= (so
->so_linger
/100);
1095 ts
.tv_nsec
= (so
->so_linger
% 100) *
1096 NSEC_PER_USEC
* 1000 * 10;
1097 error
= msleep((caddr_t
)&so
->so_timeo
,
1098 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
1101 * It's OK when the time fires,
1102 * don't report an error
1104 if (error
== EWOULDBLOCK
)
1112 if (so
->so_usecount
== 0) {
1113 panic("soclose: usecount is zero so=%p\n", so
);
1116 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
1118 * Let NetworkStatistics know this PCB is going away
1119 * before we detach it.
1121 if (nstat_collect
&&
1122 (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
))
1123 nstat_pcb_detach(so
->so_pcb
);
1125 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
1129 if (so
->so_usecount
<= 0) {
1130 panic("soclose: usecount is zero so=%p\n", so
);
1134 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_MP_SUBFLOW
) &&
1135 (so
->so_state
& SS_NOFDREF
)) {
1136 panic("soclose: NOFDREF");
1139 so
->so_state
|= SS_NOFDREF
;
1141 if (so
->so_flags
& SOF_MP_SUBFLOW
)
1142 so
->so_flags
&= ~SOF_MP_SUBFLOW
;
1144 if ((so
->so_flags
& SOF_KNOTE
) != 0)
1145 KNOTE(&so
->so_klist
, SO_FILT_HINT_LOCKED
);
1147 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, -1);
1156 soclose(struct socket
*so
)
1161 if (so
->so_retaincnt
== 0) {
1162 error
= soclose_locked(so
);
1165 * if the FD is going away, but socket is
1166 * retained in kernel remove its reference
1169 if (so
->so_usecount
< 2)
1170 panic("soclose: retaincnt non null and so=%p "
1171 "usecount=%d\n", so
, so
->so_usecount
);
1173 socket_unlock(so
, 1);
1178 * Must be called at splnet...
1180 /* Should already be locked */
1182 soabort(struct socket
*so
)
1186 #ifdef MORE_LOCKING_DEBUG
1187 lck_mtx_t
*mutex_held
;
1189 if (so
->so_proto
->pr_getlock
!= NULL
)
1190 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1192 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1193 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1196 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1197 so
->so_flags
|= SOF_ABORTED
;
1198 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
1208 soacceptlock(struct socket
*so
, struct sockaddr
**nam
, int dolock
)
1215 so_update_last_owner_locked(so
, PROC_NULL
);
1216 so_update_policy(so
);
1218 if ((so
->so_state
& SS_NOFDREF
) == 0)
1219 panic("soaccept: !NOFDREF");
1220 so
->so_state
&= ~SS_NOFDREF
;
1221 error
= (*so
->so_proto
->pr_usrreqs
->pru_accept
)(so
, nam
);
1224 socket_unlock(so
, 1);
1229 soaccept(struct socket
*so
, struct sockaddr
**nam
)
1231 return (soacceptlock(so
, nam
, 1));
1235 soacceptfilter(struct socket
*so
)
1237 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1239 struct socket
*head
= so
->so_head
;
1242 * Hold the lock even if this socket has not been made visible
1243 * to the filter(s). For sockets with global locks, this protects
1244 * against the head or peer going away
1247 if (sogetaddr_locked(so
, &remote
, 1) != 0 ||
1248 sogetaddr_locked(so
, &local
, 0) != 0) {
1249 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1251 socket_unlock(so
, 1);
1253 /* Out of resources; try it again next time */
1254 error
= ECONNABORTED
;
1258 error
= sflt_accept(head
, so
, local
, remote
);
1261 * If we get EJUSTRETURN from one of the filters, mark this socket
1262 * as inactive and return it anyway. This newly accepted socket
1263 * will be disconnected later before we hand it off to the caller.
1265 if (error
== EJUSTRETURN
) {
1267 (void) sosetdefunct(current_proc(), so
,
1268 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
1273 * This may seem like a duplication to the above error
1274 * handling part when we return ECONNABORTED, except
1275 * the following is done while holding the lock since
1276 * the socket has been exposed to the filter(s) earlier.
1278 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1280 socket_unlock(so
, 1);
1282 /* Propagate socket filter's error code to the caller */
1284 socket_unlock(so
, 1);
1287 /* Callee checks for NULL pointer */
1288 sock_freeaddr(remote
);
1289 sock_freeaddr(local
);
1294 * Returns: 0 Success
1295 * EOPNOTSUPP Operation not supported on socket
1296 * EISCONN Socket is connected
1297 * <pru_connect>:EADDRNOTAVAIL Address not available.
1298 * <pru_connect>:EINVAL Invalid argument
1299 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1300 * <pru_connect>:EACCES Permission denied
1301 * <pru_connect>:EADDRINUSE Address in use
1302 * <pru_connect>:EAGAIN Resource unavailable, try again
1303 * <pru_connect>:EPERM Operation not permitted
1304 * <sf_connect_out>:??? [anything a filter writer might set]
1307 soconnectlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
1310 struct proc
*p
= current_proc();
1315 so_update_last_owner_locked(so
, p
);
1316 so_update_policy(so
);
1319 * If this is a listening socket or if this is a previously-accepted
1320 * socket that has been marked as inactive, reject the connect request.
1322 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1324 if (so
->so_flags
& SOF_DEFUNCT
) {
1325 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1326 "(%d)\n", __func__
, proc_pid(p
),
1327 (uint64_t)VM_KERNEL_ADDRPERM(so
),
1328 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1331 socket_unlock(so
, 1);
1335 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0) {
1337 socket_unlock(so
, 1);
1342 * If protocol is connection-based, can only connect once.
1343 * Otherwise, if connected, try to disconnect first.
1344 * This allows user to disconnect by connecting to, e.g.,
1347 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
1348 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1349 (error
= sodisconnectlocked(so
)))) {
1353 * Run connect filter before calling protocol:
1354 * - non-blocking connect returns before completion;
1356 error
= sflt_connectout(so
, nam
);
1358 if (error
== EJUSTRETURN
)
1361 error
= (*so
->so_proto
->pr_usrreqs
->pru_connect
)
1366 socket_unlock(so
, 1);
1371 soconnect(struct socket
*so
, struct sockaddr
*nam
)
1373 return (soconnectlock(so
, nam
, 1));
1377 * Returns: 0 Success
1378 * <pru_connect2>:EINVAL[AF_UNIX]
1379 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1380 * <pru_connect2>:??? [other protocol families]
1382 * Notes: <pru_connect2> is not supported by [TCP].
1385 soconnect2(struct socket
*so1
, struct socket
*so2
)
1389 socket_lock(so1
, 1);
1390 if (so2
->so_proto
->pr_lock
)
1391 socket_lock(so2
, 1);
1393 error
= (*so1
->so_proto
->pr_usrreqs
->pru_connect2
)(so1
, so2
);
1395 socket_unlock(so1
, 1);
1396 if (so2
->so_proto
->pr_lock
)
1397 socket_unlock(so2
, 1);
1402 soconnectxlocked(struct socket
*so
, struct sockaddr_list
**src_sl
,
1403 struct sockaddr_list
**dst_sl
, struct proc
*p
, uint32_t ifscope
,
1404 associd_t aid
, connid_t
*pcid
, uint32_t flags
, void *arg
,
1410 * If this is a listening socket or if this is a previously-accepted
1411 * socket that has been marked as inactive, reject the connect request.
1413 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1415 if (so
->so_flags
& SOF_DEFUNCT
) {
1416 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1417 "(%d)\n", __func__
, proc_pid(p
),
1418 (uint64_t)VM_KERNEL_ADDRPERM(so
),
1419 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1424 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0)
1428 * If protocol is connection-based, can only connect once
1429 * unless PR_MULTICONN is set. Otherwise, if connected,
1430 * try to disconnect first. This allows user to disconnect
1431 * by connecting to, e.g., a null address.
1433 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) &&
1434 !(so
->so_proto
->pr_flags
& PR_MULTICONN
) &&
1435 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1436 (error
= sodisconnectlocked(so
)) != 0)) {
1440 * Run connect filter before calling protocol:
1441 * - non-blocking connect returns before completion;
1443 error
= sflt_connectxout(so
, dst_sl
);
1445 if (error
== EJUSTRETURN
)
1448 error
= (*so
->so_proto
->pr_usrreqs
->pru_connectx
)
1449 (so
, src_sl
, dst_sl
, p
, ifscope
, aid
, pcid
,
1450 flags
, arg
, arglen
);
1458 sodisconnectlocked(struct socket
*so
)
1462 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1466 if (so
->so_state
& SS_ISDISCONNECTING
) {
1471 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1473 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1479 /* Locking version */
1481 sodisconnect(struct socket
*so
)
1486 error
= sodisconnectlocked(so
);
1487 socket_unlock(so
, 1);
1492 sodisconnectxlocked(struct socket
*so
, associd_t aid
, connid_t cid
)
1497 * Call the protocol disconnectx handler; let it handle all
1498 * matters related to the connection state of this session.
1500 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnectx
)(so
, aid
, cid
);
1503 * The event applies only for the session, not for
1504 * the disconnection of individual subflows.
1506 if (so
->so_state
& (SS_ISDISCONNECTING
|SS_ISDISCONNECTED
))
1507 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1513 sodisconnectx(struct socket
*so
, associd_t aid
, connid_t cid
)
1518 error
= sodisconnectxlocked(so
, aid
, cid
);
1519 socket_unlock(so
, 1);
1524 sopeelofflocked(struct socket
*so
, associd_t aid
, struct socket
**psop
)
1526 return ((*so
->so_proto
->pr_usrreqs
->pru_peeloff
)(so
, aid
, psop
));
1529 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1532 * sosendcheck will lock the socket buffer if it isn't locked and
1533 * verify that there is space for the data being inserted.
1535 * Returns: 0 Success
1537 * sblock:EWOULDBLOCK
1544 sosendcheck(struct socket
*so
, struct sockaddr
*addr
, user_ssize_t resid
,
1545 int32_t clen
, int32_t atomic
, int flags
, int *sblocked
,
1546 struct mbuf
*control
)
1553 if (*sblocked
== 0) {
1554 if ((so
->so_snd
.sb_flags
& SB_LOCK
) != 0 &&
1555 so
->so_send_filt_thread
!= 0 &&
1556 so
->so_send_filt_thread
== current_thread()) {
1558 * We're being called recursively from a filter,
1559 * allow this to continue. Radar 4150520.
1560 * Don't set sblocked because we don't want
1561 * to perform an unlock later.
1565 error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
));
1567 if (so
->so_flags
& SOF_DEFUNCT
)
1576 * If a send attempt is made on a socket that has been marked
1577 * as inactive (disconnected), reject the request.
1579 if (so
->so_flags
& SOF_DEFUNCT
) {
1582 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1583 __func__
, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so
),
1584 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1588 if (so
->so_state
& SS_CANTSENDMORE
)
1592 error
= so
->so_error
;
1597 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1598 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) != 0) {
1599 if ((so
->so_state
& SS_ISCONFIRMING
) == 0 &&
1600 !(resid
== 0 && clen
!= 0))
1602 } else if (addr
== 0 && !(flags
&MSG_HOLD
)) {
1603 return ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ?
1604 ENOTCONN
: EDESTADDRREQ
);
1607 if (so
->so_flags
& SOF_ENABLE_MSGS
)
1608 space
= msgq_sbspace(so
, control
);
1610 space
= sbspace(&so
->so_snd
);
1612 if (flags
& MSG_OOB
)
1614 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
1615 clen
> so
->so_snd
.sb_hiwat
)
1618 if ((space
< resid
+ clen
&&
1619 (atomic
|| space
< (int32_t)so
->so_snd
.sb_lowat
|| space
< clen
)) ||
1620 (so
->so_type
== SOCK_STREAM
&& so_wait_for_if_feedback(so
))) {
1621 if ((so
->so_state
& SS_NBIO
) || (flags
& MSG_NBIO
) ||
1623 return (EWOULDBLOCK
);
1625 sbunlock(&so
->so_snd
, TRUE
); /* keep socket locked */
1627 error
= sbwait(&so
->so_snd
);
1629 if (so
->so_flags
& SOF_DEFUNCT
)
1640 * If send must go all at once and message is larger than
1641 * send buffering, then hard error.
1642 * Lock against other senders.
1643 * If must go all at once and not enough room now, then
1644 * inform user that this would block and do nothing.
1645 * Otherwise, if nonblocking, send as much as possible.
1646 * The data to be sent is described by "uio" if nonzero,
1647 * otherwise by the mbuf chain "top" (which must be null
1648 * if uio is not). Data provided in mbuf chain must be small
1649 * enough to send all at once.
1651 * Returns nonzero on error, timeout or signal; callers
1652 * must check for short counts if EINTR/ERESTART are returned.
1653 * Data and control buffers are freed on return.
1655 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1656 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1657 * point at the mbuf chain being constructed and go from there.
1659 * Returns: 0 Success
1665 * sosendcheck:EWOULDBLOCK
1669 * sosendcheck:??? [value from so_error]
1670 * <pru_send>:ECONNRESET[TCP]
1671 * <pru_send>:EINVAL[TCP]
1672 * <pru_send>:ENOBUFS[TCP]
1673 * <pru_send>:EADDRINUSE[TCP]
1674 * <pru_send>:EADDRNOTAVAIL[TCP]
1675 * <pru_send>:EAFNOSUPPORT[TCP]
1676 * <pru_send>:EACCES[TCP]
1677 * <pru_send>:EAGAIN[TCP]
1678 * <pru_send>:EPERM[TCP]
1679 * <pru_send>:EMSGSIZE[TCP]
1680 * <pru_send>:EHOSTUNREACH[TCP]
1681 * <pru_send>:ENETUNREACH[TCP]
1682 * <pru_send>:ENETDOWN[TCP]
1683 * <pru_send>:ENOMEM[TCP]
1684 * <pru_send>:ENOBUFS[TCP]
1685 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1686 * <pru_send>:EINVAL[AF_UNIX]
1687 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1688 * <pru_send>:EPIPE[AF_UNIX]
1689 * <pru_send>:ENOTCONN[AF_UNIX]
1690 * <pru_send>:EISCONN[AF_UNIX]
1691 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1692 * <sf_data_out>:??? [whatever a filter author chooses]
1694 * Notes: Other <pru_send> returns depend on the protocol family; all
1695 * <sf_data_out> returns depend on what the filter author causes
1696 * their filter to return.
1699 sosend(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
1700 struct mbuf
*top
, struct mbuf
*control
, int flags
)
1703 struct mbuf
*m
, *freelist
= NULL
;
1704 user_ssize_t space
, len
, resid
;
1705 int clen
= 0, error
, dontroute
, mlen
, sendflags
;
1706 int atomic
= sosendallatonce(so
) || top
;
1708 struct proc
*p
= current_proc();
1709 struct mbuf
*control_copy
= NULL
;
1712 resid
= uio_resid(uio
);
1714 resid
= top
->m_pkthdr
.len
;
1716 KERNEL_DEBUG((DBG_FNC_SOSEND
| DBG_FUNC_START
), so
, resid
,
1717 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
1720 so_update_last_owner_locked(so
, p
);
1721 so_update_policy(so
);
1723 if (so
->so_type
!= SOCK_STREAM
&& (flags
& MSG_OOB
) != 0) {
1725 socket_unlock(so
, 1);
1730 * In theory resid should be unsigned.
1731 * However, space must be signed, as it might be less than 0
1732 * if we over-committed, and we must use a signed comparison
1733 * of space and resid. On the other hand, a negative resid
1734 * causes us to loop sending 0-length segments to the protocol.
1736 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1737 * But it will be used by sockets doing message delivery.
1739 * Note: We limit resid to be a positive 32 bits value as we use
1740 * imin() to set bytes_to_copy -- radr://14558484
1742 if ((int32_t)resid
< 0 || (so
->so_type
== SOCK_STREAM
&&
1743 !(so
->so_flags
& SOF_ENABLE_MSGS
) && (flags
& MSG_EOR
))) {
1745 socket_unlock(so
, 1);
1749 dontroute
= (flags
& MSG_DONTROUTE
) &&
1750 (so
->so_options
& SO_DONTROUTE
) == 0 &&
1751 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
1752 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
1754 if (control
!= NULL
)
1755 clen
= control
->m_len
;
1758 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
1759 &sblocked
, control
);
1764 if (so
->so_flags
& SOF_ENABLE_MSGS
)
1765 space
= msgq_sbspace(so
, control
);
1767 space
= sbspace(&so
->so_snd
) - clen
;
1768 space
+= ((flags
& MSG_OOB
) ? 1024 : 0);
1773 * Data is prepackaged in "top".
1776 if (flags
& MSG_EOR
)
1777 top
->m_flags
|= M_EOR
;
1783 bytes_to_copy
= imin(resid
, space
);
1785 if (sosendminchain
> 0)
1788 chainlength
= sosendmaxchain
;
1791 * Attempt to use larger than system page-size
1792 * clusters for large writes only if there is
1793 * a jumbo cluster pool and if the socket is
1794 * marked accordingly.
1796 jumbocl
= sosendjcl
&& njcl
> 0 &&
1797 ((so
->so_flags
& SOF_MULTIPAGES
) ||
1798 sosendjcl_ignore_capab
);
1800 socket_unlock(so
, 0);
1804 int hdrs_needed
= (top
== NULL
) ? 1 : 0;
1807 * try to maintain a local cache of mbuf
1808 * clusters needed to complete this
1809 * write the list is further limited to
1810 * the number that are currently needed
1811 * to fill the socket this mechanism
1812 * allows a large number of mbufs/
1813 * clusters to be grabbed under a single
1814 * mbuf lock... if we can't get any
1815 * clusters, than fall back to trying
1816 * for mbufs if we fail early (or
1817 * miscalcluate the number needed) make
1818 * sure to release any clusters we
1819 * haven't yet consumed.
1821 if (freelist
== NULL
&&
1822 bytes_to_copy
> MBIGCLBYTES
&&
1825 bytes_to_copy
/ M16KCLBYTES
;
1827 if ((bytes_to_copy
-
1828 (num_needed
* M16KCLBYTES
))
1833 m_getpackets_internal(
1834 (unsigned int *)&num_needed
,
1835 hdrs_needed
, M_WAIT
, 0,
1838 * Fall back to 4K cluster size
1839 * if allocation failed
1843 if (freelist
== NULL
&&
1844 bytes_to_copy
> MCLBYTES
) {
1846 bytes_to_copy
/ MBIGCLBYTES
;
1848 if ((bytes_to_copy
-
1849 (num_needed
* MBIGCLBYTES
)) >=
1854 m_getpackets_internal(
1855 (unsigned int *)&num_needed
,
1856 hdrs_needed
, M_WAIT
, 0,
1859 * Fall back to cluster size
1860 * if allocation failed
1864 if (freelist
== NULL
&&
1865 bytes_to_copy
> MINCLSIZE
) {
1867 bytes_to_copy
/ MCLBYTES
;
1869 if ((bytes_to_copy
-
1870 (num_needed
* MCLBYTES
)) >=
1875 m_getpackets_internal(
1876 (unsigned int *)&num_needed
,
1877 hdrs_needed
, M_WAIT
, 0,
1880 * Fall back to a single mbuf
1881 * if allocation failed
1885 if (freelist
== NULL
) {
1893 if (freelist
== NULL
) {
1899 * For datagram protocols,
1900 * leave room for protocol
1901 * headers in first mbuf.
1903 if (atomic
&& top
== NULL
&&
1904 bytes_to_copy
< MHLEN
) {
1910 freelist
= m
->m_next
;
1913 if ((m
->m_flags
& M_EXT
))
1914 mlen
= m
->m_ext
.ext_size
;
1915 else if ((m
->m_flags
& M_PKTHDR
))
1917 MHLEN
- m_leadingspace(m
);
1920 len
= imin(mlen
, bytes_to_copy
);
1926 error
= uiomove(mtod(m
, caddr_t
),
1929 resid
= uio_resid(uio
);
1933 top
->m_pkthdr
.len
+= len
;
1938 if (flags
& MSG_EOR
)
1939 top
->m_flags
|= M_EOR
;
1942 bytes_to_copy
= min(resid
, space
);
1944 } while (space
> 0 &&
1945 (chainlength
< sosendmaxchain
|| atomic
||
1946 resid
< MINCLSIZE
));
1954 if (flags
& (MSG_HOLD
|MSG_SEND
)) {
1955 /* Enqueue for later, go away if HOLD */
1957 if (so
->so_temp
&& (flags
& MSG_FLUSH
)) {
1958 m_freem(so
->so_temp
);
1962 so
->so_tail
->m_next
= top
;
1969 if (flags
& MSG_HOLD
) {
1976 so
->so_options
|= SO_DONTROUTE
;
1978 /* Compute flags here, for pru_send and NKEs */
1979 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
1981 * If the user set MSG_EOF, the protocol
1982 * understands this flag and nothing left to
1983 * send then use PRU_SEND_EOF instead of PRU_SEND.
1985 ((flags
& MSG_EOF
) &&
1986 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
1987 (resid
<= 0)) ? PRUS_EOF
:
1988 /* If there is more to send set PRUS_MORETOCOME */
1989 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
1992 * Socket filter processing
1994 error
= sflt_data_out(so
, addr
, &top
,
1995 &control
, (sendflags
& MSG_OOB
) ?
1996 sock_data_filt_flag_oob
: 0);
1998 if (error
== EJUSTRETURN
) {
2008 * End Socket filter processing
2011 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
2013 * Make a copy of control mbuf,
2014 * so that msg priority can be
2015 * passed to subsequent mbufs.
2017 control_copy
= m_dup(control
, M_NOWAIT
);
2019 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
2020 (so
, sendflags
, top
, addr
, control
, p
);
2022 if (flags
& MSG_SEND
)
2026 so
->so_options
&= ~SO_DONTROUTE
;
2029 control
= control_copy
;
2030 control_copy
= NULL
;
2035 } while (resid
&& space
> 0);
2040 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2042 socket_unlock(so
, 1);
2046 if (control
!= NULL
)
2048 if (freelist
!= NULL
)
2049 m_freem_list(freelist
);
2050 if (control_copy
!= NULL
)
2051 m_freem(control_copy
);
2053 KERNEL_DEBUG(DBG_FNC_SOSEND
| DBG_FUNC_END
, so
, resid
, so
->so_snd
.sb_cc
,
2060 * Implement receive operations on a socket.
2061 * We depend on the way that records are added to the sockbuf
2062 * by sbappend*. In particular, each record (mbufs linked through m_next)
2063 * must begin with an address if the protocol so specifies,
2064 * followed by an optional mbuf or mbufs containing ancillary data,
2065 * and then zero or more mbufs of data.
2066 * In order to avoid blocking network interrupts for the entire time here,
2067 * we splx() while doing the actual copy to user space.
2068 * Although the sockbuf is locked, new data may still be appended,
2069 * and thus we must maintain consistency of the sockbuf during that time.
2071 * The caller may receive the data as a single mbuf chain by supplying
2072 * an mbuf **mp0 for use in returning the chain. The uio is then used
2073 * only for the count in uio_resid.
2075 * Returns: 0 Success
2080 * sblock:EWOULDBLOCK
2084 * sodelayed_copy:EFAULT
2085 * <pru_rcvoob>:EINVAL[TCP]
2086 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2088 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2089 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2090 * <pr_domain->dom_externalize>:???
2092 * Notes: Additional return values from calls through <pru_rcvoob> and
2093 * <pr_domain->dom_externalize> depend on protocols other than
2094 * TCP or AF_UNIX, which are documented above.
2097 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
2098 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
2100 struct mbuf
*m
, **mp
, *ml
= NULL
;
2101 struct mbuf
*nextrecord
, *free_list
;
2102 int flags
, error
, offset
;
2104 struct protosw
*pr
= so
->so_proto
;
2106 user_ssize_t orig_resid
= uio_resid(uio
);
2107 user_ssize_t delayed_copy_len
;
2110 struct proc
*p
= current_proc();
2112 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
, uio_resid(uio
),
2113 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
2116 so_update_last_owner_locked(so
, p
);
2117 so_update_policy(so
);
2119 #ifdef MORE_LOCKING_DEBUG
2120 if (so
->so_usecount
== 1) {
2121 panic("%s: so=%x no other reference on socket\n", __func__
, so
);
2128 if (controlp
!= NULL
)
2131 flags
= *flagsp
&~ MSG_EOR
;
2136 * If a recv attempt is made on a previously-accepted socket
2137 * that has been marked as inactive (disconnected), reject
2140 if (so
->so_flags
& SOF_DEFUNCT
) {
2141 struct sockbuf
*sb
= &so
->so_rcv
;
2144 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2145 __func__
, proc_pid(p
), (uint64_t)VM_KERNEL_ADDRPERM(so
),
2146 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
2148 * This socket should have been disconnected and flushed
2149 * prior to being returned from sodefunct(); there should
2150 * be no data on its receive list, so panic otherwise.
2152 if (so
->so_state
& SS_DEFUNCT
)
2153 sb_empty_assert(sb
, __func__
);
2154 socket_unlock(so
, 1);
2159 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2160 * regardless of the flags argument. Here is the case were
2161 * out-of-band data is not inline.
2163 if ((flags
& MSG_OOB
) ||
2164 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
2165 (so
->so_options
& SO_OOBINLINE
) == 0 &&
2166 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
2167 m
= m_get(M_WAIT
, MT_DATA
);
2169 socket_unlock(so
, 1);
2170 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
2171 ENOBUFS
, 0, 0, 0, 0);
2174 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
2177 socket_unlock(so
, 0);
2179 error
= uiomove(mtod(m
, caddr_t
),
2180 imin(uio_resid(uio
), m
->m_len
), uio
);
2182 } while (uio_resid(uio
) && error
== 0 && m
!= NULL
);
2188 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
2189 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
2191 * Let's try to get normal data:
2192 * EWOULDBLOCK: out-of-band data not
2193 * receive yet. EINVAL: out-of-band data
2198 } else if (error
== 0 && flagsp
!= NULL
) {
2202 socket_unlock(so
, 1);
2203 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2211 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
))
2212 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
2215 delayed_copy_len
= 0;
2217 #ifdef MORE_LOCKING_DEBUG
2218 if (so
->so_usecount
<= 1)
2219 printf("soreceive: sblock so=%p ref=%d on socket\n",
2220 so
, so
->so_usecount
);
2223 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2224 * and if so just return to the caller. This could happen when
2225 * soreceive() is called by a socket upcall function during the
2226 * time the socket is freed. The socket buffer would have been
2227 * locked across the upcall, therefore we cannot put this thread
2228 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2229 * we may livelock), because the lock on the socket buffer will
2230 * only be released when the upcall routine returns to its caller.
2231 * Because the socket has been officially closed, there can be
2232 * no further read on it.
2234 * A multipath subflow socket would have its SS_NOFDREF set by
2235 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2236 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2238 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
2239 (SS_NOFDREF
| SS_CANTRCVMORE
) && !(so
->so_flags
& SOF_MP_SUBFLOW
)) {
2240 socket_unlock(so
, 1);
2244 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
2246 socket_unlock(so
, 1);
2247 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2252 m
= so
->so_rcv
.sb_mb
;
2254 * If we have less data than requested, block awaiting more
2255 * (subject to any timeout) if:
2256 * 1. the current count is less than the low water mark, or
2257 * 2. MSG_WAITALL is set, and it is possible to do the entire
2258 * receive operation at once if we block (resid <= hiwat).
2259 * 3. MSG_DONTWAIT is not set
2260 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2261 * we have to do the receive in sections, and thus risk returning
2262 * a short count if a timeout or signal occurs after we start.
2264 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
2265 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
2266 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
2267 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
2268 m
->m_nextpkt
== NULL
&& (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
		if (so->so_state & SS_CANTRCVMORE) {
			for (; m != NULL; m = m->m_next)
				if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
			m = so->so_rcv.sb_mb;
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		if (uio_resid(uio) == 0)
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
			printf("Waiting for socket data\n");
		error = sbwait(&so->so_rcv);
#if EVEN_MORE_LOCKING_DEBUG
			printf("SORECEIVE - sbwait returned %d\n", error);
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
			    __func__, so, so->so_usecount);
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
		/*
		 * Call the MAC framework for policy checking if we're in
		 * the user process context and the socket isn't connected.
		 */
		if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
			struct mbuf *m0 = m;
			/*
			 * Dequeue this record (temporarily) from the receive
			 * list since we're about to drop the socket's lock
			 * where a new record may arrive and be appended to
			 * the list.  Upon MAC policy failure, the record
			 * will be freed.  Otherwise, we'll add it back to
			 * the head of the list.  We cannot rely on SB_LOCK
			 * because the append operation uses the socket's lock.
			 */
				m->m_nextpkt = NULL;
				sbfree(&so->so_rcv, m);
			} while (m != NULL);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
			socket_unlock(so, 0);
			if (mac_socket_check_received(proc_ucred(p), so,
			    mtod(m, struct sockaddr *)) != 0) {
				/*
				 * MAC policy failure; free this record and
				 * process the next record (or block until
				 * one is available).  We have adjusted sb_cc
				 * and sb_mbcnt above so there is no need to
				 * call sbfree() again.
				 */
				} while (m != NULL);
				/*
				 * Clear SB_LOCK but don't unlock the socket.
				 * Process the next record or wait for one.
				 */
				sbunlock(&so->so_rcv, TRUE);	/* stay locked */
			/*
			 * If the socket has been defunct'd, drop it.
			 */
			if (so->so_flags & SOF_DEFUNCT) {
			/*
			 * Re-adjust the socket receive list and re-enqueue
			 * the record in front of any packets which may have
			 * been appended while we dropped the lock.
			 */
			for (m = m0; m->m_next != NULL; m = m->m_next)
				sballoc(&so->so_rcv, m);
			sballoc(&so->so_rcv, m);
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_lastrecord = m0;
				so->so_rcv.sb_mbtail = m;
			nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
			so->so_rcv.sb_mb = m;
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
#endif /* CONFIG_MACF_SOCKET_SUBSET */
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
			if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
				error = EWOULDBLOCK;
		if (flags & MSG_PEEK) {
			sbfree(&so->so_rcv, m);
			if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
				panic("%s: about to create invalid socketbuf",
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
				m->m_nextpkt = nextrecord;
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;
		struct sockbuf *sb_rcv = &so->so_rcv;
		struct mbuf **msgpcm = NULL;

		/*
		 * Externalizing the control messages would require us to
		 * drop the socket's lock below.  Once we re-acquire the
		 * lock, the mbuf chain might change.  In order to preserve
		 * consistency, we unlink all control messages from the
		 * first mbuf chain in one shot and link them separately
		 * onto a different chain.
		 */
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					if (*controlp == NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					/*
					 * If we failed to allocate an mbuf,
					 * release any previously allocated
					 * mbufs for control data.  Return
					 * an error.  Keep the mbufs in the
					 * socket as this is using
					 */
					if (*controlp == NULL) {
					controlp = &(*controlp)->m_next;
				m->m_nextpkt = NULL;
				sb_rcv->sb_mb = m->m_next;
				cme = &(*cme)->m_next;
		} while (m != NULL && m->m_type == MT_CONTROL);

		if (!(flags & MSG_PEEK)) {
			if (sb_rcv->sb_mb != NULL) {
				sb_rcv->sb_mb->m_nextpkt = nextrecord;
				sb_rcv->sb_mb = nextrecord;
				SB_EMPTY_FIXUP(sb_rcv);
			if (nextrecord == NULL)
				sb_rcv->sb_lastrecord = m;

		SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

		while (cm != NULL) {
			cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
			/*
			 * Call the protocol to externalize SCM_RIGHTS message
			 * and return the modified message to the caller upon
			 * success.  Otherwise, all other control messages are
			 * returned unmodified to the caller.  Note that we
			 * only get into this loop if MSG_PEEK is not set.
			 */
			if (pr->pr_domain->dom_externalize != NULL &&
			    cmsg_type == SCM_RIGHTS) {
				/*
				 * Release socket lock: see 3903171.  This
				 * would also allow more records to be appended
				 * to the socket buffer.  We still have SB_LOCK
				 * set on it, so we can be sure that the head
				 * of the mbuf chain won't change.
				 */
				socket_unlock(so, 0);
				error = (*pr->pr_domain->dom_externalize)(cm);
			if (controlp != NULL && error == 0) {
				controlp = &(*controlp)->m_next;

		/*
		 * Update the value of nextrecord in case we received new
		 * records when the socket was unlocked above for
		 * externalizing SCM_RIGHTS.
		 */
			nextrecord = sb_rcv->sb_mb->m_nextpkt;
			nextrecord = sb_rcv->sb_mb;

	/*
	 * If the socket is a TCP socket with message delivery
	 * enabled, then create a control msg to deliver the
	 * relative TCP sequence number for this data.  Waiting
	 * until this point will protect against failures to
	 * allocate an mbuf for control msgs.
	 */
	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
	    (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
		struct mbuf *seq_cm;

		seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
		    sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
		if (seq_cm == NULL) {
			/* unable to allocate a control mbuf */
		controlp = &seq_cm->m_next;

	if (!(flags & MSG_PEEK)) {
		/*
		 * We get here because m points to an mbuf following
		 * any MT_SONAME or MT_CONTROL mbufs which have been
		 * processed above.  In any case, m should be pointing
		 * to the head of the mbuf chain, and the nextrecord
		 * should be either NULL or equal to m->m_nextpkt.
		 * See comments above about SB_LOCK.
		 */
		if (m != so->so_rcv.sb_mb ||
		    m->m_nextpkt != nextrecord) {
			panic("%s: post-control !sync so=%p m=%p "
			    "nextrecord=%p\n", __func__, so, m,
		if (nextrecord == NULL)
			so->so_rcv.sb_lastrecord = m;

	if (type == MT_OOBDATA)

		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)

	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
		} else if (type == MT_OOBDATA) {
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 */
				delayed_copy_len += len;
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);
					/*
					 * can only get here if MSG_PEEK is not
					 * set; therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy.  perhaps
					 * a RST on the stream.  in any event,
					 * the stream has been interrupted.  it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 */
					if (m != so->so_rcv.sb_mb) {
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
			uio_setresid(uio, (uio_resid(uio) - len));
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
			if (flags & MSG_PEEK) {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;
				/*
				 * If this packet is an unordered packet
				 * (indicated by M_UNORDERED_DATA flag), remove
				 * the additional bytes added to the
				 * receive socket buffer size.
				 */
				if ((so->so_flags & SOF_ENABLE_MSGS) &&
				    (m->m_flags & M_UNORDERED_DATA) &&
				    sbreserve(&so->so_rcv,
				    so->so_rcv.sb_hiwat - m->m_len)) {
					if (so->so_msg_state->msg_uno_bytes >
						    msg_uno_bytes -= m->m_len;
					m->m_flags &= ~M_UNORDERED_DATA;
					so->so_rcv.sb_mb = m = m->m_next;
					if (free_list == NULL)
					so->so_rcv.sb_mb = m = m->m_next;
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			if (flags & MSG_PEEK) {
				if (flags & MSG_DONTWAIT)
					copy_flag = M_DONTWAIT;
				*mp = m_copym(m, 0, len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					    (uio_resid(uio) + len));
				so->so_rcv.sb_cc -= len;
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					/*
					 * delay posting the actual event until
					 * after any delayed copy processing
					 */
				if (offset == so->so_oobmark)
		if (flags & MSG_EOR)
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
			    (((struct inpcb *)so->so_pcb)->inp_state !=
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);
			m = so->so_rcv.sb_mb;
				nextrecord = m->m_nextpkt;
			SB_MB_CHECK(&so->so_rcv);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket\n",
		    __func__, so, so->so_usecount);

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
			if ((flags & MSG_PEEK) == 0)
				(void) sbdroprecord(&so->so_rcv);
	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;

	if ((flags & MSG_PEEK) == 0) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			SB_MB_CHECK(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	if (free_list != NULL) {
		m_freem_list(free_list);
		postevent(so, 0, EV_OOB);

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket\n", __func__,
		    so, so->so_usecount);
	if (delayed_copy_len)
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	if (free_list != NULL)
		m_freem_list(free_list);

	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);
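
/*
 * Illustrative sketch (not part of the original source): how the
 * MSG_WAITALL / MSG_DONTWAIT semantics described in the comments above
 * look from a user-space caller's perspective.  The descriptor "fd" is
 * assumed to be a connected stream socket; the helper names are made up
 * for this example.
 */
#if 0	/* user-space example only */
#include <sys/socket.h>
#include <errno.h>

static ssize_t
read_exact(int fd, void *buf, size_t len)
{
	/*
	 * MSG_WAITALL asks soreceive() not to return until the full
	 * request is satisfied (or EOF/error occurs); a signal or
	 * timeout may still yield a short count, as noted above.
	 */
	return (recv(fd, buf, len, MSG_WAITALL));
}

static ssize_t
read_nonblocking(int fd, void *buf, size_t len)
{
	/* MSG_DONTWAIT makes this single call behave as non-blocking. */
	ssize_t n = recv(fd, buf, len, MSG_DONTWAIT);

	if (n == -1 && (errno == EWOULDBLOCK || errno == EAGAIN))
		n = 0;	/* no data ready right now */
	return (n);
}
#endif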
/*
 * Returns:	0			Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)

	socket_unlock(so, 0);

	while (m != NULL && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
	m_freem_list(*free_list);
/*
 * Returns:	0			Success
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???		[other protocol families]
 */
soshutdown(struct socket *so, int how)

	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
		error = soshutdownlock(so, how);
	socket_unlock(so, 1);

soshutdownlock(struct socket *so, int how)
	struct protosw *pr = so->so_proto;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
		postevent(so, 0, EV_RCLOSED);
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
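
/*
 * Illustrative sketch (not part of the original source): the SHUT_RD /
 * SHUT_WR handling in soshutdownlock() above corresponds directly to the
 * user-space shutdown(2) interface.  "fd" is assumed to be a connected
 * socket descriptor.
 */
#if 0	/* user-space example only */
#include <sys/socket.h>

static int
half_close_for_write(int fd)
{
	/*
	 * Stops further sends (SS_CANTSENDMORE ends up set via
	 * pru_shutdown); the read side remains usable.
	 */
	return (shutdown(fd, SHUT_WR));
}
#endif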
sowflush(struct socket *so)
	struct sockbuf *sb = &so->so_snd;
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in the
	 * future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	selthreadclear(&sb->sb_sel);

sorflush(struct socket *so)
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in the
	 * future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sflt_notify(so, sock_evt_flush_read, NULL);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof (asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
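
/*
 * Illustrative sketch (not part of the original source): the typical
 * calling pattern for sooptcopyin() inside a SOPT_SET handler, mirroring
 * the SOL_SOCKET cases in sosetoptlock() below.  "so" and "sopt" are the
 * usual socket/sockopt pair passed to such a handler; the "out" label is
 * hypothetical.
 */
#if 0	/* illustrative example only */
	int optval, error;

	/* Require at least sizeof (optval) bytes from the caller. */
	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
	if (error != 0)
		goto out;		/* EINVAL or EFAULT from copyin */
	if (optval)
		so->so_flags |= SOF_NOSIGPIPE;
	else
		so->so_flags &= ~SOF_NOSIGPIPE;
#endif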
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof (tv64))

		sopt->sopt_valsize = sizeof (tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof (tv32))

		sopt->sopt_valsize = sizeof (tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)

		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
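
/*
 * Illustrative sketch (not part of the original source): setting a receive
 * timeout from user space.  The value crosses into the kernel through
 * sooptcopyin_timeval() above, so tv_sec must be non-negative and tv_usec
 * must be in [0, 1000000), otherwise the call fails (EDOM per the Returns
 * list below).
 */
#if 0	/* user-space example only */
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, time_t seconds)
{
	struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif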
/*
 * Returns:	0			Success
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_SET)
		sopt->sopt_dir = SOPT_SET;

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */

	error = sflt_setsockopt(so, sopt);
		if (error == EJUSTRETURN)

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		error = ENOPROTOOPT;
	/*
	 * Allow socket-level (SOL_SOCKET) options to be filtered by
	 * the protocol layer, if needed.  A zero value returned from
	 * the handler means use default socket-level processing as
	 * done by the rest of this routine.  Otherwise, any other
	 * return value indicates that the option is unsupported.
	 */
	if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
	    pru_socheckopt(so, sopt)) != 0)

	switch (sopt->sopt_name) {
		error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
		so->so_linger = (sopt->sopt_name == SO_LINGER) ?
		    l.l_linger : l.l_linger * hz;
			so->so_options |= SO_LINGER;
			so->so_options &= ~SO_LINGER;

	case SO_USELOOPBACK:
	case SO_TIMESTAMP_MONOTONIC:
	case SO_WANTOOBFLAG:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_options |= sopt->sopt_name;
			so->so_options &= ~sopt->sopt_name;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */
		switch (sopt->sopt_name) {
			struct sockbuf *sb =
			    (sopt->sopt_name == SO_SNDBUF) ?
			    &so->so_snd : &so->so_rcv;
			if (sbreserve(sb, (u_int32_t)optval) == 0) {
			sb->sb_flags |= SB_USRSIZE;
			sb->sb_flags &= ~SB_AUTOSIZE;
			sb->sb_idealsize = (u_int32_t)optval;
		/*
		 * Make sure the low-water is never greater than
			so->so_snd.sb_lowat =
			    (optval > so->so_snd.sb_hiwat) ?
			    so->so_snd.sb_hiwat : optval;
			so->so_rcv.sb_lowat =
			    (optval > so->so_rcv.sb_hiwat) ?
			    so->so_rcv.sb_hiwat : optval;

		error = sooptcopyin_timeval(sopt, &tv);
		switch (sopt->sopt_name) {
			so->so_snd.sb_timeo = tv;
			so->so_rcv.sb_timeo = tv;

		error = sooptcopyin(sopt, &nke, sizeof (nke),
		error = sflt_attach_internal(so, nke.nke_handle);

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOSIGPIPE;
			so->so_flags &= ~SOF_NOSIGPIPE;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOADDRAVAIL;
			so->so_flags &= ~SOF_NOADDRAVAIL;

	case SO_REUSESHAREUID:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_REUSESHAREUID;
			so->so_flags &= ~SOF_REUSESHAREUID;

	case SO_NOTIFYCONFLICT:
		if (kauth_cred_issuser(kauth_cred_get()) == 0) {
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOTIFYCONFLICT;
			so->so_flags &= ~SOF_NOTIFYCONFLICT;

	case SO_RESTRICTIONS:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		error = so_set_restrictions(so, optval);

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0)
		error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
#endif /* MAC_SOCKET */

	case SO_UPCALLCLOSEWAIT:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_UPCALLCLOSEWAIT;
			so->so_flags &= ~SOF_UPCALLCLOSEWAIT;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_BINDRANDOMPORT;
			so->so_flags &= ~SOF_BINDRANDOMPORT;

	case SO_NP_EXTENSIONS: {
		struct so_np_extensions sonpx;

		error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
		if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
		/*
		 * Only one bit defined for now
		 */
		if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
			if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
				so->so_flags |= SOF_NPX_SETOPTSHUT;
				so->so_flags &= ~SOF_NPX_SETOPTSHUT;

	case SO_TRAFFIC_CLASS: {
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		error = so_set_traffic_class(so, optval);

	case SO_RECV_TRAFFIC_CLASS: {
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			so->so_flags |= SOF_RECV_TRAFFIC_CLASS;

	case SO_TRAFFIC_CLASS_DBG: {
		struct so_tcdbg so_tcdbg;

		error = sooptcopyin(sopt, &so_tcdbg,
		    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
		error = so_set_tcdbg(so, &so_tcdbg);

	case SO_PRIVILEGED_TRAFFIC_CLASS:
		error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
		if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
		/*
		 * Any process can set SO_DEFUNCTOK (clear
		 * SOF_NODEFUNCT), but only root can clear
		 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
		 */
		    kauth_cred_issuser(kauth_cred_get()) == 0) {
			so->so_flags &= ~SOF_NODEFUNCT;
			so->so_flags |= SOF_NODEFUNCT;

		if (SOCK_DOM(so) == PF_INET ||
		    SOCK_DOM(so) == PF_INET6) {
			char s[MAX_IPv6_STR_LEN];
			char d[MAX_IPv6_STR_LEN];
			struct inpcb *inp = sotoinpcb(so);

			SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
			    "%s:%d] is now marked as %seligible for "
			    "defunct\n", __func__, proc_selfpid(),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    (SOCK_TYPE(so) == SOCK_STREAM) ?
			    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
			    ((SOCK_DOM(so) == PF_INET) ?
			    (void *)&inp->inp_laddr.s_addr :
			    (void *)&inp->in6p_laddr), s, sizeof (s)),
			    ntohs(inp->in6p_lport),
			    inet_ntop(SOCK_DOM(so),
			    (SOCK_DOM(so) == PF_INET) ?
			    (void *)&inp->inp_faddr.s_addr :
			    (void *)&inp->in6p_faddr, d, sizeof (d)),
			    ntohs(inp->in6p_fport),
			    (so->so_flags & SOF_NODEFUNCT) ?
			SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
			    "now marked as %seligible for defunct\n",
			    __func__, proc_selfpid(),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so),
			    (so->so_flags & SOF_NODEFUNCT) ?

		/* This option is not settable */

	case SO_OPPORTUNISTIC:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		error = so_set_opportunistic(so, optval);

		/* This option is handled by lower layer(s) */

		error = sooptcopyin(sopt, &optval, sizeof (optval),
		error = so_set_recv_anyif(so, optval);

	case SO_TRAFFIC_MGT_BACKGROUND: {
		/* This option is handled by lower layer(s) */

	case SO_FLOW_DIVERT_TOKEN:
		error = flow_divert_token_set(so, sopt);
#endif /* FLOW_DIVERT */

		if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
		    sizeof (optval))) != 0)
		error = so_set_effective_pid(so, optval, sopt->sopt_p);

	case SO_DELEGATED_UUID: {
		if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
		    sizeof (euuid))) != 0)
		error = so_set_effective_uuid(so, euuid, sopt->sopt_p);

		error = ENOPROTOOPT;

	if (error == 0 && so->so_proto != NULL &&
	    so->so_proto->pr_ctloutput != NULL) {
		(void) so->so_proto->pr_ctloutput(so, sopt);

		socket_unlock(so, 1);
/* Helper routines for getsockopt */
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
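
/*
 * Illustrative sketch (not part of the original source): the matching
 * calling pattern for sooptcopyout() inside a SOPT_GET handler, as used
 * by the SOL_SOCKET cases in sogetoptlock() below.  Truncation to the
 * caller's buffer is handled by sooptcopyout() itself.
 */
#if 0	/* illustrative example only */
	int optval, error;

	optval = (so->so_flags & SOF_NOSIGPIPE) ? 1 : 0;
	error = sooptcopyout(sopt, &optval, sizeof (optval));
#endif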
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
	struct user64_timeval tv64;
	struct user32_timeval tv32;

	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof (tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
	} else {
		len = sizeof (tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
/*
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET)
		sopt->sopt_dir = SOPT_GET;

	error = sflt_getsockopt(so, sopt);
		if (error == EJUSTRETURN)

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		error = ENOPROTOOPT;
	/*
	 * Allow socket-level (SOL_SOCKET) options to be filtered by
	 * the protocol layer, if needed.  A zero value returned from
	 * the handler means use default socket-level processing as
	 * done by the rest of this routine.  Otherwise, any other
	 * return value indicates that the option is unsupported.
	 */
	if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
	    pru_socheckopt(so, sopt)) != 0)

	switch (sopt->sopt_name) {
		l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
		l.l_linger = (sopt->sopt_name == SO_LINGER) ?
		    so->so_linger : so->so_linger / hz;
		error = sooptcopyout(sopt, &l, sizeof (l));

	case SO_USELOOPBACK:
	case SO_TIMESTAMP_MONOTONIC:
	case SO_WANTOOBFLAG:
		optval = so->so_options & sopt->sopt_name;
		error = sooptcopyout(sopt, &optval, sizeof (optval));

		optval = so->so_type;

		if (so->so_proto->pr_flags & PR_ATOMIC) {
			m1 = so->so_rcv.sb_mb;
			while (m1 != NULL) {
				if (m1->m_type == MT_DATA ||
				    m1->m_type == MT_HEADER ||
				    m1->m_type == MT_OOBDATA)
					pkt_total += m1->m_len;
			optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

		optval = so->so_snd.sb_cc;

		optval = so->so_error;

		optval = so->so_snd.sb_hiwat;

		optval = so->so_rcv.sb_hiwat;

		optval = so->so_snd.sb_lowat;

		optval = so->so_rcv.sb_lowat;

		tv = (sopt->sopt_name == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
		error = sooptcopyout_timeval(sopt, &tv);

		optval = (so->so_flags & SOF_NOSIGPIPE);

		optval = (so->so_flags & SOF_NOADDRAVAIL);

	case SO_REUSESHAREUID:
		optval = (so->so_flags & SOF_REUSESHAREUID);

	case SO_NOTIFYCONFLICT:
		optval = (so->so_flags & SOF_NOTIFYCONFLICT);

	case SO_RESTRICTIONS:
		optval = so_get_restrictions(so);

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socket_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)
		error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socketpeer_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)
		error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */

#ifdef __APPLE_API_PRIVATE
	case SO_UPCALLCLOSEWAIT:
		optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);

		optval = (so->so_flags & SOF_BINDRANDOMPORT);

	case SO_NP_EXTENSIONS: {
		struct so_np_extensions sonpx;

		sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
		    SONPX_SETOPTSHUT : 0;
		sonpx.npx_mask = SONPX_MASK_VALID;
		error = sooptcopyout(sopt, &sonpx,
		    sizeof (struct so_np_extensions));

	case SO_TRAFFIC_CLASS:
		optval = so->so_traffic_class;

	case SO_RECV_TRAFFIC_CLASS:
		optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);

	case SO_TRAFFIC_CLASS_STATS:
		error = sooptcopyout(sopt, &so->so_tc_stats,
		    sizeof (so->so_tc_stats));

	case SO_TRAFFIC_CLASS_DBG:
		error = sogetopt_tcdbg(so, sopt);

	case SO_PRIVILEGED_TRAFFIC_CLASS:
		optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);

		optval = !(so->so_flags & SOF_NODEFUNCT);

		optval = (so->so_flags & SOF_DEFUNCT);

	case SO_OPPORTUNISTIC:
		optval = so_get_opportunistic(so);

		/* This option is not gettable */

		optval = so_get_recv_anyif(so);

	case SO_TRAFFIC_MGT_BACKGROUND:
		/* This option is handled by lower layer(s) */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);

	case SO_FLOW_DIVERT_TOKEN:
		error = flow_divert_token_get(so, sopt);
#endif /* FLOW_DIVERT */

		error = ENOPROTOOPT;

		socket_unlock(so, 1);
/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	if (sopt_size <= 0 || sopt_size > MCLBYTES)

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (sopt_size > MLEN) {
		if ((m->m_flags & M_EXT) == 0) {
		m->m_len = min(MCLBYTES, sopt_size);
		m->m_len = min(MLEN, sopt_size);
	sopt_size -= m->m_len;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (sopt_size > MLEN) {
			if ((m->m_flags & M_EXT) == 0) {
			m->m_len = min(MCLBYTES, sopt_size);
			m->m_len = min(MLEN, sopt_size);
		sopt_size -= m->m_len;
/* copyin sopt data into mbuf chain */
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, mtod(m, char *),
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		/* enough space should have been allocated in ip6_sooptmcopyin() */
		panic("soopt_mcopyin");

/* copyout mbuf chain data into soopt */
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(mtod(m, char *), sopt->sopt_val,
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		/* enough soopt buffer should have been provided from user-land */
	sopt->sopt_valsize = valsize;
sohasoutofband(struct socket *so)
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);

sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
#pragma unused(cred)
	struct proc *p = current_proc();

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);

	socket_unlock(so, 1);
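
/*
 * Illustrative sketch (not part of the original source): how the event
 * mapping in sopoll() above appears to a user-space poll(2) caller.
 * POLLPRI/POLLRDBAND are reported while urgent (out-of-band) data is
 * pending, i.e. when so_oobmark is set or the socket is at the mark.
 */
#if 0	/* user-space example only */
#include <poll.h>

static int
wait_for_data_or_oob(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	/* Returns > 0 when normal or out-of-band data becomes readable. */
	return (poll(&pfd, 1, timeout_ms));
}
#endif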
soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
#if !CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
		socket_unlock(so, 1);
#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {
		kn->kn_fop = &soread_filtops;
		skl = &so->so_rcv.sb_sel.si_note;
		kn->kn_fop = &sowrite_filtops;
		skl = &so->so_snd.sb_sel.si_note;
		kn->kn_fop = &sock_filtops;
		skl = &so->so_klist;
		socket_unlock(so, 1);

	if (KNOTE_ATTACH(skl, kn)) {
		switch (kn->kn_filter) {
			so->so_rcv.sb_flags |= SB_KNOTE;
			so->so_snd.sb_flags |= SB_KNOTE;
			so->so_flags |= SOF_KNOTE;
			socket_unlock(so, 1);
	socket_unlock(so, 1);

filt_sordetach(struct knote *kn)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
filt_soread(struct knote *kn, long hint)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter.  This allows listen() to be
		 * called after registering the kqueue EVFILT_READ.
		 */
		kn->kn_data = so->so_qlen;
		isempty = ! TAILQ_EMPTY(&so->so_comp);

		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	/* socket isn't a listener */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (so->so_oobmark) {
		if (kn->kn_flags & EV_OOBAND) {
			kn->kn_data -= so->so_oobmark;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
		kn->kn_data = so->so_oobmark;
		kn->kn_flags |= EV_OOBAND;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (so->so_state & SS_RCVATMARK) {
		if (kn->kn_flags & EV_OOBAND) {
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
		kn->kn_flags |= EV_OOBAND;
	} else if (kn->kn_flags & EV_OOBAND) {
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	int64_t lowwat = so->so_rcv.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
			lowwat = so->so_rcv.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
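
/*
 * Illustrative sketch (not part of the original source): registering the
 * EVFILT_READ filter handled by filt_soread() above, with NOTE_LOWAT so
 * that kn_sdata supplies the low-water mark used in the readiness test.
 */
#if 0	/* user-space example only */
#include <sys/event.h>
#include <sys/time.h>

static int
wait_until_readable(int kq, int fd, int64_t lowat)
{
	struct kevent kev;

	/* Fire only once at least "lowat" bytes are available to read. */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, NOTE_LOWAT,
	    lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif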
filt_sowdetach(struct knote *kn)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);

so_wait_for_if_feedback(struct socket *so)
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    (so->so_state & SS_ISCONNECTED)) {
		struct inpcb *inp = sotoinpcb(so);
		if (INP_WAIT_FOR_IF_FEEDBACK(inp))

filt_sowrite(struct knote *kn, long hint)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;

	if (so->so_error) {	/* temporary udp error */

	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {

	int64_t lowwat = so->so_snd.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat)
			lowwat = so->so_snd.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;

	if (kn->kn_data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
			ret = tcp_notsent_lowat_check(so);

	if (so_wait_for_if_feedback(so))

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
filt_sockdetach(struct knote *kn)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((so->so_flags & SOF_KNOTE) != 0)
		if (KNOTE_DETACH(&so->so_klist, kn))
			so->so_flags &= ~SOF_KNOTE;
	socket_unlock(so, 1);

filt_sockev(struct knote *kn, long hint)
	int ret = 0, locked = 0;
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	long ev_hint = (hint & SO_FILT_HINT_EV);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {

	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		if (kn->kn_sfflags & NOTE_CONNRESET)
			kn->kn_fflags |= NOTE_CONNRESET;
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		if (kn->kn_sfflags & NOTE_TIMEOUT)
			kn->kn_fflags |= NOTE_TIMEOUT;
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		if (kn->kn_sfflags & NOTE_NOSRCADDR)
			kn->kn_fflags |= NOTE_NOSRCADDR;
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		if ((kn->kn_sfflags & NOTE_IFDENIED))
			kn->kn_fflags |= NOTE_IFDENIED;
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		if (kn->kn_sfflags & NOTE_KEEPALIVE)
			kn->kn_fflags |= NOTE_KEEPALIVE;
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
			kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
			kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	if (ev_hint & SO_FILT_HINT_CONNECTED) {
		if (kn->kn_sfflags & NOTE_CONNECTED)
			kn->kn_fflags |= NOTE_CONNECTED;
	if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
		if (kn->kn_sfflags & NOTE_DISCONNECTED)
			kn->kn_fflags |= NOTE_DISCONNECTED;
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO) &&
		    (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;

	if ((kn->kn_sfflags & NOTE_READCLOSED) &&
	    (so->so_state & SS_CANTRCVMORE))
		kn->kn_fflags |= NOTE_READCLOSED;

	if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
	    (so->so_state & SS_CANTSENDMORE))
		kn->kn_fflags |= NOTE_WRITECLOSED;

	if ((kn->kn_sfflags & NOTE_SUSPEND) &&
	    ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED))) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
		kn->kn_fflags |= NOTE_SUSPEND;

	if ((kn->kn_sfflags & NOTE_RESUME) &&
	    ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
		kn->kn_fflags |= NOTE_RESUME;

	if (so->so_error != 0) {
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));

	if (kn->kn_fflags != 0)

		socket_unlock(so, 1);
get_sockev_state(struct socket *so, u_int32_t *statep)
	u_int32_t state = *(statep);

	if (so->so_state & SS_ISCONNECTED)
		state |= SOCKEV_CONNECTED;
		state &= ~(SOCKEV_CONNECTED);
	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);

#define	SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof (lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	return (lock_history_str);
socket_lock(struct socket *so, int refcount)
	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;

socket_unlock(struct socket *so, int refcount)
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);

	if (so && so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));

			if (so->so_usecount == 0)
				sofreelastref(so, 1);
		lck_mtx_unlock(mutex_held);
/* Called with socket locked, will unlock socket */
sofree(struct socket *so)
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);

soreference(struct socket *so)
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */

sodereference(struct socket *so)
	socket_unlock(so, 1);

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 */
somultipages(struct socket *so, boolean_t set)
		so->so_flags |= SOF_MULTIPAGES;
		so->so_flags &= ~SOF_MULTIPAGES;

so_isdstlocal(struct socket *so) {
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET)
		return (inaddr_local(inp->inp_faddr));
	else if (SOCK_DOM(so) == PF_INET6)
		return (in6addr_local(&inp->in6p_faddr));
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	defunct = (so->so_flags & SOF_DEFUNCT);
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);

	if (so->so_flags & SOF_NODEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
			    "so 0x%llx [%d,%d] is not eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
			    level, (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err));
		so->so_flags &= ~SOF_NODEFUNCT;
		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
		    "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so)));

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);

	SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
	    "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
	    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
	    defunct ? "is already" : "marked as"));

sodefunct(struct proc *p, struct socket *so, int level)
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
	if (so->so_state & SS_DEFUNCT)

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
		    "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
		    s, sizeof (s)), ntohs(inp->in6p_lport),
		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
		    d, sizeof (d)), ntohs(inp->in6p_fport),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags,
		    rcv->sb_flags, snd->sb_flags));
		SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
		    "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
		    "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
		    proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	if (rcv->sb_flags & SB_LOCK)
		sbunlock(rcv, TRUE);	/* keep socket locked */
	if (snd->sb_flags & SB_LOCK)
		sbunlock(snd, TRUE);	/* keep socket locked */

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock(so, SHUT_RD);
	(void) soshutdownlock(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_flags & SS_ISDISCONNECTED))
		(void) soisdisconnected(so);

	if (so->so_error == 0)
		so->so_error = EBADF;

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
	so->so_state |= SS_DEFUNCT;
__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
	if (SOCK_DOM(so) == PF_INET) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;

__private_extern__ int
so_get_recv_anyif(struct socket *so)
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
	if (SOCK_DOM(so) == PF_INET) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, a SO_RESTRICT_DENY_CELLULAR
	 * restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);

	/* other than deny cellular, there's nothing more to do */
	if ((nocell_new - nocell_old) == 0)
		return (0);

	/* we can only set, not clear restrictions */
	VERIFY((nocell_new - nocell_old) > 0);

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		/* if deny cellular is now set, do what's needed for INPCB */
		inp_set_nocellular(sotoinpcb(so));
	}

	return (0);
}
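
/*
 * Return the deny-type restrictions currently recorded on the socket as a
 * mask of SO_RESTRICT_DENY_{IN,OUT,CELLULAR} bits.
 */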
uint32_t
so_get_restrictions(struct socket *so)
{
	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
}
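
/*
 * sockaddr_entry/sockaddr_list helpers.  A sockaddr_list is a tail queue
 * of sockaddr_entry nodes, each owning a separately-allocated sockaddr;
 * the routines below allocate, free, duplicate, insert and remove entries
 * while keeping sl_cnt consistent with the queue contents.
 *
 * Allocate an empty sockaddr_entry from its zone; the allocation may block
 * only when `how' is M_WAITOK.
 */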
struct sockaddr_entry *
sockaddrentry_alloc(int how)
{
	struct sockaddr_entry *se;

	se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
	if (se != NULL)
		bzero(se, se_zone_size);

	return (se);
}
void
sockaddrentry_free(struct sockaddr_entry *se)
{
	if (se->se_addr != NULL) {
		FREE(se->se_addr, M_SONAME);
		se->se_addr = NULL;
	}
	zfree(se_zone, se);
}
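
/*
 * Duplicate a sockaddr_entry, deep-copying its sockaddr.  Returns NULL if
 * either the entry or the embedded sockaddr cannot be allocated.
 */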
struct sockaddr_entry *
sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
{
	struct sockaddr_entry *dst_se;

	dst_se = sockaddrentry_alloc(how);
	if (dst_se != NULL) {
		int len = src_se->se_addr->sa_len;

		MALLOC(dst_se->se_addr, struct sockaddr *,
		    len, M_SONAME, how | M_ZERO);
		if (dst_se->se_addr != NULL) {
			bcopy(src_se->se_addr, dst_se->se_addr, len);
		} else {
			sockaddrentry_free(dst_se);
			dst_se = NULL;
		}
	}

	return (dst_se);
}
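
/*
 * Allocate an empty sockaddr_list and initialize its tail queue head; the
 * allocation may block only when `how' is M_WAITOK.
 */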
struct sockaddr_list *
sockaddrlist_alloc(int how)
{
	struct sockaddr_list *sl;

	sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
	if (sl != NULL) {
		bzero(sl, sl_zone_size);
		TAILQ_INIT(&sl->sl_head);
	}

	return (sl);
}
void
sockaddrlist_free(struct sockaddr_list *sl)
{
	struct sockaddr_entry *se, *tse;

	TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
		sockaddrlist_remove(sl, se);
		sockaddrentry_free(se);
	}
	VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
	zfree(sl_zone, sl);
}
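
/*
 * Attach an entry to the tail of a list, or detach it again; both routines
 * assert the entry's SEF_ATTACHED state and keep sl_cnt in step with the
 * queue.
 */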
void
sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
{
	VERIFY(!(se->se_flags & SEF_ATTACHED));
	se->se_flags |= SEF_ATTACHED;
	TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
	sl->sl_cnt++;
	VERIFY(sl->sl_cnt != 0);
}
void
sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
{
	VERIFY(se->se_flags & SEF_ATTACHED);
	se->se_flags &= ~SEF_ATTACHED;
	VERIFY(sl->sl_cnt != 0);
	sl->sl_cnt--;
	TAILQ_REMOVE(&sl->sl_head, se, se_link);
}
struct sockaddr_list *
sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
{
	struct sockaddr_entry *src_se, *tse;
	struct sockaddr_list *dst_sl;

	dst_sl = sockaddrlist_alloc(how);
	if (dst_sl == NULL)
		return (NULL);

	TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
		struct sockaddr_entry *dst_se;

		if (src_se->se_addr == NULL)
			continue;

		dst_se = sockaddrentry_dup(src_se, how);
		if (dst_se == NULL) {
			sockaddrlist_free(dst_sl);
			return (NULL);
		}

		sockaddrlist_insert(dst_sl, dst_se);
	}
	VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);

	return (dst_sl);
}
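
/*
 * Record an "effective" (delegated) pid on the socket so that networking
 * policy decisions can be made against the process the traffic is really
 * generated for, rather than the socket's owner.  This is normally driven
 * by a privileged socket option (SO_DELEGATED in this code base, as far as
 * this excerpt shows); issuers other than the socket's recorded owner must
 * hold PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 */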
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Release the reference taken by proc_find() above */
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
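
/*
 * Same idea as so_set_effective_pid(), but the delegate is identified only
 * by an executable UUID (presumably via a companion socket option such as
 * SO_DELEGATED_UUID; the entry point is outside this excerpt).  Since no
 * pid is available, the socket's real {pid, upid} are inherited as the
 * effective identity.
 */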
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	return (error);
}
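
/*
 * Post a network-policy change as a kernel event on KEV_NETWORK_CLASS /
 * KEV_NETPOLICY_SUBCLASS so that interested user-space listeners can
 * react.  A sketch of a caller, using a hypothetical event structure and
 * event code purely for illustration (neither is defined in this excerpt):
 *
 *	struct kev_netpolicy_ifdenied ev;
 *
 *	bzero(&ev, sizeof (ev));
 *	(fill in ev.ev_data and any event-specific fields here)
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev.ev_data, sizeof (ev));
 */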
void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}