/*
 * Copyright (c) 1998-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>

#if CONFIG_MACF
#include <security/mac.h>
#include <security/mac_framework.h>
#endif /* MAC */

/* how a timeval looks to a 64-bit process */
struct timeval64 {
	int64_t	tv_sec;
	int32_t	tv_usec;
};

int		so_cache_hw = 0;
int		so_cache_timeouts = 0;
int		so_cache_max_freed = 0;
int		cached_sock_count = 0;
struct socket	*socket_cache_head = 0;
struct socket	*socket_cache_tail = 0;
u_long		so_cache_time = 0;
int		so_cache_init_done = 0;
struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
lck_mtx_t		*so_cache_mtx;

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p);

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

#define	EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
    0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
    &sosendjcl_ignore_capab, 0, "");
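
/*
 * Illustrative usage (not part of the original source): the knobs above
 * surface under the kern.ipc sysctl node, so from user space one would
 * tune them with something like
 *
 *	sysctl -w kern.ipc.sosendjcl=0
 *	sysctl -w kern.ipc.sosendjcl_ignore_capab=1	# testing only
 *
 * The OID names follow from the SYSCTL_INT() declarations above.
 */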

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/* sys_generic.c */
extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);
extern struct domain *pffinddomain(int);
extern struct protosw *pffindprotonotype(int, int);
extern int soclose_locked(struct socket *);
extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);

#ifdef __APPLE__

vm_size_t	so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);
static void so_cache_timer(void *);

void soclose_wait_locked(struct socket *so);


void
socketinit(void)
{
	vm_size_t str_size;

	if (so_cache_init_done) {
		printf("socketinit: already called...\n");
		return;
	}

	PE_parse_boot_arg("socket_debug", &socket_debug);

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();

	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	so_cache_init_done = 1;

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);

	if (so_cache_mtx == NULL)
		return;		/* we're hosed... */

	str_size = (vm_size_t)(sizeof (struct socket) + 4 +
	    get_inpcb_str_size() + 4 + get_tcp_str_size());
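
	/*
	 * Descriptive note: each zone element packs three structures into
	 * one allocation -- the socket itself, an inpcb, and a TCP pcb --
	 * with up to 4 bytes of padding between them for alignment.
	 * cached_sock_alloc() below carves an element up along these
	 * offsets.
	 */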

	so_cache_zone = zinit(str_size, 120000 * str_size, 8192, "socache zone");
#if TEMPDEBUG
	printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
#endif
	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));

	so_cache_zone_element_size = str_size;

	sflt_init();
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t	temp;
	register u_long offset;

	lck_mtx_lock(so_cache_mtx);

	if (cached_sock_count) {
		cached_sock_count--;
		*so = socket_cache_head;
		if (*so == 0)
			panic("cached_sock_alloc: cached sock is null");

		socket_cache_head = socket_cache_head->cache_next;
		if (socket_cache_head)
			socket_cache_head->cache_prev = 0;
		else
			socket_cache_tail = 0;

		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));
#if TEMPDEBUG
		kprintf("cached_sock_alloc - retrieving cached sock %p - "
		    "count == %d\n", *so, cached_sock_count);
#endif
		(*so)->so_saved_pcb = temp;
		(*so)->cached_in_sock_layer = 1;
	} else {
#if TEMPDEBUG
		kprintf("Allocating cached sock %p from memory\n", *so);
#endif

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == 0)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our single block of
		 * memory.  Align extra structures on longword boundaries.
		 */
		offset = (u_long) *so;
		offset += sizeof (struct socket);
		if (offset & 0x3) {
			offset += 4;
			offset &= 0xfffffffc;
		}
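
		/*
		 * Note (descriptive): the rounding above pads to the next
		 * 4-byte boundary; the 0xfffffffc mask assumes 32-bit
		 * pointers.
		 */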
		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();
		if (offset & 0x3) {
			offset += 4;
			offset &= 0xfffffffc;
		}

		((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
#if TEMPDEBUG
		kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
		    *so, (*so)->so_saved_pcb,
		    ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
#endif
	}

	(*so)->cached_in_sock_layer = 1;
}

static void
cached_sock_free(struct socket *so)
{

	lck_mtx_lock(so_cache_mtx);

	if (++cached_sock_count > MAX_CACHED_SOCKETS) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
#if TEMPDEBUG
		kprintf("Freeing overflowed cached socket %p\n", so);
#endif
		zfree(so_cache_zone, so);
	} else {
#if TEMPDEBUG
		kprintf("Freeing socket %p into cache\n", so);
#endif
		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		so->cache_next = socket_cache_head;
		so->cache_prev = 0;
		if (socket_cache_head)
			socket_cache_head->cache_prev = so;
		else
			socket_cache_tail = so;

		so->cache_timestamp = so_cache_time;
		socket_cache_head = so;
		lck_mtx_unlock(so_cache_mtx);
	}

#if TEMPDEBUG
	kprintf("Freed cached sock %p into cache - count is %d\n",
	    so, cached_sock_count);
#endif
}

static void
so_cache_timer(__unused void *dummy)
{
	register struct socket	*p;
	register int		n_freed = 0;

	lck_mtx_lock(so_cache_mtx);

	++so_cache_time;

	while ((p = socket_cache_tail)) {
		if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
			break;

		so_cache_timeouts++;

		if ((socket_cache_tail = p->cache_prev))
			p->cache_prev->cache_next = 0;
		if (--cached_sock_count == 0)
			socket_cache_head = 0;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}
	lck_mtx_unlock(so_cache_mtx);

	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
}
#endif /* __APPLE__ */

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	/* XXX race condition for reentrant kernel */
	//###LD Atomic add for so_gencnt
	if (so != NULL) {
		so->so_gencnt = ++so_gencnt;
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:EISCONN[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = current_proc();
	register struct protosw *prp;
	register struct socket *so;
	register int error = 0;
#if TCPDEBUG
	extern int tcpconsdebug;
#endif
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
		if (pffinddomain(dom) == NULL) {
			return (EAFNOSUPPORT);
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return (EPROTOTYPE);
			}
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0, dom, type);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;

	if (p != 0) {
		so->so_uid = kauth_cred_getuid(kauth_cred_get());
		if (!suser(kauth_cred_get(), NULL))
			so->so_state = SS_PRIV;
	}
	so->so_proto = prp;
#ifdef __APPLE__
	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
#endif
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	//### Attachment will create the per pcb lock if necessary and increase refcount
	/*
	 * for creation, make sure it's done before
	 * socket is inserted in lists
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}
#ifdef __APPLE__
	prp->pr_domain->dom_refs++;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
#endif
	*aso = so;
	return (0);
}
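
/*
 * Example (illustrative, not part of the original source): the socket(2)
 * system call path reaches this function as roughly
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * where proto == 0 means "use the default protocol for this type", which
 * is resolved via pffindtype() above.
 */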

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobind(struct socket *so, struct sockaddr *nam)
{
	struct proc *p = current_proc();
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	socket_lock(so, 1);

	/*
	 * If this is a bind request on a previously-accepted socket
	 * that has been marked as inactive, reject it now before
	 * we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		goto out;
	}

	/* Socket filter */
	error = 0;
	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_bind) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_bind(filter->sfe_cookie, so, nam);
		}
	}
	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);
	}
	/* End socket filter */

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}
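
/*
 * Descriptive note: the filter walk above is a pattern repeated throughout
 * this file (solisten, soconnectlock, soacceptfilter, sosend).  The first
 * filter with a matching callback causes the socket lock to be dropped and
 * the socket pinned via sflt_use() for the duration of the callbacks; a
 * filter returning EJUSTRETURN means it consumed the operation, so the
 * error is cleared and the protocol request is skipped.
 */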

void
sodealloc(struct socket *so)
{
	so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */
	if (so->cached_in_sock_layer == 1) {
		cached_sock_free(so);
	} else {
		if (so->cached_in_sock_layer == -1)
			panic("sodealloc: double dealloc: so=%p\n", so);
		so->cached_in_sock_layer = -1;
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	socket_lock(so, 1);
	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a previously-accepted socket that has
	 * been marked as inactive, reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
		error = EPERM;
		goto out;
	}

	error = 0;
	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_listen) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_listen(filter->sfe_cookie, so);
		}
	}
	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);
	}

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
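
	/*
	 * Note (descriptive): a backlog of 0 or less is clamped up to
	 * somaxconn here, i.e. the implementation-defined value referred
	 * to in the POSIX excerpt above.
	 */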
out:
	socket_unlock(so, 1);
	return (error);
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	/* Remove any filters - may be called more than once */
	sflt_termsock(so);

	if ((!(so->so_flags & SOF_PCBCLEARING)) ||
	    ((so->so_state & SS_NOFDREF) == 0)) {
#ifdef __APPLE__
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~SB_UPCALL;
		so->so_snd.sb_flags &= ~SB_UPCALL;
#endif
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
#ifdef __APPLE__
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~SB_UPCALL;
			so->so_snd.sb_flags &= ~SB_UPCALL;
#endif
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
		socket_unlock(head, 1);
	}
#ifdef __APPLE__
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
#endif
	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;

	if (dealloc)
		sodealloc(so);
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/* Double check here and return if there's no outstanding upcall */
	if (!(so->so_flags & SOF_UPCALLINUSE))
		return;

	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int socklock = 0;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/* Radar 5350314
			 * Skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/* For lock ordering consistency with the
				 * rest of the stack, we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
				socklock = 1;
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (socklock)
				socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
				socket_lock(so, 0);
			}
		}
	}
	if (so->so_pcb == 0) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
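				/*
				 * Descriptive note: the conversion above
				 * treats so_linger as hundredths of a
				 * second; the whole seconds are
				 * so_linger/100 and each remaining unit is
				 * 10^7 ns (NSEC_PER_USEC * 1000 * 10).
				 */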
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0)
		panic("soclose: usecount is zero so=%p\n", so);
	if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0)
		panic("soclose: usecount is zero so=%p\n", so);
discard:
	if (so->so_pcb && so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
#ifdef __APPLE__
	so->so_proto->pr_domain->dom_refs--;
	evsofree(so);
#endif
	so->so_usecount--;
	sofree(so);
	return (error);
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_flags & SOF_UPCALLINUSE)
		soclose_wait_locked(so);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference.
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return (error);
		}
	}
	return (0);
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}

int
soacceptfilter(struct socket *so)
{
	struct sockaddr *local = NULL, *remote = NULL;
	struct socket_filter_entry *filter;
	int error = 0, filtered = 0;
	struct socket *head = so->so_head;

	/*
	 * There's no need to hold the lock; this socket
	 * has not been made visible to the filter(s).
	 */
	if ((sock_getaddr(so, &remote, 1) != 0) ||
	    sock_getaddr(so, &local, 0) != 0) {
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	/*
	 * At this point, we have a reference on the listening socket
	 * so we know it won't be going away.  Do the same for the newly
	 * accepted socket while we invoke the accept callback routine.
	 */
	socket_lock(so, 1);
	for (filter = so->so_filt; filter != NULL && error == 0;
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
			if (!filtered) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_accept(filter->sfe_cookie,
			    head, so, local, remote);
		}
	}

	if (filtered) {
		socket_lock(so, 0);
		sflt_unuse(so);
	}

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		so->so_flags |= SOF_DEFUNCT;
		/* Prevent data from being appended to the socket buffers */
		so->so_snd.sb_flags |= SB_DROP;
		so->so_rcv.sb_flags |= SB_DROP;
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return (error);
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		if (dolock)
			socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		struct socket_filter_entry *filter;
		int filtered = 0;

		error = 0;
		for (filter = so->so_filt; filter && (error == 0);
		    filter = filter->sfe_next_onsocket) {
			if (filter->sfe_filter->sf_filter.sf_connect_out) {
				if (filtered == 0) {
					filtered = 1;
					sflt_use(so);
					socket_unlock(so, 0);
				}
				error = filter->sfe_filter->sf_filter.
				    sf_connect_out(filter->sfe_cookie, so, nam);
			}
		}
		if (filtered != 0) {
			socket_lock(so, 0);
			sflt_unuse(so);
		}

		if (error) {
			if (error == EJUSTRETURN)
				error = 0;
			if (dolock)
				socket_unlock(so, 1);
			return (error);
		}

		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}

int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);

	if (error == 0) {
		sflt_notify(so, sock_evt_disconnected, NULL);
	}
bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
static int
sosendcheck(struct socket *so, struct sockaddr *addr, long resid, long clen,
    long atomic, int flags, int *sblocked)
{
	int	error = 0;
	long	space;
	int	assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue.  Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				return (error);
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT)
		return (ENOTCONN);

	if (so->so_state & SS_CANTSENDMORE)
		return (EPIPE);

	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0))
				return (ENOTCONN);
		} else if (addr == 0 && !(flags & MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);
	if (space < resid + clen &&
	    (atomic || space < (long)so->so_snd.sb_lowat || space < clen)) {
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return (EWOULDBLOCK);
		}
		sbunlock(&so->so_snd, 1);
		error = sbwait(&so->so_snd);
		if (error) {
			return (error);
		}
		goto restart;
	}

	return (0);
}

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 * Experiment:
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *		EOPNOTSUPP
 *		EINVAL
 *		ENOBUFS
 *	uiomove:EFAULT
 *	sosendcheck:EPIPE
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:EINTR
 *	sosendcheck:EBADF
 *	sosendcheck:EINTR
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	register struct mbuf *m, *freelist = NULL;
	register long space, len, resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();

	if (uio) {
		// LP64todo - fix this!
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}
	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		socket_unlock(so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		socket_unlock(so, 1);
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		OSIncrementAtomic(&p->p_stats->p_ru.ru_msgsnd);
	if (control)
		clen = control->m_len;

	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error) {
			goto release;
		}
		mp = &top;
		space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
		    1024 : 0);

		do {
			struct socket_filter_entry *filter;
			int filtered;
			boolean_t recursive;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;

				bytes_to_copy = min(resid, space);

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab);

				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == 0) ? 1 : 0;

					/*
					 * Try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write; the list is further limited to
					 * the number that are currently needed
					 * to fill the socket.  This mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs.  If we fail early (or
					 * miscalculate the number needed), make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_copy > NBPG && jumbocl) {
						num_needed =
						    bytes_to_copy / M16KCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MCLBYTES) {
						num_needed =
						    bytes_to_copy / NBPG;

						if ((bytes_to_copy -
						    (num_needed * NBPG)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    NBPG);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MINCLSIZE) {
						num_needed =
						    bytes_to_copy / MCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
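
					/*
					 * Allocation cascade (descriptive):
					 * 16KB jumbo clusters, then 4KB
					 * (NBPG) clusters, then 2KB
					 * (MCLBYTES) clusters, and finally a
					 * single mbuf below; each step is
					 * tried only if the previous one
					 * left freelist empty.
					 */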

					if (freelist == NULL) {
						if (top == 0)
							MGETHDR(freelist,
							    M_WAIT, MT_DATA);
						else
							MGET(freelist,
							    M_WAIT, MT_DATA);

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto release;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT))
						mlen = m->m_ext.ext_size;
					else if ((m->m_flags & M_PKTHDR))
						mlen =
						    MHLEN - m_leadingspace(m);
					else
						mlen = MLEN;
					len = min(mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					// LP64todo - fix this!
					resid = uio_resid(uio);

					m->m_len = len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error)
						break;
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR)
							top->m_flags |= M_EOR;
						break;
					}
					bytes_to_copy = min(resid, space);

				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error)
					goto release;
			}

			if (flags & (MSG_HOLD|MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				register struct mbuf *mb1;
				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp)
					so->so_tail->m_next = top;
				else
					so->so_temp = top;
				mb1 = top;
				while (mb1->m_next)
					mb1 = mb1->m_next;
				so->so_tail = mb1;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto release;
				}
				top = so->so_temp;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;

			/* Compute flags here, for pru_send and NKEs */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag, and there is nothing
			     * left to send, then use PRU_SEND_EOF instead
			     * of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ?
			    PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			/*
			 * Socket filter processing
			 */
			recursive = (so->so_send_filt_thread != NULL);
			filtered = 0;
			error = 0;
			for (filter = so->so_filt; filter && (error == 0);
			    filter = filter->sfe_next_onsocket) {
				if (filter->sfe_filter->sf_filter.sf_data_out) {
					int so_flags = 0;
					if (filtered == 0) {
						filtered = 1;
						so->so_send_filt_thread =
						    current_thread();
						sflt_use(so);
						socket_unlock(so, 0);
						so_flags =
						    (sendflags & MSG_OOB) ?
						    sock_data_filt_flag_oob : 0;
					}
					error = filter->sfe_filter->sf_filter.
					    sf_data_out(filter->sfe_cookie, so,
					    addr, &top, &control, so_flags);
				}
			}

			if (filtered) {
				/*
				 * At this point, we've run at least one
				 * filter.  The socket is unlocked as is
				 * the socket buffer.  Clear the recorded
				 * filter thread only when we are outside
				 * of a filter's context.  This allows for
				 * a filter to issue multiple inject calls
				 * from its sf_data_out callback routine.
				 */
				socket_lock(so, 0);
				sflt_unuse(so);
				if (!recursive)
					so->so_send_filt_thread = 0;
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = 0;
						top = 0;
					}

					goto release;
				}
			}
			/*
			 * End Socket filter processing
			 */

			if (error == EJUSTRETURN) {
				/* A socket filter handled this data */
				error = 0;
			} else {
				error = (*so->so_proto->pr_usrreqs->pru_send)
				    (so, sendflags, top, addr, control, p);
			}
#ifdef __APPLE__
			if (flags & MSG_SEND)
				so->so_temp = NULL;
#endif
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	if (sblocked)
		sbunlock(&so->so_snd, 0);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	if (freelist)
		m_freem_list(freelist);

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
	    space, error);

	return (error);
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *		ENOBUFS
 *		ENOTCONN
 *		EWOULDBLOCK
 *	uiomove:EFAULT
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pru_rcvoob>:???
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
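/*
 * Sketch of the receive-buffer record layout described above (descriptive
 * only):
 *
 *	sb_mb -> [address mbuf] -> [control mbuf(s)] -> [data mbuf(s)]
 *	            |                             (linked via m_next)
 *	            | m_nextpkt
 *	            v
 *	         next record -> ...
 */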
1890 int
1891 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1892 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1893 {
1894 register struct mbuf *m, **mp, *ml = NULL;
1895 register int flags, len, error, offset;
1896 struct protosw *pr = so->so_proto;
1897 struct mbuf *nextrecord;
1898 int moff, type = 0;
1899 // LP64todo - fix this!
1900 int orig_resid = uio_resid(uio);
1901 struct mbuf *free_list;
1902 int delayed_copy_len;
1903 int can_delay;
1904 int need_event;
1905 struct proc *p = current_proc();
1906
1907 // LP64todo - fix this!
1908 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1909 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1910
1911 socket_lock(so, 1);
1912
1913 #ifdef MORE_LOCKING_DEBUG
1914 if (so->so_usecount == 1)
1915 panic("soreceive: so=%x no other reference on socket\n", so);
1916 #endif
1917 mp = mp0;
1918 if (psa)
1919 *psa = 0;
1920 if (controlp)
1921 *controlp = 0;
1922 if (flagsp)
1923 flags = *flagsp &~ MSG_EOR;
1924 else
1925 flags = 0;
1926
1927 /*
1928 * If a recv attempt is made on a previously-accepted socket
1929 * that has been marked as inactive (disconnected), reject
1930 * the request.
1931 */
1932 if (so->so_flags & SOF_DEFUNCT) {
1933 struct sockbuf *sb = &so->so_rcv;
1934
1935 /*
1936 * This socket should have been disconnected and flushed
1937 * prior to being returned from accept; there should be
1938 * no data on its receive list, so panic otherwise.
1939 */
1940 sb_empty_assert(sb, __func__);
1941 socket_unlock(so, 1);
1942 return (ENOTCONN);
1943 }
1944
1945 /*
1946 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1947 * regardless of the flags argument. Here is the case were
1948 * out-of-band data is not inline.
1949 */
1950 if ((flags & MSG_OOB) ||
1951 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1952 (so->so_options & SO_OOBINLINE) == 0 &&
1953 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1954 m = m_get(M_WAIT, MT_DATA);
1955 if (m == NULL) {
1956 socket_unlock(so, 1);
1957 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1958 ENOBUFS, 0, 0, 0, 0);
1959 return (ENOBUFS);
1960 }
1961 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1962 if (error)
1963 goto bad;
1964 socket_unlock(so, 0);
1965 do {
1966 // LP64todo - fix this!
1967 error = uiomove(mtod(m, caddr_t),
1968 (int)min(uio_resid(uio), m->m_len), uio);
1969 m = m_free(m);
1970 } while (uio_resid(uio) && error == 0 && m);
1971 socket_lock(so, 0);
1972 bad:
1973 if (m)
1974 m_freem(m);
1975 #ifdef __APPLE__
1976 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1977 if (error == EWOULDBLOCK || error == EINVAL) {
1978 /*
1979 * Let's try to get normal data:
1980 * EWOULDBLOCK: out-of-band data not
1981 * receive yet. EINVAL: out-of-band data
1982 * already read.
1983 */
1984 error = 0;
1985 goto nooob;
1986 } else if (error == 0 && flagsp) {
1987 *flagsp |= MSG_OOB;
1988 }
1989 }
1990 socket_unlock(so, 1);
1991 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
1992 0, 0, 0, 0);
1993 #endif
1994 return (error);
1995 }
1996 nooob:
1997 if (mp)
1998 *mp = (struct mbuf *)0;
1999 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2000 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2001
2002
2003 free_list = (struct mbuf *)0;
2004 delayed_copy_len = 0;
2005 restart:
2006 #ifdef MORE_LOCKING_DEBUG
2007 if (so->so_usecount <= 1)
2008 printf("soreceive: sblock so=%p ref=%d on socket\n",
2009 so, so->so_usecount);
2010 #endif
2011 /*
2012 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2013 * and if so just return to the caller. This could happen when
2014 * soreceive() is called by a socket upcall function during the
2015 * time the socket is freed. The socket buffer would have been
2016 * locked across the upcall, therefore we cannot put this thread
2017 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2018 * we may livelock), because the lock on the socket buffer will
2019 * only be released when the upcall routine returns to its caller.
2020 * Because the socket has been officially closed, there can be
2021 * no further read on it.
2022 */
2023 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2024 (SS_NOFDREF | SS_CANTRCVMORE)) {
2025 socket_unlock(so, 1);
2026 return (0);
2027 }
2028
2029 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2030 if (error) {
2031 socket_unlock(so, 1);
2032 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2033 0, 0, 0, 0);
2034 return (error);
2035 }
2036
2037 m = so->so_rcv.sb_mb;
2038 /*
2039 * If we have less data than requested, block awaiting more
2040 * (subject to any timeout) if:
2041 * 1. the current count is less than the low water mark, or
2042 * 2. MSG_WAITALL is set, and it is possible to do the entire
2043 * receive operation at once if we block (resid <= hiwat), and
2044 * 3. MSG_DONTWAIT is not set.
2045 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2046 * we have to do the receive in sections, and thus risk returning
2047 * a short count if a timeout or signal occurs after we start.
2048 */
2049 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2050 so->so_rcv.sb_cc < uio_resid(uio)) &&
2051 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2052 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2053 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2054 /*
2055 * Panic if we notice inconsistencies in the socket's
2056 * receive list; both sb_mb and sb_cc should correctly
2057 * reflect the contents of the list, otherwise we may
2058 * end up with false positives during select() or poll()
2059 * which could put the application in a bad state.
2060 */
2061 if (m == NULL && so->so_rcv.sb_cc != 0)
2062 panic("soreceive corrupted so_rcv: m %p cc %lu",
2063 m, so->so_rcv.sb_cc);
2064
2065 if (so->so_error) {
2066 if (m)
2067 goto dontblock;
2068 error = so->so_error;
2069 if ((flags & MSG_PEEK) == 0)
2070 so->so_error = 0;
2071 goto release;
2072 }
2073 if (so->so_state & SS_CANTRCVMORE) {
2074 if (m)
2075 goto dontblock;
2076 else
2077 goto release;
2078 }
2079 for (; m; m = m->m_next)
2080 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2081 m = so->so_rcv.sb_mb;
2082 goto dontblock;
2083 }
2084 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2085 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2086 error = ENOTCONN;
2087 goto release;
2088 }
2089 if (uio_resid(uio) == 0)
2090 goto release;
2091 if ((so->so_state & SS_NBIO) ||
2092 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2093 error = EWOULDBLOCK;
2094 goto release;
2095 }
2096 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2097 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2098 sbunlock(&so->so_rcv, 1);
2099 #if EVEN_MORE_LOCKING_DEBUG
2100 if (socket_debug)
2101 printf("Waiting for socket data\n");
2102 #endif
2103
2104 error = sbwait(&so->so_rcv);
2105 #if EVEN_MORE_LOCKING_DEBUG
2106 if (socket_debug)
2107 printf("SORECEIVE - sbwait returned %d\n", error);
2108 #endif
2109 if (so->so_usecount < 1)
2110 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2111 "socket\n", so, so->so_usecount);
2112 if (error) {
2113 socket_unlock(so, 1);
2114 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2115 0, 0, 0, 0);
2116 return (error);
2117 }
2118 goto restart;
2119 }
2120 dontblock:
2121 #ifndef __APPLE__
2122 if (uio->uio_procp)
2123 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
2124 #else /* __APPLE__ */
2125 /*
2126 * 2207985
2127 * This should be uio->uio_procp; however, some callers of this
2128 * function use auto variables with stack garbage, and fail to
2129 * fill out the uio structure properly.
2130 */
2131 if (p)
2132 OSIncrementAtomic(&p->p_stats->p_ru.ru_msgrcv);
2133 #endif /* __APPLE__ */
2134 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2135 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2136 nextrecord = m->m_nextpkt;
2137 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2138 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2139 #if CONFIG_MACF_SOCKET_SUBSET
2140 /*
2141 * Call the MAC framework for policy checking if we're in
2142 * the user process context and the socket isn't connected.
2143 */
2144 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2145 struct mbuf *m0 = m;
2146 /*
2147 * Dequeue this record (temporarily) from the receive
2148 * list since we're about to drop the socket's lock
2149 * where a new record may arrive and be appended to
2150 * the list. Upon MAC policy failure, the record
2151 * will be freed. Otherwise, we'll add it back to
2152 * the head of the list. We cannot rely on SB_LOCK
2153 * because append operation uses the socket's lock.
2154 */
2155 do {
2156 m->m_nextpkt = NULL;
2157 sbfree(&so->so_rcv, m);
2158 m = m->m_next;
2159 } while (m != NULL);
2160 m = m0;
2161 so->so_rcv.sb_mb = nextrecord;
2162 SB_EMPTY_FIXUP(&so->so_rcv);
2163 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2164 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2165 socket_unlock(so, 0);
2166 if (mac_socket_check_received(proc_ucred(p), so,
2167 mtod(m, struct sockaddr *)) != 0) {
2168 /*
2169 * MAC policy failure; free this record and
2170 * process the next record (or block until
2171 * one is available). We have adjusted sb_cc
2172 * and sb_mbcnt above so there is no need to
2173 * call sbfree() again.
2174 */
2175 do {
2176 m = m_free(m);
2177 } while (m != NULL);
2178 /*
2179 * Clear SB_LOCK but don't unlock the socket.
2180 * Process the next record or wait for one.
2181 */
2182 socket_lock(so, 0);
2183 sbunlock(&so->so_rcv, 1);
2184 goto restart;
2185 }
2186 socket_lock(so, 0);
2187 /*
2188 * Re-adjust the socket receive list and re-enqueue
2189 * the record in front of any packets which may have
2190 * been appended while we dropped the lock.
2191 */
2192 for (m = m0; m->m_next != NULL; m = m->m_next)
2193 sballoc(&so->so_rcv, m);
2194 sballoc(&so->so_rcv, m);
2195 if (so->so_rcv.sb_mb == NULL) {
2196 so->so_rcv.sb_lastrecord = m0;
2197 so->so_rcv.sb_mbtail = m;
2198 }
2199 m = m0;
2200 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2201 so->so_rcv.sb_mb = m;
2202 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2203 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2204 }
2205 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2206 orig_resid = 0;
2207 if (psa) {
2208 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2209 mp0 == 0);
2210 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2211 error = EWOULDBLOCK;
2212 goto release;
2213 }
2214 }
2215 if (flags & MSG_PEEK) {
2216 m = m->m_next;
2217 } else {
2218 sbfree(&so->so_rcv, m);
2219 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2220 panic("soreceive: about to create invalid "
2221 "socketbuf");
2222 MFREE(m, so->so_rcv.sb_mb);
2223 m = so->so_rcv.sb_mb;
2224 if (m != NULL) {
2225 m->m_nextpkt = nextrecord;
2226 } else {
2227 so->so_rcv.sb_mb = nextrecord;
2228 SB_EMPTY_FIXUP(&so->so_rcv);
2229 }
2230 }
2231 }
2232
2233 /*
2234 * Process one or more MT_CONTROL mbufs present before any data mbufs
2235 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2236 * just copy the data; if !MSG_PEEK, we call into the protocol to
2237 * perform externalization.
2238 */
2239 if (m != NULL && m->m_type == MT_CONTROL) {
2240 struct mbuf *cm = NULL, *cmn;
2241 struct mbuf **cme = &cm;
2242 struct sockbuf *sb_rcv = &so->so_rcv;
2243
2244 /*
2245 * Externalizing the control messages would require us to
2246 * drop the socket's lock below. Once we re-acquire the
2247 * lock, the mbuf chain might change. In order to preserve
2248 * consistency, we unlink all control messages from the
2249 * first mbuf chain in one shot and link them separately
2250 * onto a different chain.
2251 */
2252 do {
2253 if (flags & MSG_PEEK) {
2254 if (controlp != NULL) {
2255 *controlp = m_copy(m, 0, m->m_len);
2256 controlp = &(*controlp)->m_next;
2257 }
2258 m = m->m_next;
2259 } else {
2260 m->m_nextpkt = NULL;
2261 sbfree(sb_rcv, m);
2262 sb_rcv->sb_mb = m->m_next;
2263 m->m_next = NULL;
2264 *cme = m;
2265 cme = &(*cme)->m_next;
2266 m = sb_rcv->sb_mb;
2267 }
2268 } while (m != NULL && m->m_type == MT_CONTROL);
2269
2270 if (!(flags & MSG_PEEK)) {
2271 if (sb_rcv->sb_mb != NULL) {
2272 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2273 } else {
2274 sb_rcv->sb_mb = nextrecord;
2275 SB_EMPTY_FIXUP(sb_rcv);
2276 }
2277 if (nextrecord == NULL)
2278 sb_rcv->sb_lastrecord = m;
2279 }
2280
2281 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2282 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2283
2284 while (cm != NULL) {
2285 int cmsg_type;
2286
2287 cmn = cm->m_next;
2288 cm->m_next = NULL;
2289 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2290
2291 /*
2292 * Call the protocol to externalize SCM_RIGHTS message
2293 * and return the modified message to the caller upon
2294 * success. Otherwise, all other control messages are
2295 * returned unmodified to the caller. Note that we
2296 * only get into this loop if MSG_PEEK is not set.
2297 */
2298 if (pr->pr_domain->dom_externalize != NULL &&
2299 cmsg_type == SCM_RIGHTS) {
2300 /*
2301 * Release socket lock: see 3903171. This
2302 * would also allow more records to be appended
2303 * to the socket buffer. We still have SB_LOCK
2304 * set on it, so we can be sure that the head
2305 * of the mbuf chain won't change.
2306 */
2307 socket_unlock(so, 0);
2308 error = (*pr->pr_domain->dom_externalize)(cm);
2309 socket_lock(so, 0);
2310 } else {
2311 error = 0;
2312 }
2313
2314 if (controlp != NULL && error == 0) {
2315 *controlp = cm;
2316 controlp = &(*controlp)->m_next;
2317 orig_resid = 0;
2318 } else {
2319 (void) m_free(cm);
2320 }
2321 cm = cmn;
2322 }
2323 orig_resid = 0;
2324 if (sb_rcv->sb_mb != NULL)
2325 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2326 else
2327 nextrecord = NULL;
2328 }
2329
2330 if (m != NULL) {
2331 if (!(flags & MSG_PEEK)) {
2332 /*
2333 * We get here because m points to an mbuf following
2334 * any MT_SONAME or MT_CONTROL mbufs which have been
2335 * processed above. In any case, m should be pointing
2336 * to the head of the mbuf chain, and the nextrecord
2337 * should be either NULL or equal to m->m_nextpkt.
2338 * See comments above about SB_LOCK.
2339 */
2340 if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2341 panic("soreceive: post-control !sync so=%p "
2342 "m=%p nextrecord=%p\n", so, m, nextrecord);
2343
2344 if (nextrecord == NULL)
2345 so->so_rcv.sb_lastrecord = m;
2346 }
2347 type = m->m_type;
2348 if (type == MT_OOBDATA)
2349 flags |= MSG_OOB;
2350 } else {
2351 if (!(flags & MSG_PEEK)) {
2352 so->so_rcv.sb_mb = nextrecord;
2353 SB_EMPTY_FIXUP(&so->so_rcv);
2354 }
2355 }
2356 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2357 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2358
2359 moff = 0;
2360 offset = 0;
2361
2362 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2363 can_delay = 1;
2364 else
2365 can_delay = 0;
2366
2367 need_event = 0;
2368
2369 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2370 if (m->m_type == MT_OOBDATA) {
2371 if (type != MT_OOBDATA)
2372 break;
2373 } else if (type == MT_OOBDATA) {
2374 break;
2375 }
2376 /*
2377 * Make sure to always set the MSG_OOB flag when getting
2378 * out-of-band data inline.
2379 */
2380 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2381 (so->so_options & SO_OOBINLINE) != 0 &&
2382 (so->so_state & SS_RCVATMARK) != 0) {
2383 flags |= MSG_OOB;
2384 }
2385 so->so_state &= ~SS_RCVATMARK;
2386 // LP64todo - fix this!
2387 len = uio_resid(uio) - delayed_copy_len;
2388 if (so->so_oobmark && len > so->so_oobmark - offset)
2389 len = so->so_oobmark - offset;
2390 if (len > m->m_len - moff)
2391 len = m->m_len - moff;
2392 /*
2393 * If mp is set, just pass back the mbufs.
2394 * Otherwise copy them out via the uio, then free.
2395 * The sockbuf must be consistent here (sb_mb points to
2396 * the current mbuf, m_nextpkt to the next record)
2397 * whenever we drop the socket lock; we must note any
2398 * additions to the sockbuf when we reacquire it.
2399 */
2400 if (mp == 0) {
2401 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2402 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2403 if (can_delay && len == m->m_len) {
2404 /*
2405 * Only delay the copy if we're consuming the
2406 * mbuf, we're NOT in MSG_PEEK mode, and we
2407 * have enough data to make it worthwhile
2408 * to drop and retake the lock; can_delay
2409 * reflects the latter two constraints.
2410 * moff should always be zero in these
2411 * cases.
2412 */
2413 delayed_copy_len += len;
2414 } else {
2415 if (delayed_copy_len) {
2416 error = sodelayed_copy(so, uio,
2417 &free_list, &delayed_copy_len);
2418
2419 if (error) {
2420 goto release;
2421 }
2422 /*
2423 * We can only get here if MSG_PEEK is not
2424 * set; therefore, m should point at the
2425 * head of the rcv queue. If it doesn't,
2426 * it means something changed drastically
2427 * while we were out from behind the lock
2428 * in sodelayed_copy, perhaps a RST on the
2429 * stream. In any event, the stream has
2430 * been interrupted; it's probably best
2431 * just to return whatever data we've
2432 * moved and let the caller sort it
2433 * out.
2434 */
2435 if (m != so->so_rcv.sb_mb) {
2436 break;
2437 }
2438 }
2439 socket_unlock(so, 0);
2440 error = uiomove(mtod(m, caddr_t) + moff,
2441 (int)len, uio);
2442 socket_lock(so, 0);
2443
2444 if (error)
2445 goto release;
2446 }
2447 } else {
2448 uio_setresid(uio, (uio_resid(uio) - len));
2449 }
2450 if (len == m->m_len - moff) {
2451 if (m->m_flags & M_EOR)
2452 flags |= MSG_EOR;
2453 if (flags & MSG_PEEK) {
2454 m = m->m_next;
2455 moff = 0;
2456 } else {
2457 nextrecord = m->m_nextpkt;
2458 sbfree(&so->so_rcv, m);
2459 m->m_nextpkt = NULL;
2460
2461 if (mp) {
2462 *mp = m;
2463 mp = &m->m_next;
2464 so->so_rcv.sb_mb = m = m->m_next;
2465 *mp = (struct mbuf *)0;
2466 } else {
2467 if (free_list == NULL)
2468 free_list = m;
2469 else
2470 ml->m_next = m;
2471 ml = m;
2472 so->so_rcv.sb_mb = m = m->m_next;
2473 ml->m_next = 0;
2474 }
2475 if (m != NULL) {
2476 m->m_nextpkt = nextrecord;
2477 if (nextrecord == NULL)
2478 so->so_rcv.sb_lastrecord = m;
2479 } else {
2480 so->so_rcv.sb_mb = nextrecord;
2481 SB_EMPTY_FIXUP(&so->so_rcv);
2482 }
2483 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2484 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2485 }
2486 } else {
2487 if (flags & MSG_PEEK) {
2488 moff += len;
2489 } else {
2490 if (mp)
2491 *mp = m_copym(m, 0, len, M_WAIT);
2492 m->m_data += len;
2493 m->m_len -= len;
2494 so->so_rcv.sb_cc -= len;
2495 }
2496 }
2497 if (so->so_oobmark) {
2498 if ((flags & MSG_PEEK) == 0) {
2499 so->so_oobmark -= len;
2500 if (so->so_oobmark == 0) {
2501 so->so_state |= SS_RCVATMARK;
2502 /*
2503 * delay posting the actual event until
2504 * after any delayed copy processing
2505 * has finished
2506 */
2507 need_event = 1;
2508 break;
2509 }
2510 } else {
2511 offset += len;
2512 if (offset == so->so_oobmark)
2513 break;
2514 }
2515 }
2516 if (flags & MSG_EOR)
2517 break;
2518 /*
2519 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2520 * (for non-atomic socket), we must not quit until
2521 * "uio->uio_resid == 0" or an error termination.
2522 * If a signal/timeout occurs, return with a short
2523 * count but without error. Keep sockbuf locked
2524 * against other readers.
2525 */
2526 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2527 (uio_resid(uio) - delayed_copy_len) > 0 &&
2528 !sosendallatonce(so) && !nextrecord) {
2529 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2530 goto release;
2531
2532 /*
2533 * Depending on the protocol (e.g. TCP), the following
2534 * might cause the socket lock to be dropped and later
2535 * be reacquired, and more data could have arrived and
2536 * have been appended to the receive socket buffer by
2537 * the time it returns. Therefore, we sleep in
2538 * sbwait() below only if the socket buffer is
2539 * empty, in order to avoid a false sleep.
2540 */
2541 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2542 (((struct inpcb *)so->so_pcb)->inp_state !=
2543 INPCB_STATE_DEAD))
2544 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2545
2546 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2547 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2548
2549 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2550 error = 0;
2551 goto release;
2552 }
2553 /*
2554 * We have to wait until after we get back from the
2555 * sbwait to do the copy, because we will drop the
2556 * lock if enough data has been delayed. By dropping
2557 * the lock we open up a window allowing the netisr
2558 * thread to process the incoming packets and change
2559 * the state of this socket. We issue the sbwait
2560 * because the socket is empty and we expect the
2561 * netisr thread to wake us up when more packets
2562 * arrive; if we allowed that processing to happen
2563 * first and then called sbwait, we could stall
2564 * forever with packets sitting in the socket if no
2565 * further packets arrive from the remote side.
2566 *
2567 * We want to copy before we've collected all the data
2568 * to satisfy this request, to allow the copy to
2569 * overlap the incoming packet processing on an MP system.
2570 */
2571 if (delayed_copy_len > sorecvmincopy &&
2572 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2573 error = sodelayed_copy(so, uio,
2574 &free_list, &delayed_copy_len);
2575
2576 if (error)
2577 goto release;
2578 }
2579 m = so->so_rcv.sb_mb;
2580 if (m) {
2581 nextrecord = m->m_nextpkt;
2582 }
2583 }
2584 }
2585 #ifdef MORE_LOCKING_DEBUG
2586 if (so->so_usecount <= 1)
2587 panic("soreceive: after big while so=%p ref=%d on socket\n",
2588 so, so->so_usecount);
2589 #endif
2590
2591 if (m && pr->pr_flags & PR_ATOMIC) {
2592 #ifdef __APPLE__
2593 if (so->so_options & SO_DONTTRUNC) {
2594 flags |= MSG_RCVMORE;
2595 } else {
2596 #endif
2597 flags |= MSG_TRUNC;
2598 if ((flags & MSG_PEEK) == 0)
2599 (void) sbdroprecord(&so->so_rcv);
2600 #ifdef __APPLE__
2601 }
2602 #endif
2603 }
2604
2605 /*
2606 * pru_rcvd below (for TCP) may cause more data to be received
2607 * if the socket lock is dropped prior to sending the ACK; some
2608 * legacy OpenTransport applications don't handle this well
2609 * (if it receives less data than requested while MSG_HAVEMORE
2610 * is set), and so we set the flag now based on what we know
2611 * prior to calling pru_rcvd.
2612 */
2613 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2614 flags |= MSG_HAVEMORE;
2615
2616 if ((flags & MSG_PEEK) == 0) {
2617 if (m == 0) {
2618 so->so_rcv.sb_mb = nextrecord;
2619 /*
2620 * First part is an inline SB_EMPTY_FIXUP(). Second
2621 * part makes sure sb_lastrecord is up-to-date if
2622 * there is still data in the socket buffer.
2623 */
2624 if (so->so_rcv.sb_mb == NULL) {
2625 so->so_rcv.sb_mbtail = NULL;
2626 so->so_rcv.sb_lastrecord = NULL;
2627 } else if (nextrecord->m_nextpkt == NULL) {
2628 so->so_rcv.sb_lastrecord = nextrecord;
2629 }
2630 }
2631 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2632 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2633 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2634 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2635 }
2636 #ifdef __APPLE__
2637 if (delayed_copy_len) {
2638 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2639
2640 if (error)
2641 goto release;
2642 }
2643 if (free_list) {
2644 m_freem_list((struct mbuf *)free_list);
2645 free_list = (struct mbuf *)0;
2646 }
2647 if (need_event)
2648 postevent(so, 0, EV_OOB);
2649 #endif
2650 if (orig_resid == uio_resid(uio) && orig_resid &&
2651 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2652 sbunlock(&so->so_rcv, 1);
2653 goto restart;
2654 }
2655
2656 if (flagsp)
2657 *flagsp |= flags;
2658 release:
2659 #ifdef MORE_LOCKING_DEBUG
2660 if (so->so_usecount <= 1)
2661 panic("soreceive: release so=%p ref=%d on socket\n",
2662 so, so->so_usecount);
2663 #endif
2664 if (delayed_copy_len) {
2665 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2666 }
2667 if (free_list) {
2668 m_freem_list((struct mbuf *)free_list);
2669 }
2670 sbunlock(&so->so_rcv, 0); /* will unlock socket */
2671
2672 // LP64todo - fix this!
2673 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2674 so->so_rcv.sb_cc, 0, error);
2675
2676 return (error);
2677 }
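
/*
 * Illustrative sketch (not compiled into the kernel): how the
 * soreceive() semantics above appear to a user process.  MSG_PEEK
 * takes the paths above that skip sbfree()/MFREE(), leaving the data
 * queued, while MSG_WAITALL loops in sbwait() until the request is
 * filled, an error occurs, or the peer closes.  The descriptor fd is
 * assumed to be a connected stream socket.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>

static void
peek_then_drain(int fd)
{
	char hdr[4], body[64];
	ssize_t n;

	/* Look at the first bytes without consuming them. */
	n = recv(fd, hdr, sizeof (hdr), MSG_PEEK);
	if (n == -1) {
		perror("recv(MSG_PEEK)");
		return;
	}
	/*
	 * Consume up to sizeof (body) bytes; even with MSG_WAITALL a
	 * short count is possible on a signal, timeout, or EOF.
	 */
	n = recv(fd, body, sizeof (body), MSG_WAITALL);
	printf("got %zd bytes\n", n);
}
#endif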
2678
2679 /*
2680 * Returns: 0 Success
2681 * uiomove:EFAULT
2682 */
2683 static int
2684 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2685 int *resid)
2686 {
2687 int error = 0;
2688 struct mbuf *m;
2689
2690 m = *free_list;
2691
2692 socket_unlock(so, 0);
2693
2694 while (m && error == 0) {
2695
2696 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2697
2698 m = m->m_next;
2699 }
2700 m_freem_list(*free_list);
2701
2702 *free_list = (struct mbuf *)NULL;
2703 *resid = 0;
2704
2705 socket_lock(so, 0);
2706
2707 return (error);
2708 }
2709
2710
2711 /*
2712 * Returns: 0 Success
2713 * EINVAL
2714 * ENOTCONN
2715 * <pru_shutdown>:EINVAL
2716 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2717 * <pru_shutdown>:ENOBUFS[TCP]
2718 * <pru_shutdown>:EMSGSIZE[TCP]
2719 * <pru_shutdown>:EHOSTUNREACH[TCP]
2720 * <pru_shutdown>:ENETUNREACH[TCP]
2721 * <pru_shutdown>:ENETDOWN[TCP]
2722 * <pru_shutdown>:ENOMEM[TCP]
2723 * <pru_shutdown>:EACCES[TCP]
2726 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2727 * <pru_shutdown>:??? [other protocol families]
2728 */
2729 int
2730 soshutdown(struct socket *so, int how)
2731 {
2732 int error;
2733
2734 switch (how) {
2735 case SHUT_RD:
2736 case SHUT_WR:
2737 case SHUT_RDWR:
2738 socket_lock(so, 1);
2739 if ((so->so_state &
2740 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2741 error = ENOTCONN;
2742 } else {
2743 error = soshutdownlock(so, how);
2744 }
2745 socket_unlock(so, 1);
2746 break;
2747 default:
2748 error = EINVAL;
2749 break;
2750 }
2751
2752 return (error);
2753 }
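
/*
 * Illustrative sketch (not compiled): soshutdown() rejects any `how'
 * other than SHUT_RD/SHUT_WR/SHUT_RDWR with EINVAL, and returns
 * ENOTCONN unless the socket is connected, connecting, or
 * disconnecting.  A typical user-level half-close:
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void
half_close(int fd)
{
	/* Send EOF to the peer but keep reading until it closes. */
	if (shutdown(fd, SHUT_WR) == -1)
		perror("shutdown");
}
#endif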
2754
2755 int
2756 soshutdownlock(struct socket *so, int how)
2757 {
2758 struct protosw *pr = so->so_proto;
2759 int error = 0;
2760
2761 sflt_notify(so, sock_evt_shutdown, &how);
2762
2763 if (how != SHUT_WR) {
2764 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2765 /* read already shut down */
2766 error = ENOTCONN;
2767 goto done;
2768 }
2769 sorflush(so);
2770 postevent(so, 0, EV_RCLOSED);
2771 }
2772 if (how != SHUT_RD) {
2773 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2774 /* write already shut down */
2775 error = ENOTCONN;
2776 goto done;
2777 }
2778 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2779 postevent(so, 0, EV_WCLOSED);
2780 }
2781 done:
2782 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2783 return (error);
2784 }
2785
2786 void
2787 sorflush(struct socket *so)
2788 {
2789 register struct sockbuf *sb = &so->so_rcv;
2790 register struct protosw *pr = so->so_proto;
2791 struct sockbuf asb;
2792
2793 #ifdef MORE_LOCKING_DEBUG
2794 lck_mtx_t *mutex_held;
2795
2796 if (so->so_proto->pr_getlock != NULL)
2797 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2798 else
2799 mutex_held = so->so_proto->pr_domain->dom_mtx;
2800 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2801 #endif
2802
2803 sflt_notify(so, sock_evt_flush_read, NULL);
2804
2805 sb->sb_flags |= SB_NOINTR;
2806 (void) sblock(sb, M_WAIT);
2807 socantrcvmore(so);
2808 sbunlock(sb, 1);
2809 #ifdef __APPLE__
2810 selthreadclear(&sb->sb_sel);
2811 #endif
2812 asb = *sb;
2813 bzero((caddr_t)sb, sizeof (*sb));
2814 sb->sb_so = so; /* reestablish link to socket */
2815 if (asb.sb_flags & SB_KNOTE) {
2816 sb->sb_sel.si_note = asb.sb_sel.si_note;
2817 sb->sb_flags = SB_KNOTE;
2818 }
2819 if (asb.sb_flags & SB_DROP)
2820 sb->sb_flags |= SB_DROP;
2821 if (asb.sb_flags & SB_UNIX)
2822 sb->sb_flags |= SB_UNIX;
2823 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2824 boolean_t unp = (pr->pr_domain->dom_dispose == unp_dispose);
2825 /*
2826 * Currently AF_UNIX domain uses a global domain mutex;
2827 * unp_dispose() may end up calling soclose() on another
2828 * AF_UNIX socket and therefore the lock must not be held
2829 * across the call.
2830 */
2831 if (unp)
2832 socket_unlock(so, 0);
2833 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2834 if (unp)
2835 socket_lock(so, 0);
2836 }
2837 sbrelease(&asb);
2838 }
2839
2840 /*
2841 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2842 * an additional variant to handle the case where the option value needs
2843 * to be some kind of integer, but not a specific size.
2844 * In addition to their use here, these functions are also called by the
2845 * protocol-level pr_ctloutput() routines.
2846 *
2847 * Returns: 0 Success
2848 * EINVAL
2849 * copyin:EFAULT
2850 */
2851 int
2852 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2853 {
2854 size_t valsize;
2855
2856 /*
2857 * If the user gives us more than we wanted, we ignore it,
2858 * but if we don't get the minimum length the caller
2859 * wants, we return EINVAL. On success, sopt->sopt_valsize
2860 * is set to however much we actually retrieved.
2861 */
2862 if ((valsize = sopt->sopt_valsize) < minlen)
2863 return (EINVAL);
2864 if (valsize > len)
2865 sopt->sopt_valsize = valsize = len;
2866
2867 if (sopt->sopt_p != 0)
2868 return (copyin(sopt->sopt_val, buf, valsize));
2869
2870 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2871 return (0);
2872 }
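
/*
 * Illustrative sketch: how a protocol's pr_ctloutput() set side
 * typically uses sooptcopyin() for a fixed-size integer option.  The
 * option handling shown is hypothetical; only the sooptcopyin()
 * calling convention is taken from above.
 */
#if 0
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	/* Require at least sizeof (int); any excess is ignored. */
	error = sooptcopyin(sopt, &optval, sizeof (optval),
	    sizeof (optval));
	if (error != 0)
		return (error);
	/* ... apply optval to the protocol control block ... */
	return (0);
}
#endif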
2873
2874 /*
2875 * sooptcopyin_timeval
2876 * Copy in a timeval value into tv_p, and take into account whether the
2877 * the calling process is 64-bit or 32-bit. Moved the sanity checking
2878 * code here so that we can verify the 64-bit tv_sec value before we lose
2879 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
2880 */
2881 static int
2882 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2883 {
2884 int error;
2885
2886 if (proc_is64bit(sopt->sopt_p)) {
2887 struct timeval64 tv64;
2888
2889 if (sopt->sopt_valsize < sizeof(tv64)) {
2890 return (EINVAL);
2891 }
2892 sopt->sopt_valsize = sizeof(tv64);
2893 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2894 if (error != 0) {
2895 return (error);
2896 }
2897 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2898 || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2899 return (EDOM);
2900 }
2901 tv_p->tv_sec = tv64.tv_sec;
2902 tv_p->tv_usec = tv64.tv_usec;
2903 } else {
2904 if (sopt->sopt_valsize < sizeof(*tv_p)) {
2905 return (EINVAL);
2906 }
2907 sopt->sopt_valsize = sizeof(*tv_p);
2908 if (sopt->sopt_p != 0) {
2909 error = copyin(sopt->sopt_val, tv_p, sizeof(*tv_p));
2910 if (error != 0) {
2911 return (error);
2912 }
2913 } else {
2914 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), tv_p,
2915 sizeof(*tv_p));
2916 }
2917 if (tv_p->tv_sec < 0 || tv_p->tv_sec > LONG_MAX
2918 || tv_p->tv_usec < 0 || tv_p->tv_usec >= 1000000) {
2919 return (EDOM);
2920 }
2921 }
2922 return (0);
2923 }
2924
2925 /*
2926 * Returns: 0 Success
2927 * EINVAL
2928 * ENOPROTOOPT
2929 * ENOBUFS
2930 * EDOM
2931 * sooptcopyin:EINVAL
2932 * sooptcopyin:EFAULT
2933 * sooptcopyin_timeval:EINVAL
2934 * sooptcopyin_timeval:EFAULT
2935 * sooptcopyin_timeval:EDOM
2936 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2937 * <pr_ctloutput>:???
2938 * sflt_attach_private:??? [whatever a filter author chooses]
2939 * <sf_setoption>:??? [whatever a filter author chooses]
2940 *
2941 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
2942 * all <sf_setoption> returns depend on what the filter author
2943 * causes their filter to return.
2944 */
2945 int
2946 sosetopt(struct socket *so, struct sockopt *sopt)
2947 {
2948 int error, optval;
2949 struct linger l;
2950 struct timeval tv;
2951 struct socket_filter_entry *filter;
2952 int filtered = 0;
2953 #if CONFIG_MACF_SOCKET
2954 struct mac extmac;
2955 #endif /* MAC_SOCKET */
2956
2957 socket_lock(so, 1);
2958 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2959 == (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
2960 /* the socket has been shutdown, no more sockopt's */
2961 error = EINVAL;
2962 goto bad;
2963 }
2964
2965 if (sopt->sopt_dir != SOPT_SET) {
2966 sopt->sopt_dir = SOPT_SET;
2967 }
2968
2969 error = 0;
2970 for (filter = so->so_filt; filter && (error == 0);
2971 filter = filter->sfe_next_onsocket) {
2972 if (filter->sfe_filter->sf_filter.sf_setoption) {
2973 if (filtered == 0) {
2974 filtered = 1;
2975 sflt_use(so);
2976 socket_unlock(so, 0);
2977 }
2978 error = filter->sfe_filter->sf_filter.
2979 sf_setoption(filter->sfe_cookie, so, sopt);
2980 }
2981 }
2982
2983 if (filtered != 0) {
2984 socket_lock(so, 0);
2985 sflt_unuse(so);
2986
2987 if (error) {
2988 if (error == EJUSTRETURN)
2989 error = 0;
2990 goto bad;
2991 }
2992 }
2993
2994 error = 0;
2995 if (sopt->sopt_level != SOL_SOCKET) {
2996 if (so->so_proto && so->so_proto->pr_ctloutput) {
2997 error = (*so->so_proto->pr_ctloutput)(so, sopt);
2998 socket_unlock(so, 1);
2999 return (error);
3000 }
3001 error = ENOPROTOOPT;
3002 } else {
3003 switch (sopt->sopt_name) {
3004 case SO_LINGER:
3005 case SO_LINGER_SEC:
3006 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3007 if (error)
3008 goto bad;
3009
3010 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3011 l.l_linger : l.l_linger * hz;
3012 if (l.l_onoff)
3013 so->so_options |= SO_LINGER;
3014 else
3015 so->so_options &= ~SO_LINGER;
3016 break;
3017
3018 case SO_DEBUG:
3019 case SO_KEEPALIVE:
3020 case SO_DONTROUTE:
3021 case SO_USELOOPBACK:
3022 case SO_BROADCAST:
3023 case SO_REUSEADDR:
3024 case SO_REUSEPORT:
3025 case SO_OOBINLINE:
3026 case SO_TIMESTAMP:
3027 #ifdef __APPLE__
3028 case SO_DONTTRUNC:
3029 case SO_WANTMORE:
3030 case SO_WANTOOBFLAG:
3031 #endif
3032 error = sooptcopyin(sopt, &optval, sizeof (optval),
3033 sizeof (optval));
3034 if (error)
3035 goto bad;
3036 if (optval)
3037 so->so_options |= sopt->sopt_name;
3038 else
3039 so->so_options &= ~sopt->sopt_name;
3040 break;
3041
3042 case SO_SNDBUF:
3043 case SO_RCVBUF:
3044 case SO_SNDLOWAT:
3045 case SO_RCVLOWAT:
3046 error = sooptcopyin(sopt, &optval, sizeof (optval),
3047 sizeof (optval));
3048 if (error)
3049 goto bad;
3050
3051 /*
3052 * Values < 1 make no sense for any of these
3053 * options, so disallow them.
3054 */
3055 if (optval < 1) {
3056 error = EINVAL;
3057 goto bad;
3058 }
3059
3060 switch (sopt->sopt_name) {
3061 case SO_SNDBUF:
3062 case SO_RCVBUF:
3063 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
3064 &so->so_snd : &so->so_rcv,
3065 (u_long) optval) == 0) {
3066 error = ENOBUFS;
3067 goto bad;
3068 }
3069 if (sopt->sopt_name == SO_SNDBUF)
3070 so->so_snd.sb_flags |= SB_USRSIZE;
3071 else
3072 so->so_rcv.sb_flags |= SB_USRSIZE;
3073 break;
3074
3075 /*
3076 * Make sure the low-water is never greater than
3077 * the high-water.
3078 */
3079 case SO_SNDLOWAT:
3080 so->so_snd.sb_lowat =
3081 (optval > so->so_snd.sb_hiwat) ?
3082 so->so_snd.sb_hiwat : optval;
3083 break;
3084 case SO_RCVLOWAT:
3085 so->so_rcv.sb_lowat =
3086 (optval > so->so_rcv.sb_hiwat) ?
3087 so->so_rcv.sb_hiwat : optval;
3088 break;
3089 }
3090 break;
3091
3092 case SO_SNDTIMEO:
3093 case SO_RCVTIMEO:
3094 error = sooptcopyin_timeval(sopt, &tv);
3095 if (error)
3096 goto bad;
3097
3098 switch (sopt->sopt_name) {
3099 case SO_SNDTIMEO:
3100 so->so_snd.sb_timeo = tv;
3101 break;
3102 case SO_RCVTIMEO:
3103 so->so_rcv.sb_timeo = tv;
3104 break;
3105 }
3106 break;
3107
3108 case SO_NKE:
3109 {
3110 struct so_nke nke;
3111
3112 error = sooptcopyin(sopt, &nke, sizeof (nke),
3113 sizeof (nke));
3114 if (error)
3115 goto bad;
3116
3117 error = sflt_attach_private(so, NULL,
3118 nke.nke_handle, 1);
3119 break;
3120 }
3121
3122 case SO_NOSIGPIPE:
3123 error = sooptcopyin(sopt, &optval, sizeof (optval),
3124 sizeof (optval));
3125 if (error)
3126 goto bad;
3127 if (optval)
3128 so->so_flags |= SOF_NOSIGPIPE;
3129 else
3130 so->so_flags &= ~SOF_NOSIGPIPE;
3131
3132 break;
3133
3134 case SO_NOADDRERR:
3135 error = sooptcopyin(sopt, &optval, sizeof (optval),
3136 sizeof (optval));
3137 if (error)
3138 goto bad;
3139 if (optval)
3140 so->so_flags |= SOF_NOADDRAVAIL;
3141 else
3142 so->so_flags &= ~SOF_NOADDRAVAIL;
3143
3144 break;
3145
3146 case SO_REUSESHAREUID:
3147 error = sooptcopyin(sopt, &optval, sizeof (optval),
3148 sizeof (optval));
3149 if (error)
3150 goto bad;
3151 if (optval)
3152 so->so_flags |= SOF_REUSESHAREUID;
3153 else
3154 so->so_flags &= ~SOF_REUSESHAREUID;
3155 break;
3156 #ifdef __APPLE_API_PRIVATE
3157 case SO_NOTIFYCONFLICT:
3158 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3159 error = EPERM;
3160 goto bad;
3161 }
3162 error = sooptcopyin(sopt, &optval, sizeof (optval),
3163 sizeof (optval));
3164 if (error)
3165 goto bad;
3166 if (optval)
3167 so->so_flags |= SOF_NOTIFYCONFLICT;
3168 else
3169 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3170 break;
3171 #endif
3172 case SO_RESTRICTIONS:
3173 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3174 error = EPERM;
3175 goto bad;
3176 }
3177 error = sooptcopyin(sopt, &optval, sizeof (optval),
3178 sizeof (optval));
3179 if (error)
3180 goto bad;
3181 so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3182 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3183 break;
3184
3185 case SO_LABEL:
3186 #if CONFIG_MACF_SOCKET
3187 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3188 sizeof (extmac))) != 0)
3189 goto bad;
3190
3191 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3192 so, &extmac);
3193 #else
3194 error = EOPNOTSUPP;
3195 #endif /* MAC_SOCKET */
3196 break;
3197
3198 default:
3199 error = ENOPROTOOPT;
3200 break;
3201 }
3202 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3203 (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3204 }
3205 }
3206 bad:
3207 socket_unlock(so, 1);
3208 return (error);
3209 }
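
/*
 * Illustrative sketch (not compiled): user-level counterparts of a few
 * of the options handled above.  Note that SO_LINGER_SEC is converted
 * from seconds to clock ticks (l_linger * hz above), and that
 * SO_SNDTIMEO/SO_RCVTIMEO go through sooptcopyin_timeval(), which
 * rejects out-of-range values with EDOM.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
tune_socket(int fd)
{
	struct linger l = { 1, 5 };	/* linger up to 5 units on close */
	struct timeval tv = { 2, 0 };	/* 2-second receive timeout */
	int on = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof (l)) == -1)
		return (-1);
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1)
		return (-1);
	/* Apple extension: suppress SIGPIPE on writes to a dead peer. */
	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof (on)) == -1)
		return (-1);
	return (0);
}
#endif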
3210
3211 /* Helper routines for getsockopt */
3212 int
3213 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3214 {
3215 int error;
3216 size_t valsize;
3217
3218 error = 0;
3219
3220 /*
3221 * Documented get behavior is that we always return a value,
3222 * possibly truncated to fit in the user's buffer.
3223 * Traditional behavior is that we always tell the user
3224 * precisely how much we copied, rather than something useful
3225 * like the total amount we had available for her.
3226 * Note that this interface is not idempotent; the entire answer must
3227 * be generated ahead of time.
3228 */
3229 valsize = min(len, sopt->sopt_valsize);
3230 sopt->sopt_valsize = valsize;
3231 if (sopt->sopt_val != USER_ADDR_NULL) {
3232 if (sopt->sopt_p != 0)
3233 error = copyout(buf, sopt->sopt_val, valsize);
3234 else
3235 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3236 }
3237 return (error);
3238 }
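
/*
 * Illustrative sketch: the get-side counterpart of the set-side sketch
 * after sooptcopyin() above.  sooptcopyout() truncates the value to
 * the user's buffer and records the amount copied in sopt_valsize.
 * The option value shown is hypothetical.
 */
#if 0
static int
example_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval = 0;		/* ... fetched from the protocol pcb ... */

	return (sooptcopyout(sopt, &optval, sizeof (optval)));
}
#endif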
3239
3240 static int
3241 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3242 {
3243 int error;
3244 size_t len;
3245 struct timeval64 tv64;
3246 const void * val;
3247 size_t valsize;
3248
3249 error = 0;
3250 if (proc_is64bit(sopt->sopt_p)) {
3251 len = sizeof(struct timeval64);
3252 tv64.tv_sec = tv_p->tv_sec;
3253 tv64.tv_usec = tv_p->tv_usec;
3254 val = &tv64;
3255 } else {
3256 len = sizeof(struct timeval);
3257 val = tv_p;
3258 }
3259 valsize = min(len, sopt->sopt_valsize);
3260 sopt->sopt_valsize = valsize;
3261 if (sopt->sopt_val != USER_ADDR_NULL) {
3262 if (sopt->sopt_p != 0)
3263 error = copyout(val, sopt->sopt_val, valsize);
3264 else
3265 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3266 }
3267 return (error);
3268 }
3269
3270 /*
3271 * Return: 0 Success
3272 * ENOPROTOOPT
3273 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3274 * <pr_ctloutput>:???
3275 * <sf_getoption>:???
3276 */
3277 int
3278 sogetopt(struct socket *so, struct sockopt *sopt)
3279 {
3280 int error, optval;
3281 struct linger l;
3282 struct timeval tv;
3283 struct socket_filter_entry *filter;
3284 int filtered = 0;
3285 #if CONFIG_MACF_SOCKET
3286 struct mac extmac;
3287 #endif /* MAC_SOCKET */
3288
3289 if (sopt->sopt_dir != SOPT_GET) {
3290 sopt->sopt_dir = SOPT_GET;
3291 }
3292
3293 socket_lock(so, 1);
3294
3295 error = 0;
3296 for (filter = so->so_filt; filter && (error == 0);
3297 filter = filter->sfe_next_onsocket) {
3298 if (filter->sfe_filter->sf_filter.sf_getoption) {
3299 if (filtered == 0) {
3300 filtered = 1;
3301 sflt_use(so);
3302 socket_unlock(so, 0);
3303 }
3304 error = filter->sfe_filter->sf_filter.
3305 sf_getoption(filter->sfe_cookie, so, sopt);
3306 }
3307 }
3308 if (filtered != 0) {
3309 socket_lock(so, 0);
3310 sflt_unuse(so);
3311
3312 if (error) {
3313 if (error == EJUSTRETURN)
3314 error = 0;
3315 socket_unlock(so, 1);
3316 return (error);
3317 }
3318 }
3319
3320 error = 0;
3321 if (sopt->sopt_level != SOL_SOCKET) {
3322 if (so->so_proto && so->so_proto->pr_ctloutput) {
3323 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3324 socket_unlock(so, 1);
3325 return (error);
3326 } else {
3327 socket_unlock(so, 1);
3328 return (ENOPROTOOPT);
3329 }
3330 } else {
3331 switch (sopt->sopt_name) {
3332 case SO_LINGER:
3333 case SO_LINGER_SEC:
3334 l.l_onoff = so->so_options & SO_LINGER;
3335 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3336 so->so_linger : so->so_linger / hz;
3337 error = sooptcopyout(sopt, &l, sizeof (l));
3338 break;
3339
3340 case SO_USELOOPBACK:
3341 case SO_DONTROUTE:
3342 case SO_DEBUG:
3343 case SO_KEEPALIVE:
3344 case SO_REUSEADDR:
3345 case SO_REUSEPORT:
3346 case SO_BROADCAST:
3347 case SO_OOBINLINE:
3348 case SO_TIMESTAMP:
3349 #ifdef __APPLE__
3350 case SO_DONTTRUNC:
3351 case SO_WANTMORE:
3352 case SO_WANTOOBFLAG:
3353 #endif
3354 optval = so->so_options & sopt->sopt_name;
3355 integer:
3356 error = sooptcopyout(sopt, &optval, sizeof (optval));
3357 break;
3358
3359 case SO_TYPE:
3360 optval = so->so_type;
3361 goto integer;
3362
3363 #ifdef __APPLE__
3364 case SO_NREAD:
3365 if (so->so_proto->pr_flags & PR_ATOMIC) {
3366 int pkt_total;
3367 struct mbuf *m1;
3368
3369 pkt_total = 0;
3370 m1 = so->so_rcv.sb_mb;
3371 while (m1) {
3372 if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3373 m1->m_type == MT_OOBDATA)
3374 pkt_total += m1->m_len;
3375 m1 = m1->m_next;
3376 }
3377 optval = pkt_total;
3378 } else {
3379 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3380 }
3381 goto integer;
3382
3383 case SO_NWRITE:
3384 optval = so->so_snd.sb_cc;
3385 goto integer;
3386 #endif
3387 case SO_ERROR:
3388 optval = so->so_error;
3389 so->so_error = 0;
3390 goto integer;
3391
3392 case SO_SNDBUF:
3393 optval = so->so_snd.sb_hiwat;
3394 goto integer;
3395
3396 case SO_RCVBUF:
3397 optval = so->so_rcv.sb_hiwat;
3398 goto integer;
3399
3400 case SO_SNDLOWAT:
3401 optval = so->so_snd.sb_lowat;
3402 goto integer;
3403
3404 case SO_RCVLOWAT:
3405 optval = so->so_rcv.sb_lowat;
3406 goto integer;
3407
3408 case SO_SNDTIMEO:
3409 case SO_RCVTIMEO:
3410 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3411 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3412
3413 error = sooptcopyout_timeval(sopt, &tv);
3414 break;
3415
3416 case SO_NOSIGPIPE:
3417 optval = (so->so_flags & SOF_NOSIGPIPE);
3418 goto integer;
3419
3420 case SO_NOADDRERR:
3421 optval = (so->so_flags & SOF_NOADDRAVAIL);
3422 goto integer;
3423
3424 case SO_REUSESHAREUID:
3425 optval = (so->so_flags & SOF_REUSESHAREUID);
3426 goto integer;
3427
3428 #ifdef __APPLE_API_PRIVATE
3429 case SO_NOTIFYCONFLICT:
3430 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3431 goto integer;
3432 #endif
3433 case SO_RESTRICTIONS:
3434 optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3435 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3436 goto integer;
3437
3438 case SO_LABEL:
3439 #if CONFIG_MACF_SOCKET
3440 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3441 sizeof (extmac))) != 0 ||
3442 (error = mac_socket_label_get(proc_ucred(
3443 sopt->sopt_p), so, &extmac)) != 0)
3444 break;
3445
3446 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3447 #else
3448 error = EOPNOTSUPP;
3449 #endif /* MAC_SOCKET */
3450 break;
3451
3452 case SO_PEERLABEL:
3453 #if CONFIG_MACF_SOCKET
3454 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3455 sizeof (extmac))) != 0 ||
3456 (error = mac_socketpeer_label_get(proc_ucred(
3457 sopt->sopt_p), so, &extmac)) != 0)
3458 break;
3459
3460 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3461 #else
3462 error = EOPNOTSUPP;
3463 #endif /* MAC_SOCKET */
3464 break;
3465
3466 default:
3467 error = ENOPROTOOPT;
3468 break;
3469 }
3470 socket_unlock(so, 1);
3471 return (error);
3472 }
3473 }
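
/*
 * Illustrative sketch (not compiled): SO_NREAD above is an Apple
 * extension; for PR_ATOMIC (datagram) protocols it reports the size
 * of the first queued record, and for stream protocols the total
 * bytes queued for reading.
 */
#if 0
#include <sys/socket.h>

static int
bytes_available(int fd)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return (-1);
	return (nread);
}
#endif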
3474
3475 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
3476 int
3477 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3478 {
3479 struct mbuf *m, *m_prev;
3480 int sopt_size = sopt->sopt_valsize;
3481
3482 if (sopt_size > MAX_SOOPTGETM_SIZE)
3483 return (EMSGSIZE);
3484
3485 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
3486 if (m == 0)
3487 return (ENOBUFS);
3488 if (sopt_size > MLEN) {
3489 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
3490 if ((m->m_flags & M_EXT) == 0) {
3491 m_free(m);
3492 return (ENOBUFS);
3493 }
3494 m->m_len = min(MCLBYTES, sopt_size);
3495 } else {
3496 m->m_len = min(MLEN, sopt_size);
3497 }
3498 sopt_size -= m->m_len;
3499 *mp = m;
3500 m_prev = m;
3501
3502 while (sopt_size) {
3503 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
3504 if (m == 0) {
3505 m_freem(*mp);
3506 return (ENOBUFS);
3507 }
3508 if (sopt_size > MLEN) {
3509 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
3510 if ((m->m_flags & M_EXT) == 0) {
3511 m_freem(*mp);
3512 return (ENOBUFS);
3513 }
3514 m->m_len = min(MCLBYTES, sopt_size);
3515 } else {
3516 m->m_len = min(MLEN, sopt_size);
3517 }
3518 sopt_size -= m->m_len;
3519 m_prev->m_next = m;
3520 m_prev = m;
3521 }
3522 return (0);
3523 }
3524
3525 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
3526 int
3527 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3528 {
3529 struct mbuf *m0 = m;
3530
3531 if (sopt->sopt_val == USER_ADDR_NULL)
3532 return (0);
3533 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3534 if (sopt->sopt_p != NULL) {
3535 int error;
3536
3537 error = copyin(sopt->sopt_val, mtod(m, char *),
3538 m->m_len);
3539 if (error != 0) {
3540 m_freem(m0);
3541 return (error);
3542 }
3543 } else {
3544 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3545 mtod(m, char *), m->m_len);
3546 }
3547 sopt->sopt_valsize -= m->m_len;
3548 sopt->sopt_val += m->m_len;
3549 m = m->m_next;
3550 }
3551 if (m != NULL) /* enough should be allocated at ip6_sooptmcopyin() */
3552 panic("soopt_mcopyin");
3553 return (0);
3554 }
3555
3556 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
3557 int
3558 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3559 {
3560 struct mbuf *m0 = m;
3561 size_t valsize = 0;
3562
3563 if (sopt->sopt_val == USER_ADDR_NULL)
3564 return (0);
3565 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3566 if (sopt->sopt_p != NULL) {
3567 int error;
3568
3569 error = copyout(mtod(m, char *), sopt->sopt_val,
3570 m->m_len);
3571 if (error != 0) {
3572 m_freem(m0);
3573 return (error);
3574 }
3575 } else {
3576 bcopy(mtod(m, char *),
3577 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3578 }
3579 sopt->sopt_valsize -= m->m_len;
3580 sopt->sopt_val += m->m_len;
3581 valsize += m->m_len;
3582 m = m->m_next;
3583 }
3584 if (m != NULL) {
3585 /* enough soopt buffer should be given from user-land */
3586 m_freem(m0);
3587 return (EINVAL);
3588 }
3589 sopt->sopt_valsize = valsize;
3590 return (0);
3591 }
3592
3593 void
3594 sohasoutofband(struct socket *so)
3595 {
3596
3597 if (so->so_pgid < 0)
3598 gsignal(-so->so_pgid, SIGURG);
3599 else if (so->so_pgid > 0)
3600 proc_signal(so->so_pgid, SIGURG);
3601 selwakeup(&so->so_rcv.sb_sel);
3602 }
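
/*
 * Illustrative sketch (not compiled): sohasoutofband() signals the
 * owner recorded in so_pgid, so a process must claim ownership of the
 * descriptor before it can receive SIGURG for out-of-band data.
 */
#if 0
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_urg(int sig)
{
	/* The OOB byte can now be fetched with recv(..., MSG_OOB). */
	(void)sig;
}

static void
claim_urgent(int fd)
{
	signal(SIGURG, on_urg);
	/* Route SIGURG for this socket to the calling process. */
	(void)fcntl(fd, F_SETOWN, getpid());
}
#endif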
3603
3604 int
3605 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3606 {
3607 struct proc *p = current_proc();
3608 int revents = 0;
3609
3610 socket_lock(so, 1);
3611
3612 if (events & (POLLIN | POLLRDNORM))
3613 if (soreadable(so))
3614 revents |= events & (POLLIN | POLLRDNORM);
3615
3616 if (events & (POLLOUT | POLLWRNORM))
3617 if (sowriteable(so))
3618 revents |= events & (POLLOUT | POLLWRNORM);
3619
3620 if (events & (POLLPRI | POLLRDBAND))
3621 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3622 revents |= events & (POLLPRI | POLLRDBAND);
3623
3624 if (revents == 0) {
3625 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3626 /*
3627 * Darwin sets the flag first,
3628 * BSD calls selrecord first
3629 */
3630 so->so_rcv.sb_flags |= SB_SEL;
3631 selrecord(p, &so->so_rcv.sb_sel, wql);
3632 }
3633
3634 if (events & (POLLOUT | POLLWRNORM)) {
3635 /*
3636 * Darwin sets the flag first,
3637 * BSD calls selrecord first
3638 */
3639 so->so_snd.sb_flags |= SB_SEL;
3640 selrecord(p, &so->so_snd.sb_sel, wql);
3641 }
3642 }
3643
3644 socket_unlock(so, 1);
3645 return (revents);
3646 }
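
/*
 * Illustrative sketch (not compiled): the POLLPRI/POLLRDBAND case
 * above fires when the socket has, or is at, an out-of-band mark,
 * which maps to a user-level poll() such as:
 */
#if 0
#include <poll.h>

static int
wait_readable_or_oob(int fd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLPRI;
	/* Returns > 0 when pfd.revents has POLLIN and/or POLLPRI set. */
	return (poll(&pfd, 1, timeout_ms));
}
#endif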
3647
3648 int
3649 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3650 __unused struct proc *p)
3651 {
3652 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3653 struct sockbuf *sb;
3654
3655 socket_lock(so, 1);
3656
3657 #if CONFIG_MACF_SOCKET
3658 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3659 socket_unlock(so, 1);
3660 return (1);
3661 }
3662 #endif /* MAC_SOCKET */
3663
3664 switch (kn->kn_filter) {
3665 case EVFILT_READ:
3666 if (so->so_options & SO_ACCEPTCONN)
3667 kn->kn_fop = &solisten_filtops;
3668 else
3669 kn->kn_fop = &soread_filtops;
3670 sb = &so->so_rcv;
3671 break;
3672 case EVFILT_WRITE:
3673 kn->kn_fop = &sowrite_filtops;
3674 sb = &so->so_snd;
3675 break;
3676 default:
3677 socket_unlock(so, 1);
3678 return (1);
3679 }
3680
3681 if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
3682 sb->sb_flags |= SB_KNOTE;
3683 socket_unlock(so, 1);
3684 return (0);
3685 }
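
/*
 * Illustrative sketch (not compiled): attaching an EVFILT_READ knote
 * as implemented by the filters below.  On a listening socket kn_data
 * becomes the completed-connection count (filt_solisten); otherwise it
 * is the readable byte count, and NOTE_LOWAT overrides the sockbuf
 * low-water mark (filt_soread).
 */
#if 0
#include <sys/event.h>

static int
wait_for_bytes(int kq, int fd, int lowat)
{
	struct kevent kev;

	/* Fire only once at least `lowat' bytes are readable. */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif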
3686
3687 static void
3688 filt_sordetach(struct knote *kn)
3689 {
3690 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3691
3692 socket_lock(so, 1);
3693 if (so->so_rcv.sb_flags & SB_KNOTE)
3694 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3695 so->so_rcv.sb_flags &= ~SB_KNOTE;
3696 socket_unlock(so, 1);
3697 }
3698
3699 /*ARGSUSED*/
3700 static int
3701 filt_soread(struct knote *kn, long hint)
3702 {
3703 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3704
3705 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3706 socket_lock(so, 1);
3707
3708 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3709
3710 if (so->so_oobmark) {
3711 if (kn->kn_flags & EV_OOBAND) {
3712 kn->kn_data -= so->so_oobmark;
3713 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3714 socket_unlock(so, 1);
3715 return (1);
3716 }
3717 kn->kn_data = so->so_oobmark;
3718 kn->kn_flags |= EV_OOBAND;
3719 } else {
3720 if (so->so_state & SS_CANTRCVMORE) {
3721 kn->kn_flags |= EV_EOF;
3722 kn->kn_fflags = so->so_error;
3723 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3724 socket_unlock(so, 1);
3725 return (1);
3726 }
3727 }
3728
3729 if (so->so_state & SS_RCVATMARK) {
3730 if (kn->kn_flags & EV_OOBAND) {
3731 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3732 socket_unlock(so, 1);
3733 return (1);
3734 }
3735 kn->kn_flags |= EV_OOBAND;
3736 } else if (kn->kn_flags & EV_OOBAND) {
3737 kn->kn_data = 0;
3738 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3739 socket_unlock(so, 1);
3740 return (0);
3741 }
3742
3743 if (so->so_error) { /* temporary udp error */
3744 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3745 socket_unlock(so, 1);
3746 return (1);
3747 }
3748
3749 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3750 socket_unlock(so, 1);
3751
3752 return ((kn->kn_flags & EV_OOBAND) ||
3753 kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
3754 kn->kn_sdata : so->so_rcv.sb_lowat));
3755 }
3756
3757 static void
3758 filt_sowdetach(struct knote *kn)
3759 {
3760 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3761 socket_lock(so, 1);
3762
3763 if (so->so_snd.sb_flags & SB_KNOTE)
3764 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
3765 so->so_snd.sb_flags &= ~SB_KNOTE;
3766 socket_unlock(so, 1);
3767 }
3768
3769 /*ARGSUSED*/
3770 static int
3771 filt_sowrite(struct knote *kn, long hint)
3772 {
3773 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3774
3775 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3776 socket_lock(so, 1);
3777
3778 kn->kn_data = sbspace(&so->so_snd);
3779 if (so->so_state & SS_CANTSENDMORE) {
3780 kn->kn_flags |= EV_EOF;
3781 kn->kn_fflags = so->so_error;
3782 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3783 socket_unlock(so, 1);
3784 return (1);
3785 }
3786 if (so->so_error) { /* temporary udp error */
3787 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3788 socket_unlock(so, 1);
3789 return (1);
3790 }
3791 if (((so->so_state & SS_ISCONNECTED) == 0) &&
3792 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3793 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3794 socket_unlock(so, 1);
3795 return (0);
3796 }
3797 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3798 socket_unlock(so, 1);
3799 if (kn->kn_sfflags & NOTE_LOWAT)
3800 return (kn->kn_data >= kn->kn_sdata);
3801 return (kn->kn_data >= so->so_snd.sb_lowat);
3802 }
3803
3804 /*ARGSUSED*/
3805 static int
3806 filt_solisten(struct knote *kn, long hint)
3807 {
3808 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3809 int isempty;
3810
3811 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3812 socket_lock(so, 1);
3813 kn->kn_data = so->so_qlen;
3814 isempty = ! TAILQ_EMPTY(&so->so_comp);
3815 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3816 socket_unlock(so, 1);
3817 return (isempty);
3818 }
3819
3820
3821 int
3822 socket_lock(struct socket *so, int refcount)
3823 {
3824 int error = 0, lr_saved;
3825
3826 lr_saved = (unsigned int) __builtin_return_address(0);
3827
3828 if (so->so_proto->pr_lock) {
3829 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
3830 } else {
3831 #ifdef MORE_LOCKING_DEBUG
3832 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
3833 LCK_MTX_ASSERT_NOTOWNED);
3834 #endif
3835 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
3836 if (refcount)
3837 so->so_usecount++;
3838 so->lock_lr[so->next_lock_lr] = (u_int32_t)lr_saved;
3839 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3840 }
3841
3842 return (error);
3843 }
3844
3845 int
3846 socket_unlock(struct socket *so, int refcount)
3847 {
3848 int error = 0, lr_saved;
3849 lck_mtx_t *mutex_held;
3850
3851 lr_saved = (unsigned int) __builtin_return_address(0);
3852
3853 if (so->so_proto == NULL)
3854 panic("socket_unlock null so_proto so=%p\n", so);
3855
3856 if (so->so_proto->pr_unlock) {
3857 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
3858 } else {
3859 mutex_held = so->so_proto->pr_domain->dom_mtx;
3860 #ifdef MORE_LOCKING_DEBUG
3861 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3862 #endif
3863 so->unlock_lr[so->next_unlock_lr] = (u_int32_t)lr_saved;
3864 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3865
3866 if (refcount) {
3867 if (so->so_usecount <= 0)
3868 panic("socket_unlock: bad refcount so=%p "
3869 "value=%d\n", so, so->so_usecount);
3870 so->so_usecount--;
3871 if (so->so_usecount == 0) {
3872 sofreelastref(so, 1);
3873 }
3874 }
3875 lck_mtx_unlock(mutex_held);
3876 }
3877
3878 return (error);
3879 }
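
/*
 * Illustrative sketch: the refcount arguments to socket_lock() and
 * socket_unlock() must be paired.  A path that takes a use-count
 * reference on lock must drop one on unlock, or so_usecount will leak
 * or underflow (and trip the panic above).  The common pattern in
 * this file:
 */
#if 0
	socket_lock(so, 1);	/* lock and take a use-count reference */
	/* ... operate on the socket ... */
	socket_unlock(so, 1);	/* drop the reference and unlock */
#endif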
3880
3881 /* Called with socket locked, will unlock socket */
3882 void
3883 sofree(struct socket *so)
3884 {
3885
3886 lck_mtx_t *mutex_held;
3887 if (so->so_proto->pr_getlock != NULL)
3888 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3889 else
3890 mutex_held = so->so_proto->pr_domain->dom_mtx;
3891 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3892
3893 sofreelastref(so, 0);
3894 }
3895
3896 void
3897 soreference(struct socket *so)
3898 {
3899 socket_lock(so, 1); /* locks & take one reference on socket */
3900 socket_unlock(so, 0); /* unlock only */
3901 }
3902
3903 void
3904 sodereference(struct socket *so)
3905 {
3906 socket_lock(so, 0);
3907 socket_unlock(so, 1);
3908 }
3909
3910 /*
3911 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
3912 * possibility of using jumbo clusters. Caller must ensure to hold
3913 * the socket lock.
3914 */
3915 void
3916 somultipages(struct socket *so, boolean_t set)
3917 {
3918 if (set)
3919 so->so_flags |= SOF_MULTIPAGES;
3920 else
3921 so->so_flags &= ~SOF_MULTIPAGES;
3922 }