bsd/kern/uipc_socket.c

   1 /*
   2  * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
  62  * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
  63  */
  64 /*
  65  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  66  * support for mandatory and extensible security protections.  This notice
  67  * is included in support of clause 2.2 (b) of the Apple Public License,
  68  * Version 2.0.
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/filedesc.h>
  74 #include <sys/proc.h>
  75 #include <sys/proc_internal.h>
  76 #include <sys/kauth.h>
  77 #include <sys/file_internal.h>
  78 #include <sys/fcntl.h>
  79 #include <sys/malloc.h>
  80 #include <sys/mbuf.h>
  81 #include <sys/domain.h>
  82 #include <sys/kernel.h>
  83 #include <sys/event.h>
  84 #include <sys/poll.h>
  85 #include <sys/protosw.h>
  86 #include <sys/socket.h>
  87 #include <sys/socketvar.h>
  88 #include <sys/resourcevar.h>
  89 #include <sys/signalvar.h>
  90 #include <sys/sysctl.h>
  91 #include <sys/uio.h>
  92 #include <sys/ev.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/un.h>
  95 #include <sys/user.h>
  96 #include <net/route.h>
  97 #include <netinet/in.h>
  98 #include <netinet/in_pcb.h>
  99 #include <kern/zalloc.h>
 100 #include <kern/locks.h>
 101 #include <machine/limits.h>
 102 #include <libkern/OSAtomic.h>
 103 #include <pexpert/pexpert.h>
 104 #include <kern/assert.h>
 105
 106 #if CONFIG_MACF
 107 #include <security/mac.h>
 108 #include <security/mac_framework.h>
 109 #endif /* MAC */
 110
 111 int                     so_cache_hw = 0;
 112 int                     so_cache_timeouts = 0;
 113 int                     so_cache_max_freed = 0;
 114 int                     cached_sock_count = 0;
 115 __private_extern__ int  max_cached_sock_count = MAX_CACHED_SOCKETS;
 116 struct socket           *socket_cache_head = 0;
 117 struct socket           *socket_cache_tail = 0;
 118 u_int32_t                       so_cache_time = 0;
 119 int                     so_cache_init_done = 0;
 120 struct zone             *so_cache_zone;
 121
 122 static lck_grp_t                *so_cache_mtx_grp;
 123 static lck_attr_t               *so_cache_mtx_attr;
 124 static lck_grp_attr_t   *so_cache_mtx_grp_attr;
 125 lck_mtx_t                               *so_cache_mtx;
 126
 127 #include <machine/limits.h>
 128
 129 static void     filt_sordetach(struct knote *kn);
 130 static int      filt_soread(struct knote *kn, long hint);
 131 static void     filt_sowdetach(struct knote *kn);
 132 static int      filt_sowrite(struct knote *kn, long hint);
 133
 134 static int
 135 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
 136
 137 static int
 138 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p);
 139
 140 static struct filterops soread_filtops = {
 141         .f_isfd = 1,
 142         .f_detach = filt_sordetach,
 143         .f_event = filt_soread,
 144 };
 145 static struct filterops sowrite_filtops = {
 146         .f_isfd = 1,
 147         .f_detach = filt_sowdetach,
 148         .f_event = filt_sowrite,
 149 };
 150
 151 #define EVEN_MORE_LOCKING_DEBUG 0
 152 int socket_debug = 0;
 153 int socket_zone = M_SOCKET;
 154 so_gen_t        so_gencnt;      /* generation count for sockets */
 155
 156 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 157 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 158
 159 #define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
 160 #define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
 161 #define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
 162 #define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
 163 #define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
 164 #define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
 165 #define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))
 166
 167 #define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)
 168
 169
 170 SYSCTL_DECL(_kern_ipc);
 171
 172 int somaxconn = SOMAXCONN;
 173 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");
 174
 175 /* Should we get a maximum also ??? */
 176 static int sosendmaxchain = 65536;
 177 static int sosendminchain = 16384;
 178 static int sorecvmincopy  = 16384;
 179 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
 180     0, "");
 181 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
 182     0, "");
 183
 184 /*
 185  * Set to enable jumbo clusters (if available) for large writes when
 186  * the socket is marked with SOF_MULTIPAGES; see below.
 187  */
 188 int sosendjcl = 1;
 189 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");
 190
 191 /*
 192  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 193  * writes on the socket for all protocols on any network interfaces,
 194  * depending upon sosendjcl above.  Be extra careful when setting this
 195  * to 1, because sending down packets that cross physical pages down to
 196  * broken drivers (those that falsely assume that the physical pages
 197  * are contiguous) might lead to system panics or silent data corruption.
 198  * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 199  * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 200  * capable.  Set this to 1 only for testing/debugging purposes.
 201  */
 202 int sosendjcl_ignore_capab = 0;
 203 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
 204     &sosendjcl_ignore_capab, 0, "");
 205
 206 /*
 207  * Socket operation routines.
 208  * These routines are called by the routines in
 209  * sys_socket.c or from a system process, and
 210  * implement the semantics of socket operations by
 211  * switching out to the protocol specific routines.
 212  */
 213
 214 /* sys_generic.c */
 215 extern void postevent(struct socket *, struct sockbuf *, int);
 216 extern void evsofree(struct socket *);
 217
 218 /* TODO: these should be in header file */
 219 extern int get_inpcb_str_size(void);
 220 extern int get_tcp_str_size(void);
 221 extern struct domain *pffinddomain(int);
 222 extern struct protosw *pffindprotonotype(int, int);
 223 extern int soclose_locked(struct socket *);
 224 extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
 225
 226 extern int uthread_get_background_state(uthread_t);
 227
 228 #ifdef __APPLE__
 229
 230 vm_size_t       so_cache_zone_element_size;
 231
 232 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
 233 static void cached_sock_alloc(struct socket **, int);
 234 static void cached_sock_free(struct socket *);
 235 static void so_cache_timer(void *);
 236
 237 void soclose_wait_locked(struct socket *so);
 238 int so_isdstlocal(struct socket *so);
 239
 240
 241 void
 242 socketinit(void)
 243 {
 244         vm_size_t str_size;
 245
 246         if (so_cache_init_done) {
 247                 printf("socketinit: already called...\n");
 248                 return;
 249         }
 250
 251         PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));
 252
 253         /*
 254          * allocate lock group attribute and group for socket cache mutex
 255          */
 256         so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
 257
 258         so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
 259             so_cache_mtx_grp_attr);
 260
 261         /*
 262          * allocate the lock attribute for socket cache mutex
 263          */
 264         so_cache_mtx_attr = lck_attr_alloc_init();
 265
 266         so_cache_init_done = 1;
 267
 268         /* cached sockets mutex */
 269         so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
 270
 271         if (so_cache_mtx == NULL)
 272                 return; /* we're hosed... */
 273
 274         str_size = (vm_size_t)(sizeof (struct socket) + 4 +
 275             get_inpcb_str_size() + 4 + get_tcp_str_size());
 276
 277         so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
 278 #if TEMPDEBUG
 279         printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
 280 #endif
 281         timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
 282
 283         so_cache_zone_element_size = str_size;
 284
 285         sflt_init();
 286 }
 287
 288 static void
 289 cached_sock_alloc(struct socket **so, int waitok)
 290 {
 291         caddr_t temp;
 292         register uintptr_t offset;
 293
 294         lck_mtx_lock(so_cache_mtx);
 295
 296         if (cached_sock_count) {
 297                 cached_sock_count--;
 298                 *so = socket_cache_head;
 299                 if (*so == 0)
 300                         panic("cached_sock_alloc: cached sock is null");
 301
 302                 socket_cache_head = socket_cache_head->cache_next;
 303                 if (socket_cache_head)
 304                         socket_cache_head->cache_prev = 0;
 305                 else
 306                         socket_cache_tail = 0;
 307
 308                 lck_mtx_unlock(so_cache_mtx);
 309
 310                 temp = (*so)->so_saved_pcb;
 311                 bzero((caddr_t)*so, sizeof (struct socket));
 312 #if TEMPDEBUG
 313                 kprintf("cached_sock_alloc - retreiving cached sock %p - "
 314                     "count == %d\n", *so, cached_sock_count);
 315 #endif
 316                 (*so)->so_saved_pcb = temp;
 317                 (*so)->cached_in_sock_layer = 1;
 318         } else {
 319 #if TEMPDEBUG
 320                 kprintf("Allocating cached sock %p from memory\n", *so);
 321 #endif
 322
 323                 lck_mtx_unlock(so_cache_mtx);
 324
 325                 if (waitok)
 326                         *so = (struct socket *)zalloc(so_cache_zone);
 327                 else
 328                         *so = (struct socket *)zalloc_noblock(so_cache_zone);
 329
 330                 if (*so == 0)
 331                         return;
 332
 333                 bzero((caddr_t)*so, sizeof (struct socket));
 334
 335                 /*
 336                  * Define offsets for extra structures into our single block of
 337                  * memory. Align extra structures on longword boundaries.
 338                  */
 339
 340                 offset = (uintptr_t) *so;
 341                 offset += sizeof (struct socket);
 342
 343                 offset = ALIGN(offset);
 344
 345                 (*so)->so_saved_pcb = (caddr_t)offset;
 346                 offset += get_inpcb_str_size();
 347
 348                 offset = ALIGN(offset);
 349
 350                 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
 351                     (caddr_t)offset;
 352 #if TEMPDEBUG
 353                 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
 354                     *so, (*so)->so_saved_pcb,
 355                     ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
 356 #endif
 357         }
 358
 359         (*so)->cached_in_sock_layer = 1;
 360 }
 361
 362 static void
 363 cached_sock_free(struct socket *so)
 364 {
 365
 366         lck_mtx_lock(so_cache_mtx);
 367
 368         if (++cached_sock_count > max_cached_sock_count) {
 369                 --cached_sock_count;
 370                 lck_mtx_unlock(so_cache_mtx);
 371 #if TEMPDEBUG
 372                 kprintf("Freeing overflowed cached socket %p\n", so);
 373 #endif
 374                 zfree(so_cache_zone, so);
 375         } else {
 376 #if TEMPDEBUG
 377                 kprintf("Freeing socket %p into cache\n", so);
 378 #endif
 379                 if (so_cache_hw < cached_sock_count)
 380                         so_cache_hw = cached_sock_count;
 381
 382                 so->cache_next = socket_cache_head;
 383                 so->cache_prev = 0;
 384                 if (socket_cache_head)
 385                         socket_cache_head->cache_prev = so;
 386                 else
 387                         socket_cache_tail = so;
 388
 389                 so->cache_timestamp = so_cache_time;
 390                 socket_cache_head = so;
 391                 lck_mtx_unlock(so_cache_mtx);
 392         }
 393
 394 #if TEMPDEBUG
 395         kprintf("Freed cached sock %p into cache - count is %d\n",
 396             so, cached_sock_count);
 397 #endif
 398 }
 399
 400 static void
 401 so_cache_timer(__unused void *dummy)
 402 {
 403         register struct socket  *p;
 404         register int            n_freed = 0;
 405
 406         lck_mtx_lock(so_cache_mtx);
 407
 408         ++so_cache_time;
 409
 410         while ((p = socket_cache_tail)) {
 411                 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
 412                         break;
 413
 414                 so_cache_timeouts++;
 415
 416                 if ((socket_cache_tail = p->cache_prev))
 417                         p->cache_prev->cache_next = 0;
 418                 if (--cached_sock_count == 0)
 419                         socket_cache_head = 0;
 420
 421                 zfree(so_cache_zone, p);
 422
 423                 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
 424                         so_cache_max_freed++;
 425                         break;
 426                 }
 427         }
 428         lck_mtx_unlock(so_cache_mtx);
 429
 430         timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
 431 }
 432 #endif /* __APPLE__ */
 433
 434 /*
 435  * Get a socket structure from our zone, and initialize it.
 436  * We don't implement `waitok' yet (see comments in uipc_domain.c).
 437  * Note that it would probably be better to allocate socket
 438  * and PCB at the same time, but I'm not convinced that all
 439  * the protocols can be easily modified to do this.
 440  */
 441 struct socket *
 442 soalloc(int waitok, int dom, int type)
 443 {
 444         struct socket *so;
 445
 446         if ((dom == PF_INET) && (type == SOCK_STREAM)) {
 447                 cached_sock_alloc(&so, waitok);
 448         } else {
 449                 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
 450                     M_WAITOK);
 451                 if (so != NULL)
 452                         bzero(so, sizeof (*so));
 453         }
 454         /* XXX race condition for reentrant kernel */
 455 //###LD Atomic add for so_gencnt
 456         if (so != NULL) {
 457                 so->so_gencnt = ++so_gencnt;
 458                 so->so_zone = socket_zone;
 459 #if CONFIG_MACF_SOCKET
 460              /* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
 461              if (mac_socket_label_init(so, !waitok) != 0) {
 462                         sodealloc(so);
 463                         return (NULL);
 464                 }
 465 #endif /* MAC_SOCKET */
 466         }
 467
 468         return (so);
 469 }
 470
 471 /*
 472  * Returns:     0                       Success
 473  *              EAFNOSUPPORT
 474  *              EPROTOTYPE
 475  *              EPROTONOSUPPORT
 476  *              ENOBUFS
 477  *      <pru_attach>:ENOBUFS[AF_UNIX]
 478  *      <pru_attach>:ENOBUFS[TCP]
 479  *      <pru_attach>:ENOMEM[TCP]
 480  *      <pru_attach>:EISCONN[TCP]
 481  *      <pru_attach>:???                [other protocol families, IPSEC]
 482  */
 483 int
 484 socreate(int dom, struct socket **aso, int type, int proto)
 485 {
 486         struct proc *p = current_proc();
 487         register struct protosw *prp;
 488         register struct socket *so;
 489         register int error = 0;
 490         thread_t thread;
 491         struct uthread *ut;
 492
 493 #if TCPDEBUG
 494         extern int tcpconsdebug;
 495 #endif
 496         if (proto)
 497                 prp = pffindproto(dom, proto, type);
 498         else
 499                 prp = pffindtype(dom, type);
 500
 501         if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
 502                 if (pffinddomain(dom) == NULL) {
 503                         return (EAFNOSUPPORT);
 504                 }
 505                 if (proto != 0) {
 506                         if (pffindprotonotype(dom, proto) != NULL) {
 507                                 return (EPROTOTYPE);
 508                         }
 509                 }
 510                 return (EPROTONOSUPPORT);
 511         }
 512         if (prp->pr_type != type)
 513                 return (EPROTOTYPE);
 514         so = soalloc(1, dom, type);
 515         if (so == 0)
 516                 return (ENOBUFS);
 517
 518         TAILQ_INIT(&so->so_incomp);
 519         TAILQ_INIT(&so->so_comp);
 520         so->so_type = type;
 521
 522         so->so_uid = kauth_cred_getuid(kauth_cred_get());
 523         if (!suser(kauth_cred_get(), NULL))
 524                 so->so_state = SS_PRIV;
 525
 526         so->so_proto = prp;
 527 #ifdef __APPLE__
 528         so->so_rcv.sb_flags |= SB_RECV; /* XXX */
 529         so->so_rcv.sb_so = so->so_snd.sb_so = so;
 530 #endif
 531         so->next_lock_lr = 0;
 532         so->next_unlock_lr = 0;
 533
 534 #if CONFIG_MACF_SOCKET
 535         mac_socket_label_associate(kauth_cred_get(), so);
 536 #endif /* MAC_SOCKET */
 537
 538 //### Attachement will create the per pcb lock if necessary and increase refcount
 539         /*
 540          * for creation, make sure it's done before
 541          * socket is inserted in lists
 542          */
 543         so->so_usecount++;
 544
 545         error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
 546         if (error) {
 547                 /*
 548                  * Warning:
 549                  * If so_pcb is not zero, the socket will be leaked,
 550                  * so protocol attachment handler must be coded carefuly
 551                  */
 552                 so->so_state |= SS_NOFDREF;
 553                 so->so_usecount--;
 554                 sofreelastref(so, 1);   /* will deallocate the socket */
 555                 return (error);
 556         }
 557 #ifdef __APPLE__
 558         prp->pr_domain->dom_refs++;
 559         TAILQ_INIT(&so->so_evlist);
 560
 561         /* Attach socket filters for this protocol */
 562         sflt_initsock(so);
 563 #if TCPDEBUG
 564         if (tcpconsdebug == 2)
 565                 so->so_options |= SO_DEBUG;
 566 #endif
 567 #endif
 568         /*
 569          * If this is a background thread/task, mark the socket as such.
 570          */
 571         thread = current_thread();
 572         ut = get_bsdthread_info(thread);
 573         if (uthread_get_background_state(ut)) {
 574                 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
 575                 so->so_background_thread = thread;
 576                 /*
 577                  * In case setpriority(PRIO_DARWIN_THREAD) was called
 578                  * on this thread, regulate network (TCP) traffics.
 579                  */
 580                 if (ut->uu_flag & UT_BACKGROUND_TRAFFIC_MGT) {
 581                         socket_set_traffic_mgt_flags(so,
 582                             TRAFFIC_MGT_SO_BG_REGULATE);
 583                 }
 584         }
 585
 586         *aso = so;
 587         return (0);
 588 }
 589
 590 /*
 591  * Returns:     0                       Success
 592  *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 593  *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 594  *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 595  *      <pru_bind>:EINVAL               Invalid argument
 596  *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 597  *      <pru_bind>:EACCES               Permission denied
 598  *      <pru_bind>:EADDRINUSE           Address in use
 599  *      <pru_bind>:EAGAIN               Resource unavailable, try again
 600  *      <pru_bind>:EPERM                Operation not permitted
 601  *      <pru_bind>:???
 602  *      <sf_bind>:???
 603  *
 604  * Notes:       It's not possible to fully enumerate the return codes above,
 605  *              since socket filter authors and protocol family authors may
 606  *              not choose to limit their error returns to those listed, even
 607  *              though this may result in some software operating incorrectly.
 608  *
 609  *              The error codes which are enumerated above are those known to
 610  *              be returned by the tcp_usr_bind function supplied.
 611  */
 612 int
 613 sobind(struct socket *so, struct sockaddr *nam)
 614 {
 615         struct proc *p = current_proc();
 616         int error = 0;
 617         struct socket_filter_entry *filter;
 618         int filtered = 0;
 619
 620         socket_lock(so, 1);
 621
 622         /*
 623          * If this is a bind request on a previously-accepted socket
 624          * that has been marked as inactive, reject it now before
 625          * we go any further.
 626          */
 627         if (so->so_flags & SOF_DEFUNCT) {
 628                 error = EINVAL;
 629                 goto out;
 630         }
 631
 632         /* Socket filter */
 633         error = 0;
 634         for (filter = so->so_filt; filter && (error == 0);
 635             filter = filter->sfe_next_onsocket) {
 636                 if (filter->sfe_filter->sf_filter.sf_bind) {
 637                         if (filtered == 0) {
 638                                 filtered = 1;
 639                                 sflt_use(so);
 640                                 socket_unlock(so, 0);
 641                         }
 642                         error = filter->sfe_filter->sf_filter.
 643                             sf_bind(filter->sfe_cookie, so, nam);
 644                 }
 645         }
 646         if (filtered != 0) {
 647                 socket_lock(so, 0);
 648                 sflt_unuse(so);
 649         }
 650         /* End socket filter */
 651
 652         if (error == 0)
 653                 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
 654 out:
 655         socket_unlock(so, 1);
 656
 657         if (error == EJUSTRETURN)
 658                 error = 0;
 659
 660         return (error);
 661 }
 662
 663 void
 664 sodealloc(struct socket *so)
 665 {
 666         so->so_gencnt = ++so_gencnt;
 667
 668 #if CONFIG_MACF_SOCKET
 669         mac_socket_label_destroy(so);
 670 #endif /* MAC_SOCKET */
 671         if (so->cached_in_sock_layer == 1) {
 672                 cached_sock_free(so);
 673         } else {
 674                 if (so->cached_in_sock_layer == -1)
 675                         panic("sodealloc: double dealloc: so=%p\n", so);
 676                 so->cached_in_sock_layer = -1;
 677                 FREE_ZONE(so, sizeof (*so), so->so_zone);
 678         }
 679 }
 680
 681 /*
 682  * Returns:     0                       Success
 683  *              EINVAL
 684  *              EOPNOTSUPP
 685  *      <pru_listen>:EINVAL[AF_UNIX]
 686  *      <pru_listen>:EINVAL[TCP]
 687  *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 688  *      <pru_listen>:EINVAL[TCP]        Invalid argument
 689  *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 690  *      <pru_listen>:EACCES[TCP]        Permission denied
 691  *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 692  *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 693  *      <pru_listen>:EPERM[TCP]         Operation not permitted
 694  *      <sf_listen>:???
 695  *
 696  * Notes:       Other <pru_listen> returns depend on the protocol family; all
 697  *              <sf_listen> returns depend on what the filter author causes
 698  *              their filter to return.
 699  */
 700 int
 701 solisten(struct socket *so, int backlog)
 702 {
 703         struct proc *p = current_proc();
 704         int error = 0;
 705         struct socket_filter_entry *filter;
 706         int filtered = 0;
 707
 708         socket_lock(so, 1);
 709         if (so->so_proto == NULL) {
 710                 error = EINVAL;
 711                 goto out;
 712         }
 713         if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 714                 error = EOPNOTSUPP;
 715                 goto out;
 716         }
 717
 718         /*
 719          * If the listen request is made on a socket that is not fully
 720          * disconnected, or on a previously-accepted socket that has
 721          * been marked as inactive, reject the request now.
 722          */
 723         if ((so->so_state &
 724             (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
 725             (so->so_flags & SOF_DEFUNCT)) {
 726                 error = EINVAL;
 727                 goto out;
 728         }
 729
 730         if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
 731                 error = EPERM;
 732                 goto out;
 733         }
 734
 735         error = 0;
 736         for (filter = so->so_filt; filter && (error == 0);
 737             filter = filter->sfe_next_onsocket) {
 738                 if (filter->sfe_filter->sf_filter.sf_listen) {
 739                         if (filtered == 0) {
 740                                 filtered = 1;
 741                                 sflt_use(so);
 742                                 socket_unlock(so, 0);
 743                         }
 744                         error = filter->sfe_filter->sf_filter.
 745                             sf_listen(filter->sfe_cookie, so);
 746                 }
 747         }
 748         if (filtered != 0) {
 749                 socket_lock(so, 0);
 750                 sflt_unuse(so);
 751         }
 752
 753         if (error == 0) {
 754                 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
 755         }
 756
 757         if (error) {
 758                 if (error == EJUSTRETURN)
 759                         error = 0;
 760                 goto out;
 761         }
 762
 763         if (TAILQ_EMPTY(&so->so_comp))
 764                 so->so_options |= SO_ACCEPTCONN;
 765         /*
 766          * POSIX: The implementation may have an upper limit on the length of
 767          * the listen queue-either global or per accepting socket. If backlog
 768          * exceeds this limit, the length of the listen queue is set to the
 769          * limit.
 770          *
 771          * If listen() is called with a backlog argument value that is less
 772          * than 0, the function behaves as if it had been called with a backlog
 773          * argument value of 0.
 774          *
 775          * A backlog argument of 0 may allow the socket to accept connections,
 776          * in which case the length of the listen queue may be set to an
 777          * implementation-defined minimum value.
 778          */
 779         if (backlog <= 0 || backlog > somaxconn)
 780                 backlog = somaxconn;
 781
 782         so->so_qlimit = backlog;
 783 out:
 784         socket_unlock(so, 1);
 785         return (error);
 786 }
 787
 788 void
 789 sofreelastref(struct socket *so, int dealloc)
 790 {
 791         struct socket *head = so->so_head;
 792
 793         /* Assume socket is locked */
 794
 795         /* Remove any filters - may be called more than once */
 796         sflt_termsock(so);
 797
 798         if ((!(so->so_flags & SOF_PCBCLEARING)) ||
 799             ((so->so_state & SS_NOFDREF) == 0)) {
 800 #ifdef __APPLE__
 801                 selthreadclear(&so->so_snd.sb_sel);
 802                 selthreadclear(&so->so_rcv.sb_sel);
 803                 so->so_rcv.sb_flags &= ~SB_UPCALL;
 804                 so->so_snd.sb_flags &= ~SB_UPCALL;
 805 #endif
 806                 return;
 807         }
 808         if (head != NULL) {
 809                 socket_lock(head, 1);
 810                 if (so->so_state & SS_INCOMP) {
 811                         TAILQ_REMOVE(&head->so_incomp, so, so_list);
 812                         head->so_incqlen--;
 813                 } else if (so->so_state & SS_COMP) {
 814                         /*
 815                          * We must not decommission a socket that's
 816                          * on the accept(2) queue.  If we do, then
 817                          * accept(2) may hang after select(2) indicated
 818                          * that the listening socket was ready.
 819                          */
 820 #ifdef __APPLE__
 821                         selthreadclear(&so->so_snd.sb_sel);
 822                         selthreadclear(&so->so_rcv.sb_sel);
 823                         so->so_rcv.sb_flags &= ~SB_UPCALL;
 824                         so->so_snd.sb_flags &= ~SB_UPCALL;
 825 #endif
 826                         socket_unlock(head, 1);
 827                         return;
 828                 } else {
 829                         panic("sofree: not queued");
 830                 }
 831                 head->so_qlen--;
 832                 so->so_state &= ~SS_INCOMP;
 833                 so->so_head = NULL;
 834                 socket_unlock(head, 1);
 835         }
 836 #ifdef __APPLE__
 837         selthreadclear(&so->so_snd.sb_sel);
 838         sbrelease(&so->so_snd);
 839 #endif
 840         sorflush(so);
 841
 842         /* 3932268: disable upcall */
 843         so->so_rcv.sb_flags &= ~SB_UPCALL;
 844         so->so_snd.sb_flags &= ~SB_UPCALL;
 845
 846         if (dealloc)
 847                 sodealloc(so);
 848 }
 849
 850 void
 851 soclose_wait_locked(struct socket *so)
 852 {
 853         lck_mtx_t *mutex_held;
 854
 855         if (so->so_proto->pr_getlock != NULL)
 856                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
 857         else
 858                 mutex_held = so->so_proto->pr_domain->dom_mtx;
 859         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
 860
 861         /*
 862          * Double check here and return if there's no outstanding upcall;
 863          * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
 864          */
 865         if (!(so->so_flags & SOF_UPCALLINUSE) ||
 866             !(so->so_flags & SOF_UPCALLCLOSEWAIT))
 867                 return;
 868
 869         so->so_flags |= SOF_CLOSEWAIT;
 870         (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
 871             "soclose_wait_locked", NULL);
 872         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
 873         so->so_flags &= ~SOF_CLOSEWAIT;
 874 }
 875
 876 /*
 877  * Close a socket on last file table reference removal.
 878  * Initiate disconnect if connected.
 879  * Free socket when disconnect complete.
 880  */
 881 int
 882 soclose_locked(struct socket *so)
 883 {
 884         int error = 0;
 885         lck_mtx_t *mutex_held;
 886         struct timespec ts;
 887
 888         if (so->so_usecount == 0) {
 889                 panic("soclose: so=%p refcount=0\n", so);
 890         }
 891
 892         sflt_notify(so, sock_evt_closing, NULL);
 893
 894         if ((so->so_options & SO_ACCEPTCONN)) {
 895                 struct socket *sp, *sonext;
 896                 int socklock = 0;
 897
 898                 /*
 899                  * We do not want new connection to be added
 900                  * to the connection queues
 901                  */
 902                 so->so_options &= ~SO_ACCEPTCONN;
 903
 904                 for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
 905                         sonext = TAILQ_NEXT(sp, so_list);
 906
 907                         /* Radar 5350314
 908                          * skip sockets thrown away by tcpdropdropblreq
 909                          * they will get cleanup by the garbage collection.
 910                          * otherwise, remove the incomp socket from the queue
 911                          * and let soabort trigger the appropriate cleanup.
 912                          */
 913                         if (sp->so_flags & SOF_OVERFLOW)
 914                                 continue;
 915
 916                         if (so->so_proto->pr_getlock != NULL) {
 917                                 /* lock ordering for consistency with the rest of the stack,
 918                                  * we lock the socket first and then grabb the head.
 919                                  */
 920                                 socket_unlock(so, 0);
 921                                 socket_lock(sp, 1);
 922                                 socket_lock(so, 0);
 923                                 socklock = 1;
 924                         }
 925
 926                         TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 927                         so->so_incqlen--;
 928
 929                         if (sp->so_state & SS_INCOMP) {
 930                                 sp->so_state &= ~SS_INCOMP;
 931                                 sp->so_head = NULL;
 932
 933                                 (void) soabort(sp);
 934                         }
 935
 936                         if (socklock)
 937                                 socket_unlock(sp, 1);
 938                 }
 939
 940                 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
 941                         /* Dequeue from so_comp since sofree() won't do it */
 942                         TAILQ_REMOVE(&so->so_comp, sp, so_list);
 943                         so->so_qlen--;
 944
 945                         if (so->so_proto->pr_getlock != NULL) {
 946                                 socket_unlock(so, 0);
 947                                 socket_lock(sp, 1);
 948                         }
 949
 950                         if (sp->so_state & SS_COMP) {
 951                                 sp->so_state &= ~SS_COMP;
 952                                 sp->so_head = NULL;
 953
 954                                 (void) soabort(sp);
 955                         }
 956
 957                         if (so->so_proto->pr_getlock != NULL) {
 958                                 socket_unlock(sp, 1);
 959                                 socket_lock(so, 0);
 960                         }
 961                 }
 962         }
 963         if (so->so_pcb == 0) {
 964                 /* 3915887: mark the socket as ready for dealloc */
 965                 so->so_flags |= SOF_PCBCLEARING;
 966                 goto discard;
 967         }
 968         if (so->so_state & SS_ISCONNECTED) {
 969                 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 970                         error = sodisconnectlocked(so);
 971                         if (error)
 972                                 goto drop;
 973                 }
 974                 if (so->so_options & SO_LINGER) {
 975                         if ((so->so_state & SS_ISDISCONNECTING) &&
 976                             (so->so_state & SS_NBIO))
 977                                 goto drop;
 978                         if (so->so_proto->pr_getlock != NULL)
 979                                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
 980                         else
 981                                 mutex_held = so->so_proto->pr_domain->dom_mtx;
 982                         while (so->so_state & SS_ISCONNECTED) {
 983                                 ts.tv_sec = (so->so_linger/100);
 984                                 ts.tv_nsec = (so->so_linger % 100) *
 985                                     NSEC_PER_USEC * 1000 * 10;
 986                                 error = msleep((caddr_t)&so->so_timeo,
 987                                     mutex_held, PSOCK | PCATCH, "soclose", &ts);
 988                                 if (error) {
 989                                         /*
 990                                          * It's OK when the time fires,
 991                                          * don't report an error
 992                                          */
 993                                         if (error == EWOULDBLOCK)
 994                                                 error = 0;
 995                                         break;
 996                                 }
 997                         }
 998                 }
 999         }
1000 drop:
1001         if (so->so_usecount == 0)
1002                 panic("soclose: usecount is zero so=%p\n", so);
1003         if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
1004                 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1005                 if (error == 0)
1006                         error = error2;
1007         }
1008         if (so->so_usecount <= 0)
1009                 panic("soclose: usecount is zero so=%p\n", so);
1010 discard:
1011         if (so->so_pcb && so->so_state & SS_NOFDREF)
1012                 panic("soclose: NOFDREF");
1013         so->so_state |= SS_NOFDREF;
1014 #ifdef __APPLE__
1015         so->so_proto->pr_domain->dom_refs--;
1016         evsofree(so);
1017 #endif
1018         so->so_usecount--;
1019         sofree(so);
1020         return (error);
1021 }
1022
1023 int
1024 soclose(struct socket *so)
1025 {
1026         int error = 0;
1027         socket_lock(so, 1);
1028
1029         if (so->so_flags & SOF_UPCALLINUSE)
1030                 soclose_wait_locked(so);
1031
1032         if (so->so_retaincnt == 0) {
1033                 error = soclose_locked(so);
1034         } else {
1035                 /*
1036                  * if the FD is going away, but socket is
1037                  * retained in kernel remove its reference
1038                  */
1039                 so->so_usecount--;
1040                 if (so->so_usecount < 2)
1041                         panic("soclose: retaincnt non null and so=%p "
1042                             "usecount=%d\n", so, so->so_usecount);
1043         }
1044         socket_unlock(so, 1);
1045         return (error);
1046 }
1047
1048 /*
1049  * Must be called at splnet...
1050  */
1051 /* Should already be locked */
1052 int
1053 soabort(struct socket *so)
1054 {
1055         int error;
1056
1057 #ifdef MORE_LOCKING_DEBUG
1058         lck_mtx_t *mutex_held;
1059
1060         if (so->so_proto->pr_getlock != NULL)
1061                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1062         else
1063                 mutex_held = so->so_proto->pr_domain->dom_mtx;
1064         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1065 #endif
1066
1067         if ((so->so_flags & SOF_ABORTED) == 0) {
1068                 so->so_flags |= SOF_ABORTED;
1069                 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1070                 if (error) {
1071                         sofree(so);
1072                         return (error);
1073                 }
1074         }
1075         return (0);
1076 }
1077
1078 int
1079 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1080 {
1081         int error;
1082
1083         if (dolock)
1084                 socket_lock(so, 1);
1085
1086         if ((so->so_state & SS_NOFDREF) == 0)
1087                 panic("soaccept: !NOFDREF");
1088         so->so_state &= ~SS_NOFDREF;
1089         error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1090
1091         if (dolock)
1092                 socket_unlock(so, 1);
1093         return (error);
1094 }
1095
1096 int
1097 soaccept(struct socket *so, struct sockaddr **nam)
1098 {
1099         return (soacceptlock(so, nam, 1));
1100 }
1101
1102 int
1103 soacceptfilter(struct socket *so)
1104 {
1105         struct sockaddr *local = NULL, *remote = NULL;
1106         struct socket_filter_entry *filter;
1107         int error = 0, filtered = 0;
1108         struct socket *head = so->so_head;
1109
1110         /*
1111          * Hold the lock even if this socket
1112          * has not been made visible to the filter(s).
1113          * For sockets with global locks, this protect against the
1114          * head or peer going away
1115          */
1116         socket_lock(so, 1);
1117         if (sogetaddr_locked(so, &remote, 1) != 0 ||
1118             sogetaddr_locked(so, &local, 0) != 0) {
1119                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1120                 so->so_head = NULL;
1121                 socket_unlock(so, 1);
1122                 soclose(so);
1123                 /* Out of resources; try it again next time */
1124                 error = ECONNABORTED;
1125                 goto done;
1126         }
1127
1128         /*
1129          * At this point, we have a reference on the listening socket
1130          * so we know it won't be going away.  Do the same for the newly
1131          * accepted socket while we invoke the accept callback routine.
1132          */
1133         for (filter = so->so_filt; filter != NULL && error == 0;
1134             filter = filter->sfe_next_onsocket) {
1135                 if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
1136                         if (!filtered) {
1137                                 filtered = 1;
1138                                 sflt_use(so);
1139                                 socket_unlock(so, 0);
1140                         }
1141                         error = filter->sfe_filter->sf_filter.
1142                             sf_accept(filter->sfe_cookie,
1143                             head, so, local, remote);
1144                 }
1145         }
1146
1147         if (filtered) {
1148                 socket_lock(so, 0);
1149                 sflt_unuse(so);
1150         }
1151
1152         /*
1153          * If we get EJUSTRETURN from one of the filters, mark this socket
1154          * as inactive and return it anyway.  This newly accepted socket
1155          * will be disconnected later before we hand it off to the caller.
1156          */
1157         if (error == EJUSTRETURN) {
1158                 error = 0;
1159                 so->so_flags |= SOF_DEFUNCT;
1160                 /* Prevent data from being appended to the socket buffers */
1161                 so->so_snd.sb_flags |= SB_DROP;
1162                 so->so_rcv.sb_flags |= SB_DROP;
1163         }
1164
1165         if (error != 0) {
1166                 /*
1167                  * This may seem like a duplication to the above error
1168                  * handling part when we return ECONNABORTED, except
1169                  * the following is done while holding the lock since
1170                  * the socket has been exposed to the filter(s) earlier.
1171                  */
1172                 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1173                 so->so_head = NULL;
1174                 socket_unlock(so, 1);
1175                 soclose(so);
1176                 /* Propagate socket filter's error code to the caller */
1177         } else {
1178                 socket_unlock(so, 1);
1179         }
1180 done:
1181         /* Callee checks for NULL pointer */
1182         sock_freeaddr(remote);
1183         sock_freeaddr(local);
1184         return (error);
1185 }
1186
1187 /*
1188  * Returns:     0                       Success
1189  *              EOPNOTSUPP              Operation not supported on socket
1190  *              EISCONN                 Socket is connected
1191  *      <pru_connect>:EADDRNOTAVAIL     Address not available.
1192  *      <pru_connect>:EINVAL            Invalid argument
1193  *      <pru_connect>:EAFNOSUPPORT      Address family not supported [notdef]
1194  *      <pru_connect>:EACCES            Permission denied
1195  *      <pru_connect>:EADDRINUSE        Address in use
1196  *      <pru_connect>:EAGAIN            Resource unavailable, try again
1197  *      <pru_connect>:EPERM             Operation not permitted
1198  *      <sf_connect_out>:???            [anything a filter writer might set]
1199  */
1200 int
1201 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1202 {
1203         int error;
1204         struct proc *p = current_proc();
1205
1206         if (dolock)
1207                 socket_lock(so, 1);
1208
1209         /*
1210          * If this is a listening socket or if this is a previously-accepted
1211          * socket that has been marked as inactive, reject the connect request.
1212          */
1213         if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1214                 if (dolock)
1215                         socket_unlock(so, 1);
1216                 return (EOPNOTSUPP);
1217         }
1218
1219         if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
1220                 if (dolock)
1221                         socket_unlock(so, 1);
1222                 return (EPERM);
1223         }
1224
1225         /*
1226          * If protocol is connection-based, can only connect once.
1227          * Otherwise, if connected, try to disconnect first.
1228          * This allows user to disconnect by connecting to, e.g.,
1229          * a null address.
1230          */
1231         if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1232             ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1233             (error = sodisconnectlocked(so)))) {
1234                 error = EISCONN;
1235         } else {
1236                 /*
1237                  * Run connect filter before calling protocol:
1238                  *  - non-blocking connect returns before completion;
1239                  */
1240                 struct socket_filter_entry *filter;
1241                 int filtered = 0;
1242
1243                 error = 0;
1244                 for (filter = so->so_filt; filter && (error == 0);
1245                     filter = filter->sfe_next_onsocket) {
1246                         if (filter->sfe_filter->sf_filter.sf_connect_out) {
1247                                 if (filtered == 0) {
1248                                         filtered = 1;
1249                                         sflt_use(so);
1250                                         socket_unlock(so, 0);
1251                                 }
1252                                 error = filter->sfe_filter->sf_filter.
1253                                     sf_connect_out(filter->sfe_cookie, so, nam);
1254                         }
1255                 }
1256                 if (filtered != 0) {
1257                         socket_lock(so, 0);
1258                         sflt_unuse(so);
1259                 }
1260
1261                 if (error) {
1262                         if (error == EJUSTRETURN)
1263                                 error = 0;
1264                         if (dolock)
1265                                 socket_unlock(so, 1);
1266                         return (error);
1267                 }
1268
1269                 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
1270         }
1271         if (dolock)
1272                 socket_unlock(so, 1);
1273         return (error);
1274 }
1275
1276 int
1277 soconnect(struct socket *so, struct sockaddr *nam)
1278 {
1279         return (soconnectlock(so, nam, 1));
1280 }
1281
1282 /*
1283  * Returns:     0                       Success
1284  *      <pru_connect2>:EINVAL[AF_UNIX]
1285  *      <pru_connect2>:EPROTOTYPE[AF_UNIX]
1286  *      <pru_connect2>:???              [other protocol families]
1287  *
1288  * Notes:       <pru_connect2> is not supported by [TCP].
1289  */
1290 int
1291 soconnect2(struct socket *so1, struct socket *so2)
1292 {
1293         int error;
1294
1295         socket_lock(so1, 1);
1296         if (so2->so_proto->pr_lock)
1297                 socket_lock(so2, 1);
1298
1299         error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1300
1301         socket_unlock(so1, 1);
1302         if (so2->so_proto->pr_lock)
1303                 socket_unlock(so2, 1);
1304         return (error);
1305 }
1306
1307 int
1308 sodisconnectlocked(struct socket *so)
1309 {
1310         int error;
1311
1312         if ((so->so_state & SS_ISCONNECTED) == 0) {
1313                 error = ENOTCONN;
1314                 goto bad;
1315         }
1316         if (so->so_state & SS_ISDISCONNECTING) {
1317                 error = EALREADY;
1318                 goto bad;
1319         }
1320
1321         error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1322
1323         if (error == 0) {
1324                 sflt_notify(so, sock_evt_disconnected, NULL);
1325         }
1326 bad:
1327         return (error);
1328 }
1329
1330 /* Locking version */
1331 int
1332 sodisconnect(struct socket *so)
1333 {
1334         int error;
1335
1336         socket_lock(so, 1);
1337         error = sodisconnectlocked(so);
1338         socket_unlock(so, 1);
1339         return (error);
1340 }
1341
1342 #define SBLOCKWAIT(f)   (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1343
1344 /*
1345  * sosendcheck will lock the socket buffer if it isn't locked and
1346  * verify that there is space for the data being inserted.
1347  *
1348  * Returns:     0                       Success
1349  *              EPIPE
1350  *      sblock:EWOULDBLOCK
1351  *      sblock:EINTR
1352  *      sbwait:EBADF
1353  *      sbwait:EINTR
1354  *      [so_error]:???
1355  */
1356 static int
1357 sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
1358     int32_t atomic, int flags, int *sblocked)
1359 {
1360         int     error = 0;
1361         int32_t space;
1362         int     assumelock = 0;
1363
1364 restart:
1365         if (*sblocked == 0) {
1366                 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1367                     so->so_send_filt_thread != 0 &&
1368                     so->so_send_filt_thread == current_thread()) {
1369                         /*
1370                          * We're being called recursively from a filter,
1371                          * allow this to continue. Radar 4150520.
1372                          * Don't set sblocked because we don't want
1373                          * to perform an unlock later.
1374                          */
1375                         assumelock = 1;
1376                 } else {
1377                         error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1378                         if (error) {
1379                                 return (error);
1380                         }
1381                         *sblocked = 1;
1382                 }
1383         }
1384
1385         /*
1386          * If a send attempt is made on a previously-accepted socket
1387          * that has been marked as inactive (disconnected), reject
1388          * the request.
1389          */
1390         if (so->so_flags & SOF_DEFUNCT)
1391                 return (ENOTCONN);
1392
1393         if (so->so_state & SS_CANTSENDMORE)
1394                 return (EPIPE);
1395
1396         if (so->so_error) {
1397                 error = so->so_error;
1398                 so->so_error = 0;
1399                 return (error);
1400         }
1401
1402         if ((so->so_state & SS_ISCONNECTED) == 0) {
1403                 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1404                         if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1405                             !(resid == 0 && clen != 0))
1406                                 return (ENOTCONN);
1407                 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1408                         return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1409                             ENOTCONN : EDESTADDRREQ);
1410                 }
1411         }
1412         space = sbspace(&so->so_snd);
1413         if (flags & MSG_OOB)
1414                 space += 1024;
1415         if ((atomic && resid > so->so_snd.sb_hiwat) ||
1416             clen > so->so_snd.sb_hiwat)
1417                 return (EMSGSIZE);
1418         if (space < resid + clen &&
1419             (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
1420                 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1421                     assumelock) {
1422                         return (EWOULDBLOCK);
1423                 }
1424                 sbunlock(&so->so_snd, 1);
1425                 error = sbwait(&so->so_snd);
1426                 if (error) {
1427                         return (error);
1428                 }
1429                 goto restart;
1430         }
1431
1432         return (0);
1433 }
1434
1435 /*
1436  * Send on a socket.
1437  * If send must go all at once and message is larger than
1438  * send buffering, then hard error.
1439  * Lock against other senders.
1440  * If must go all at once and not enough room now, then
1441  * inform user that this would block and do nothing.
1442  * Otherwise, if nonblocking, send as much as possible.
1443  * The data to be sent is described by "uio" if nonzero,
1444  * otherwise by the mbuf chain "top" (which must be null
1445  * if uio is not).  Data provided in mbuf chain must be small
1446  * enough to send all at once.
1447  *
1448  * Returns nonzero on error, timeout or signal; callers
1449  * must check for short counts if EINTR/ERESTART are returned.
1450  * Data and control buffers are freed on return.
1451  * Experiment:
1452  * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1453  * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1454  *  point at the mbuf chain being constructed and go from there.
1455  *
1456  * Returns:     0                       Success
1457  *              EOPNOTSUPP
1458  *              EINVAL
1459  *              ENOBUFS
1460  *      uiomove:EFAULT
1461  *      sosendcheck:EPIPE
1462  *      sosendcheck:EWOULDBLOCK
1463  *      sosendcheck:EINTR
1464  *      sosendcheck:EBADF
1465  *      sosendcheck:EINTR
1466  *      sosendcheck:???                 [value from so_error]
1467  *      <pru_send>:ECONNRESET[TCP]
1468  *      <pru_send>:EINVAL[TCP]
1469  *      <pru_send>:ENOBUFS[TCP]
1470  *      <pru_send>:EADDRINUSE[TCP]
1471  *      <pru_send>:EADDRNOTAVAIL[TCP]
1472  *      <pru_send>:EAFNOSUPPORT[TCP]
1473  *      <pru_send>:EACCES[TCP]
1474  *      <pru_send>:EAGAIN[TCP]
1475  *      <pru_send>:EPERM[TCP]
1476  *      <pru_send>:EMSGSIZE[TCP]
1477  *      <pru_send>:EHOSTUNREACH[TCP]
1478  *      <pru_send>:ENETUNREACH[TCP]
1479  *      <pru_send>:ENETDOWN[TCP]
1480  *      <pru_send>:ENOMEM[TCP]
1481  *      <pru_send>:ENOBUFS[TCP]
1482  *      <pru_send>:???[TCP]             [ignorable: mostly IPSEC/firewall/DLIL]
1483  *      <pru_send>:EINVAL[AF_UNIX]
1484  *      <pru_send>:EOPNOTSUPP[AF_UNIX]
1485  *      <pru_send>:EPIPE[AF_UNIX]
1486  *      <pru_send>:ENOTCONN[AF_UNIX]
1487  *      <pru_send>:EISCONN[AF_UNIX]
1488  *      <pru_send>:???[AF_UNIX]         [whatever a filter author chooses]
1489  *      <sf_data_out>:???               [whatever a filter author chooses]
1490  *
1491  * Notes:       Other <pru_send> returns depend on the protocol family; all
1492  *              <sf_data_out> returns depend on what the filter author causes
1493  *              their filter to return.
1494  */
1495 int
1496 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1497     struct mbuf *top, struct mbuf *control, int flags)
1498 {
1499         struct mbuf **mp;
1500         register struct mbuf *m, *freelist = NULL;
1501         register int32_t space, len, resid;
1502         int clen = 0, error, dontroute, mlen, sendflags;
1503         int atomic = sosendallatonce(so) || top;
1504         int sblocked = 0;
1505         struct proc *p = current_proc();
1506
1507         if (uio) {
1508                 // LP64todo - fix this!
1509                 resid = uio_resid(uio);
1510         } else {
1511                 resid = top->m_pkthdr.len;
1512         }
1513         KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1514             so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1515
1516         socket_lock(so, 1);
1517         if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1518                 error = EOPNOTSUPP;
1519                 socket_unlock(so, 1);
1520                 goto out;
1521         }
1522
1523         /*
1524          * In theory resid should be unsigned.
1525          * However, space must be signed, as it might be less than 0
1526          * if we over-committed, and we must use a signed comparison
1527          * of space and resid.  On the other hand, a negative resid
1528          * causes us to loop sending 0-length segments to the protocol.
1529          *
1530          * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1531          * type sockets since that's an error.
1532          */
1533         if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1534                 error = EINVAL;
1535                 socket_unlock(so, 1);
1536                 goto out;
1537         }
1538
1539         dontroute =
1540             (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1541             (so->so_proto->pr_flags & PR_ATOMIC);
1542         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1543         if (control)
1544                 clen = control->m_len;
1545
1546         do {
1547                 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1548                     &sblocked);
1549                 if (error) {
1550                         goto release;
1551                 }
1552                 mp = &top;
1553                 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
1554                     1024 : 0);
1555
1556                 do {
1557                         struct socket_filter_entry *filter;
1558                         int filtered;
1559                         boolean_t recursive;
1560
1561                         if (uio == NULL) {
1562                                 /*
1563                                  * Data is prepackaged in "top".
1564                                  */
1565                                 resid = 0;
1566                                 if (flags & MSG_EOR)
1567                                         top->m_flags |= M_EOR;
1568                         } else {
1569                                 int chainlength;
1570                                 int bytes_to_copy;
1571                                 boolean_t jumbocl;
1572
1573                                 bytes_to_copy = imin(resid, space);
1574
1575                                 if (sosendminchain > 0) {
1576                                         chainlength = 0;
1577                                 } else {
1578                                         chainlength = sosendmaxchain;
1579                                 }
1580
1581                                 /*
1582                                  * Attempt to use larger than system page-size
1583                                  * clusters for large writes only if there is
1584                                  * a jumbo cluster pool and if the socket is
1585                                  * marked accordingly.
1586                                  */
1587                                 jumbocl = sosendjcl && njcl > 0 &&
1588                                     ((so->so_flags & SOF_MULTIPAGES) ||
1589                                     sosendjcl_ignore_capab);
1590
1591                                 socket_unlock(so, 0);
1592
1593                                 do {
1594                                         int num_needed;
1595                                         int hdrs_needed = (top == 0) ? 1 : 0;
1596
1597                                         /*
1598                                          * try to maintain a local cache of mbuf
1599                                          * clusters needed to complete this
1600                                          * write the list is further limited to
1601                                          * the number that are currently needed
1602                                          * to fill the socket this mechanism
1603                                          * allows a large number of mbufs/
1604                                          * clusters to be grabbed under a single
1605                                          * mbuf lock... if we can't get any
1606                                          * clusters, than fall back to trying
1607                                          * for mbufs if we fail early (or
1608                                          * miscalcluate the number needed) make
1609                                          * sure to release any clusters we
1610                                          * haven't yet consumed.
1611                                          */
1612                                         if (freelist == NULL &&
1613                                             bytes_to_copy > NBPG && jumbocl) {
1614                                                 num_needed =
1615                                                     bytes_to_copy / M16KCLBYTES;
1616
1617                                                 if ((bytes_to_copy -
1618                                                     (num_needed * M16KCLBYTES))
1619                                                     >= MINCLSIZE)
1620                                                         num_needed++;
1621
1622                                                 freelist =
1623                                                     m_getpackets_internal(
1624                                                     (unsigned int *)&num_needed,
1625                                                     hdrs_needed, M_WAIT, 0,
1626                                                     M16KCLBYTES);
1627                                                 /*
1628                                                  * Fall back to 4K cluster size
1629                                                  * if allocation failed
1630                                                  */
1631                                         }
1632
1633                                         if (freelist == NULL &&
1634                                             bytes_to_copy > MCLBYTES) {
1635                                                 num_needed =
1636                                                     bytes_to_copy / NBPG;
1637
1638                                                 if ((bytes_to_copy -
1639                                                     (num_needed * NBPG)) >=
1640                                                     MINCLSIZE)
1641                                                         num_needed++;
1642
1643                                                 freelist =
1644                                                     m_getpackets_internal(
1645                                                     (unsigned int *)&num_needed,
1646                                                     hdrs_needed, M_WAIT, 0,
1647                                                     NBPG);
1648                                                 /*
1649                                                  * Fall back to cluster size
1650                                                  * if allocation failed
1651                                                  */
1652                                         }
1653
1654                                         if (freelist == NULL &&
1655                                             bytes_to_copy > MINCLSIZE) {
1656                                                 num_needed =
1657                                                     bytes_to_copy / MCLBYTES;
1658
1659                                                 if ((bytes_to_copy -
1660                                                     (num_needed * MCLBYTES)) >=
1661                                                     MINCLSIZE)
1662                                                         num_needed++;
1663
1664                                                 freelist =
1665                                                     m_getpackets_internal(
1666                                                     (unsigned int *)&num_needed,
1667                                                     hdrs_needed, M_WAIT, 0,
1668                                                     MCLBYTES);
1669                                                 /*
1670                                                  * Fall back to a single mbuf
1671                                                  * if allocation failed
1672                                                  */
1673                                         }
1674
1675                                         if (freelist == NULL) {
1676                                                 if (top == 0)
1677                                                         MGETHDR(freelist,
1678                                                             M_WAIT, MT_DATA);
1679                                                 else
1680                                                         MGET(freelist,
1681                                                             M_WAIT, MT_DATA);
1682
1683                                                 if (freelist == NULL) {
1684                                                         error = ENOBUFS;
1685                                                         socket_lock(so, 0);
1686                                                         goto release;
1687                                                 }
1688                                                 /*
1689                                                  * For datagram protocols,
1690                                                  * leave room for protocol
1691                                                  * headers in first mbuf.
1692                                                  */
1693                                                 if (atomic && top == 0 &&
1694                                                     bytes_to_copy < MHLEN) {
1695                                                         MH_ALIGN(freelist,
1696                                                             bytes_to_copy);
1697                                                 }
1698                                         }
1699                                         m = freelist;
1700                                         freelist = m->m_next;
1701                                         m->m_next = NULL;
1702
1703                                         if ((m->m_flags & M_EXT))
1704                                                 mlen = m->m_ext.ext_size;
1705                                         else if ((m->m_flags & M_PKTHDR))
1706                                                 mlen =
1707                                                     MHLEN - m_leadingspace(m);
1708                                         else
1709                                                 mlen = MLEN;
1710                                         len = imin(mlen, bytes_to_copy);
1711
1712                                         chainlength += len;
1713
1714                                         space -= len;
1715
1716                                         error = uiomove(mtod(m, caddr_t),
1717                                             len, uio);
1718
1719                                         resid = uio_resid(uio);
1720
1721                                         m->m_len = len;
1722                                         *mp = m;
1723                                         top->m_pkthdr.len += len;
1724                                         if (error)
1725                                                 break;
1726                                         mp = &m->m_next;
1727                                         if (resid <= 0) {
1728                                                 if (flags & MSG_EOR)
1729                                                         top->m_flags |= M_EOR;
1730                                                 break;
1731                                         }
1732                                         bytes_to_copy = min(resid, space);
1733
1734                                 } while (space > 0 &&
1735                                     (chainlength < sosendmaxchain || atomic ||
1736                                     resid < MINCLSIZE));
1737
1738                                 socket_lock(so, 0);
1739
1740                                 if (error)
1741                                         goto release;
1742                         }
1743
1744                         if (flags & (MSG_HOLD|MSG_SEND)) {
1745                                 /* Enqueue for later, go away if HOLD */
1746                                 register struct mbuf *mb1;
1747                                 if (so->so_temp && (flags & MSG_FLUSH)) {
1748                                         m_freem(so->so_temp);
1749                                         so->so_temp = NULL;
1750                                 }
1751                                 if (so->so_temp)
1752                                         so->so_tail->m_next = top;
1753                                 else
1754                                         so->so_temp = top;
1755                                 mb1 = top;
1756                                 while (mb1->m_next)
1757                                         mb1 = mb1->m_next;
1758                                 so->so_tail = mb1;
1759                                 if (flags & MSG_HOLD) {
1760                                         top = NULL;
1761                                         goto release;
1762                                 }
1763                                 top = so->so_temp;
1764                         }
1765                         if (dontroute)
1766                                 so->so_options |= SO_DONTROUTE;
1767
1768                         /* Compute flags here, for pru_send and NKEs */
1769                         sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1770                             /*
1771                              * If the user set MSG_EOF, the protocol
1772                              * understands this flag and nothing left to
1773                              * send then use PRU_SEND_EOF instead of PRU_SEND.
1774                              */
1775                             ((flags & MSG_EOF) &&
1776                              (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1777                              (resid <= 0)) ?
1778                                 PRUS_EOF :
1779                             /* If there is more to send set PRUS_MORETOCOME */
1780                             (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1781
1782                         /*
1783                          * Socket filter processing
1784                          */
1785                         recursive = (so->so_send_filt_thread != NULL);
1786                         filtered = 0;
1787                         error = 0;
1788                         for (filter = so->so_filt; filter && (error == 0);
1789                             filter = filter->sfe_next_onsocket) {
1790                                 if (filter->sfe_filter->sf_filter.sf_data_out) {
1791                                         int so_flags = 0;
1792                                         if (filtered == 0) {
1793                                                 filtered = 1;
1794                                                 so->so_send_filt_thread =
1795                                                     current_thread();
1796                                                 sflt_use(so);
1797                                                 socket_unlock(so, 0);
1798                                                 so_flags =
1799                                                     (sendflags & MSG_OOB) ?
1800                                                     sock_data_filt_flag_oob : 0;
1801                                         }
1802                                         error = filter->sfe_filter->sf_filter.
1803                                             sf_data_out(filter->sfe_cookie, so,
1804                                             addr, &top, &control, so_flags);
1805                                 }
1806                         }
1807
1808                         if (filtered) {
1809                                 /*
1810                                  * At this point, we've run at least one
1811                                  * filter.  The socket is unlocked as is
1812                                  * the socket buffer.  Clear the recorded
1813                                  * filter thread only when we are outside
1814                                  * of a filter's context.  This allows for
1815                                  * a filter to issue multiple inject calls
1816                                  * from its sf_data_out callback routine.
1817                                  */
1818                                 socket_lock(so, 0);
1819                                 sflt_unuse(so);
1820                                 if (!recursive)
1821                                         so->so_send_filt_thread = 0;
1822                                 if (error) {
1823                                         if (error == EJUSTRETURN) {
1824                                                 error = 0;
1825                                                 clen = 0;
1826                                                 control = 0;
1827                                                 top = 0;
1828                                         }
1829
1830                                         goto release;
1831                                 }
1832                         }
1833                         /*
1834                          * End Socket filter processing
1835                          */
1836
1837                         if (error == EJUSTRETURN) {
1838                                 /* A socket filter handled this data */
1839                                 error = 0;
1840                         } else {
1841                                 error = (*so->so_proto->pr_usrreqs->pru_send)
1842                                     (so, sendflags, top, addr, control, p);
1843                         }
1844 #ifdef __APPLE__
1845                         if (flags & MSG_SEND)
1846                                 so->so_temp = NULL;
1847 #endif
1848                         if (dontroute)
1849                                 so->so_options &= ~SO_DONTROUTE;
1850
1851                         clen = 0;
1852                         control = 0;
1853                         top = 0;
1854                         mp = &top;
1855                         if (error)
1856                                 goto release;
1857                 } while (resid && space > 0);
1858         } while (resid);
1859
1860 release:
1861         if (sblocked)
1862                 sbunlock(&so->so_snd, 0);       /* will unlock socket */
1863         else
1864                 socket_unlock(so, 1);
1865 out:
1866         if (top)
1867                 m_freem(top);
1868         if (control)
1869                 m_freem(control);
1870         if (freelist)
1871                 m_freem_list(freelist);
1872
1873         KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
1874             space, error);
1875
1876         return (error);
1877 }
1878
1879 /*
1880  * Implement receive operations on a socket.
1881  * We depend on the way that records are added to the sockbuf
1882  * by sbappend*.  In particular, each record (mbufs linked through m_next)
1883  * must begin with an address if the protocol so specifies,
1884  * followed by an optional mbuf or mbufs containing ancillary data,
1885  * and then zero or more mbufs of data.
1886  * In order to avoid blocking network interrupts for the entire time here,
1887  * we splx() while doing the actual copy to user space.
1888  * Although the sockbuf is locked, new data may still be appended,
1889  * and thus we must maintain consistency of the sockbuf during that time.
1890  *
1891  * The caller may receive the data as a single mbuf chain by supplying
1892  * an mbuf **mp0 for use in returning the chain.  The uio is then used
1893  * only for the count in uio_resid.
1894  *
1895  * Returns:     0                       Success
1896  *              ENOBUFS
1897  *              ENOTCONN
1898  *              EWOULDBLOCK
1899  *      uiomove:EFAULT
1900  *      sblock:EWOULDBLOCK
1901  *      sblock:EINTR
1902  *      sbwait:EBADF
1903  *      sbwait:EINTR
1904  *      sodelayed_copy:EFAULT
1905  *      <pru_rcvoob>:EINVAL[TCP]
1906  *      <pru_rcvoob>:EWOULDBLOCK[TCP]
1907  *      <pru_rcvoob>:???
1908  *      <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1909  *      <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1910  *      <pr_domain->dom_externalize>:???
1911  *
1912  * Notes:       Additional return values from calls through <pru_rcvoob> and
1913  *              <pr_domain->dom_externalize> depend on protocols other than
1914  *              TCP or AF_UNIX, which are documented above.
1915  */
1916 int
1917 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1918     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1919 {
1920         register struct mbuf *m, **mp, *ml = NULL;
1921         register int flags, len, error, offset;
1922         struct protosw *pr = so->so_proto;
1923         struct mbuf *nextrecord;
1924         int moff, type = 0;
1925         int orig_resid = uio_resid(uio);
1926         struct mbuf *free_list;
1927         int delayed_copy_len;
1928         int can_delay;
1929         int need_event;
1930         struct proc *p = current_proc();
1931
1932         // LP64todo - fix this!
1933         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1934             so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1935
1936         socket_lock(so, 1);
1937
1938 #ifdef MORE_LOCKING_DEBUG
1939         if (so->so_usecount == 1)
1940                 panic("soreceive: so=%x no other reference on socket\n", so);
1941 #endif
1942         mp = mp0;
1943         if (psa)
1944                 *psa = 0;
1945         if (controlp)
1946                 *controlp = 0;
1947         if (flagsp)
1948                 flags = *flagsp &~ MSG_EOR;
1949         else
1950                 flags = 0;
1951
1952         /*
1953          * If a recv attempt is made on a previously-accepted socket
1954          * that has been marked as inactive (disconnected), reject
1955          * the request.
1956          */
1957         if (so->so_flags & SOF_DEFUNCT) {
1958                 struct sockbuf *sb = &so->so_rcv;
1959
1960                 /*
1961                  * This socket should have been disconnected and flushed
1962                  * prior to being returned from accept; there should be
1963                  * no data on its receive list, so panic otherwise.
1964                  */
1965                 sb_empty_assert(sb, __func__);
1966                 socket_unlock(so, 1);
1967                 return (ENOTCONN);
1968         }
1969
1970         /*
1971          * When SO_WANTOOBFLAG is set we try to get out-of-band data
1972          * regardless of the flags argument. Here is the case were
1973          * out-of-band data is not inline.
1974          */
1975         if ((flags & MSG_OOB) ||
1976             ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1977             (so->so_options & SO_OOBINLINE) == 0 &&
1978             (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1979                 m = m_get(M_WAIT, MT_DATA);
1980                 if (m == NULL) {
1981                         socket_unlock(so, 1);
1982                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1983                             ENOBUFS, 0, 0, 0, 0);
1984                         return (ENOBUFS);
1985                 }
1986                 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1987                 if (error)
1988                         goto bad;
1989                 socket_unlock(so, 0);
1990                 do {
1991                         error = uiomove(mtod(m, caddr_t),
1992                             imin(uio_resid(uio), m->m_len), uio);
1993                         m = m_free(m);
1994                 } while (uio_resid(uio) && error == 0 && m);
1995                 socket_lock(so, 0);
1996 bad:
1997                 if (m)
1998                         m_freem(m);
1999 #ifdef __APPLE__
2000                 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2001                         if (error == EWOULDBLOCK || error == EINVAL) {
2002                                 /*
2003                                  * Let's try to get normal data:
2004                                  * EWOULDBLOCK: out-of-band data not
2005                                  * receive yet. EINVAL: out-of-band data
2006                                  * already read.
2007                                  */
2008                                 error = 0;
2009                                 goto nooob;
2010                         } else if (error == 0 && flagsp) {
2011                                 *flagsp |= MSG_OOB;
2012                         }
2013                 }
2014                 socket_unlock(so, 1);
2015                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2016                     0, 0, 0, 0);
2017 #endif
2018                 return (error);
2019         }
2020 nooob:
2021         if (mp)
2022                 *mp = (struct mbuf *)0;
2023         if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2024                 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2025
2026
2027         free_list = (struct mbuf *)0;
2028         delayed_copy_len = 0;
2029 restart:
2030 #ifdef MORE_LOCKING_DEBUG
2031         if (so->so_usecount <= 1)
2032                 printf("soreceive: sblock so=%p ref=%d on socket\n",
2033                     so, so->so_usecount);
2034 #endif
2035         /*
2036          * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2037          * and if so just return to the caller.  This could happen when
2038          * soreceive() is called by a socket upcall function during the
2039          * time the socket is freed.  The socket buffer would have been
2040          * locked across the upcall, therefore we cannot put this thread
2041          * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2042          * we may livelock), because the lock on the socket buffer will
2043          * only be released when the upcall routine returns to its caller.
2044          * Because the socket has been officially closed, there can be
2045          * no further read on it.
2046          */
2047         if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2048             (SS_NOFDREF | SS_CANTRCVMORE)) {
2049                 socket_unlock(so, 1);
2050                 return (0);
2051         }
2052
2053         error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2054         if (error) {
2055                 socket_unlock(so, 1);
2056                 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2057                     0, 0, 0, 0);
2058                 return (error);
2059         }
2060
2061         m = so->so_rcv.sb_mb;
2062         /*
2063          * If we have less data than requested, block awaiting more
2064          * (subject to any timeout) if:
2065          *   1. the current count is less than the low water mark, or
2066          *   2. MSG_WAITALL is set, and it is possible to do the entire
2067          *      receive operation at once if we block (resid <= hiwat).
2068          *   3. MSG_DONTWAIT is not set
2069          * If MSG_WAITALL is set but resid is larger than the receive buffer,
2070          * we have to do the receive in sections, and thus risk returning
2071          * a short count if a timeout or signal occurs after we start.
2072          */
2073         if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2074             so->so_rcv.sb_cc < uio_resid(uio)) &&
2075             (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2076             ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2077             m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2078                 /*
2079                  * Panic if we notice inconsistencies in the socket's
2080                  * receive list; both sb_mb and sb_cc should correctly
2081                  * reflect the contents of the list, otherwise we may
2082                  * end up with false positives during select() or poll()
2083                  * which could put the application in a bad state.
2084                  */
2085                 if (m == NULL && so->so_rcv.sb_cc != 0)
2086                         panic("soreceive corrupted so_rcv: m %p cc %u",
2087                             m, so->so_rcv.sb_cc);
2088
2089                 if (so->so_error) {
2090                         if (m)
2091                                 goto dontblock;
2092                         error = so->so_error;
2093                         if ((flags & MSG_PEEK) == 0)
2094                                 so->so_error = 0;
2095                         goto release;
2096                 }
2097                 if (so->so_state & SS_CANTRCVMORE) {
2098                         if (m)
2099                                 goto dontblock;
2100                         else
2101                                 goto release;
2102                 }
2103                 for (; m; m = m->m_next)
2104                         if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2105                                 m = so->so_rcv.sb_mb;
2106                                 goto dontblock;
2107                         }
2108                 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2109                     (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2110                         error = ENOTCONN;
2111                         goto release;
2112                 }
2113                 if (uio_resid(uio) == 0)
2114                         goto release;
2115                 if ((so->so_state & SS_NBIO) ||
2116                     (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2117                         error = EWOULDBLOCK;
2118                         goto release;
2119                 }
2120                 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2121                 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2122                 sbunlock(&so->so_rcv, 1);
2123 #if EVEN_MORE_LOCKING_DEBUG
2124                 if (socket_debug)
2125                         printf("Waiting for socket data\n");
2126 #endif
2127
2128                 error = sbwait(&so->so_rcv);
2129 #if EVEN_MORE_LOCKING_DEBUG
2130                 if (socket_debug)
2131                         printf("SORECEIVE - sbwait returned %d\n", error);
2132 #endif
2133                 if (so->so_usecount < 1)
2134                         panic("soreceive: after 2nd sblock so=%p ref=%d on "
2135                             "socket\n", so, so->so_usecount);
2136                 if (error) {
2137                         socket_unlock(so, 1);
2138                         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2139                             0, 0, 0, 0);
2140                         return (error);
2141                 }
2142                 goto restart;
2143         }
2144 dontblock:
2145         OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2146         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2147         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2148         nextrecord = m->m_nextpkt;
2149         if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2150                 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2151 #if CONFIG_MACF_SOCKET_SUBSET
2152                 /*
2153                  * Call the MAC framework for policy checking if we're in
2154                  * the user process context and the socket isn't connected.
2155                  */
2156                 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2157                         struct mbuf *m0 = m;
2158                         /*
2159                          * Dequeue this record (temporarily) from the receive
2160                          * list since we're about to drop the socket's lock
2161                          * where a new record may arrive and be appended to
2162                          * the list.  Upon MAC policy failure, the record
2163                          * will be freed.  Otherwise, we'll add it back to
2164                          * the head of the list.  We cannot rely on SB_LOCK
2165                          * because append operation uses the socket's lock.
2166                          */
2167                         do {
2168                                 m->m_nextpkt = NULL;
2169                                 sbfree(&so->so_rcv, m);
2170                                 m = m->m_next;
2171                         } while (m != NULL);
2172                         m = m0;
2173                         so->so_rcv.sb_mb = nextrecord;
2174                         SB_EMPTY_FIXUP(&so->so_rcv);
2175                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2176                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2177                         socket_unlock(so, 0);
2178                         if (mac_socket_check_received(proc_ucred(p), so,
2179                             mtod(m, struct sockaddr *)) != 0) {
2180                                 /*
2181                                  * MAC policy failure; free this record and
2182                                  * process the next record (or block until
2183                                  * one is available).  We have adjusted sb_cc
2184                                  * and sb_mbcnt above so there is no need to
2185                                  * call sbfree() again.
2186                                  */
2187                                 do {
2188                                         m = m_free(m);
2189                                 } while (m != NULL);
2190                                 /*
2191                                  * Clear SB_LOCK but don't unlock the socket.
2192                                  * Process the next record or wait for one.
2193                                  */
2194                                 socket_lock(so, 0);
2195                                 sbunlock(&so->so_rcv, 1);
2196                                 goto restart;
2197                         }
2198                         socket_lock(so, 0);
2199                         /*
2200                          * Re-adjust the socket receive list and re-enqueue
2201                          * the record in front of any packets which may have
2202                          * been appended while we dropped the lock.
2203                          */
2204                         for (m = m0; m->m_next != NULL; m = m->m_next)
2205                                 sballoc(&so->so_rcv, m);
2206                         sballoc(&so->so_rcv, m);
2207                         if (so->so_rcv.sb_mb == NULL) {
2208                                 so->so_rcv.sb_lastrecord = m0;
2209                                 so->so_rcv.sb_mbtail = m;
2210                         }
2211                         m = m0;
2212                         nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2213                         so->so_rcv.sb_mb = m;
2214                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2215                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2216                 }
2217 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2218                 orig_resid = 0;
2219                 if (psa) {
2220                         *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2221                             mp0 == 0);
2222                         if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2223                                 error = EWOULDBLOCK;
2224                                 goto release;
2225                         }
2226                 }
2227                 if (flags & MSG_PEEK) {
2228                         m = m->m_next;
2229                 } else {
2230                         sbfree(&so->so_rcv, m);
2231                         if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2232                                 panic("soreceive: about to create invalid "
2233                                     "socketbuf");
2234                         MFREE(m, so->so_rcv.sb_mb);
2235                         m = so->so_rcv.sb_mb;
2236                         if (m != NULL) {
2237                                 m->m_nextpkt = nextrecord;
2238                         } else {
2239                                 so->so_rcv.sb_mb = nextrecord;
2240                                 SB_EMPTY_FIXUP(&so->so_rcv);
2241                         }
2242                 }
2243         }
2244
2245         /*
2246          * Process one or more MT_CONTROL mbufs present before any data mbufs
2247          * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
2248          * just copy the data; if !MSG_PEEK, we call into the protocol to
2249          * perform externalization.
2250          */
2251         if (m != NULL && m->m_type == MT_CONTROL) {
2252                 struct mbuf *cm = NULL, *cmn;
2253                 struct mbuf **cme = &cm;
2254                 struct sockbuf *sb_rcv = &so->so_rcv;
2255
2256                 /*
2257                  * Externalizing the control messages would require us to
2258                  * drop the socket's lock below.  Once we re-acquire the
2259                  * lock, the mbuf chain might change.  In order to preserve
2260                  * consistency, we unlink all control messages from the
2261                  * first mbuf chain in one shot and link them separately
2262                  * onto a different chain.
2263                  */
2264                 do {
2265                         if (flags & MSG_PEEK) {
2266                                 if (controlp != NULL) {
2267                                         *controlp = m_copy(m, 0, m->m_len);
2268                                         controlp = &(*controlp)->m_next;
2269                                 }
2270                                 m = m->m_next;
2271                         } else {
2272                                 m->m_nextpkt = NULL;
2273                                 sbfree(sb_rcv, m);
2274                                 sb_rcv->sb_mb = m->m_next;
2275                                 m->m_next = NULL;
2276                                 *cme = m;
2277                                 cme = &(*cme)->m_next;
2278                                 m = sb_rcv->sb_mb;
2279                         }
2280                 } while (m != NULL && m->m_type == MT_CONTROL);
2281
2282                 if (!(flags & MSG_PEEK)) {
2283                         if (sb_rcv->sb_mb != NULL) {
2284                                 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2285                         } else {
2286                                 sb_rcv->sb_mb = nextrecord;
2287                                 SB_EMPTY_FIXUP(sb_rcv);
2288                         }
2289                         if (nextrecord == NULL)
2290                                 sb_rcv->sb_lastrecord = m;
2291                 }
2292
2293                 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2294                 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2295
2296                 while (cm != NULL) {
2297                         int cmsg_type;
2298
2299                         cmn = cm->m_next;
2300                         cm->m_next = NULL;
2301                         cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2302
2303                         /*
2304                          * Call the protocol to externalize SCM_RIGHTS message
2305                          * and return the modified message to the caller upon
2306                          * success.  Otherwise, all other control messages are
2307                          * returned unmodified to the caller.  Note that we
2308                          * only get into this loop if MSG_PEEK is not set.
2309                          */
2310                         if (pr->pr_domain->dom_externalize != NULL &&
2311                             cmsg_type == SCM_RIGHTS) {
2312                                 /*
2313                                  * Release socket lock: see 3903171.  This
2314                                  * would also allow more records to be appended
2315                                  * to the socket buffer.  We still have SB_LOCK
2316                                  * set on it, so we can be sure that the head
2317                                  * of the mbuf chain won't change.
2318                                  */
2319                                 socket_unlock(so, 0);
2320                                 error = (*pr->pr_domain->dom_externalize)(cm);
2321                                 socket_lock(so, 0);
2322                         } else {
2323                                 error = 0;
2324                         }
2325
2326                         if (controlp != NULL && error == 0) {
2327                                 *controlp = cm;
2328                                 controlp = &(*controlp)->m_next;
2329                                 orig_resid = 0;
2330                         } else {
2331                                 (void) m_free(cm);
2332                         }
2333                         cm = cmn;
2334                 }
2335                 orig_resid = 0;
2336                 if (sb_rcv->sb_mb != NULL)
2337                         nextrecord = sb_rcv->sb_mb->m_nextpkt;
2338                 else
2339                         nextrecord = NULL;
2340         }
2341
2342         if (m != NULL) {
2343                 if (!(flags & MSG_PEEK)) {
2344                         /*
2345                          * We get here because m points to an mbuf following
2346                          * any MT_SONAME or MT_CONTROL mbufs which have been
2347                          * processed above.  In any case, m should be pointing
2348                          * to the head of the mbuf chain, and the nextrecord
2349                          * should be either NULL or equal to m->m_nextpkt.
2350                          * See comments above about SB_LOCK.
2351                          */
2352                         if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2353                                 panic("soreceive: post-control !sync so=%p "
2354                                     "m=%p nextrecord=%p\n", so, m, nextrecord);
2355
2356                         if (nextrecord == NULL)
2357                                 so->so_rcv.sb_lastrecord = m;
2358                 }
2359                 type = m->m_type;
2360                 if (type == MT_OOBDATA)
2361                         flags |= MSG_OOB;
2362         } else {
2363                 if (!(flags & MSG_PEEK)) {
2364                         so->so_rcv.sb_mb = nextrecord;
2365                         SB_EMPTY_FIXUP(&so->so_rcv);
2366                 }
2367         }
2368         SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2369         SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2370
2371         moff = 0;
2372         offset = 0;
2373
2374         if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2375                 can_delay = 1;
2376         else
2377                 can_delay = 0;
2378
2379         need_event = 0;
2380
2381         while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2382                 if (m->m_type == MT_OOBDATA) {
2383                         if (type != MT_OOBDATA)
2384                                 break;
2385                 } else if (type == MT_OOBDATA) {
2386                         break;
2387                 }
2388                 /*
2389                  * Make sure to allways set MSG_OOB event when getting
2390                  * out of band data inline.
2391                  */
2392                 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2393                     (so->so_options & SO_OOBINLINE) != 0 &&
2394                     (so->so_state & SS_RCVATMARK) != 0) {
2395                         flags |= MSG_OOB;
2396                 }
2397                 so->so_state &= ~SS_RCVATMARK;
2398                 len = uio_resid(uio) - delayed_copy_len;
2399                 if (so->so_oobmark && len > so->so_oobmark - offset)
2400                         len = so->so_oobmark - offset;
2401                 if (len > m->m_len - moff)
2402                         len = m->m_len - moff;
2403                 /*
2404                  * If mp is set, just pass back the mbufs.
2405                  * Otherwise copy them out via the uio, then free.
2406                  * Sockbuf must be consistent here (points to current mbuf,
2407                  * it points to next record) when we drop priority;
2408                  * we must note any additions to the sockbuf when we
2409                  * block interrupts again.
2410                  */
2411                 if (mp == 0) {
2412                         SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2413                         SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2414                         if (can_delay && len == m->m_len) {
2415                                 /*
2416                                  * only delay the copy if we're consuming the
2417                                  * mbuf and we're NOT in MSG_PEEK mode
2418                                  * and we have enough data to make it worthwile
2419                                  * to drop and retake the lock... can_delay
2420                                  * reflects the state of the 2 latter
2421                                  * constraints moff should always be zero
2422                                  * in these cases
2423                                  */
2424                                 delayed_copy_len += len;
2425                         } else {
2426                                 if (delayed_copy_len) {
2427                                         error = sodelayed_copy(so, uio,
2428                                             &free_list, &delayed_copy_len);
2429
2430                                         if (error) {
2431                                                 goto release;
2432                                         }
2433                                         /*
2434                                          * can only get here if MSG_PEEK is not
2435                                          * set therefore, m should point at the
2436                                          * head of the rcv queue; if it doesn't,
2437                                          * it means something drastically
2438                                          * changed while we were out from behind
2439                                          * the lock in sodelayed_copy. perhaps
2440                                          * a RST on the stream. in any event,
2441                                          * the stream has been interrupted. it's
2442                                          * probably best just to return whatever
2443                                          * data we've moved and let the caller
2444                                          * sort it out...
2445                                          */
2446                                         if (m != so->so_rcv.sb_mb) {
2447                                                 break;
2448                                         }
2449                                 }
2450                                 socket_unlock(so, 0);
2451                                 error = uiomove(mtod(m, caddr_t) + moff,
2452                                     (int)len, uio);
2453                                 socket_lock(so, 0);
2454
2455                                 if (error)
2456                                         goto release;
2457                         }
2458                 } else {
2459                         uio_setresid(uio, (uio_resid(uio) - len));
2460                 }
2461                 if (len == m->m_len - moff) {
2462                         if (m->m_flags & M_EOR)
2463                                 flags |= MSG_EOR;
2464                         if (flags & MSG_PEEK) {
2465                                 m = m->m_next;
2466                                 moff = 0;
2467                         } else {
2468                                 nextrecord = m->m_nextpkt;
2469                                 sbfree(&so->so_rcv, m);
2470                                 m->m_nextpkt = NULL;
2471
2472                                 if (mp) {
2473                                         *mp = m;
2474                                         mp = &m->m_next;
2475                                         so->so_rcv.sb_mb = m = m->m_next;
2476                                         *mp = (struct mbuf *)0;
2477                                 } else {
2478                                         if (free_list == NULL)
2479                                                 free_list = m;
2480                                         else
2481                                                 ml->m_next = m;
2482                                         ml = m;
2483                                         so->so_rcv.sb_mb = m = m->m_next;
2484                                         ml->m_next = 0;
2485                                 }
2486                                 if (m != NULL) {
2487                                         m->m_nextpkt = nextrecord;
2488                                         if (nextrecord == NULL)
2489                                                 so->so_rcv.sb_lastrecord = m;
2490                                 } else {
2491                                         so->so_rcv.sb_mb = nextrecord;
2492                                         SB_EMPTY_FIXUP(&so->so_rcv);
2493                                 }
2494                                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2495                                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2496                         }
2497                 } else {
2498                         if (flags & MSG_PEEK) {
2499                                 moff += len;
2500                         } else {
2501                                 if (mp)
2502                                         *mp = m_copym(m, 0, len, M_WAIT);
2503                                 m->m_data += len;
2504                                 m->m_len -= len;
2505                                 so->so_rcv.sb_cc -= len;
2506                         }
2507                 }
2508                 if (so->so_oobmark) {
2509                         if ((flags & MSG_PEEK) == 0) {
2510                                 so->so_oobmark -= len;
2511                                 if (so->so_oobmark == 0) {
2512                                         so->so_state |= SS_RCVATMARK;
2513                                         /*
2514                                          * delay posting the actual event until
2515                                          * after any delayed copy processing
2516                                          * has finished
2517                                          */
2518                                         need_event = 1;
2519                                         break;
2520                                 }
2521                         } else {
2522                                 offset += len;
2523                                 if (offset == so->so_oobmark)
2524                                         break;
2525                         }
2526                 }
2527                 if (flags & MSG_EOR)
2528                         break;
2529                 /*
2530                  * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2531                  * (for non-atomic socket), we must not quit until
2532                  * "uio->uio_resid == 0" or an error termination.
2533                  * If a signal/timeout occurs, return with a short
2534                  * count but without error.  Keep sockbuf locked
2535                  * against other readers.
2536                  */
2537                 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2538                     (uio_resid(uio) - delayed_copy_len) > 0 &&
2539                     !sosendallatonce(so) && !nextrecord) {
2540                         if (so->so_error || so->so_state & SS_CANTRCVMORE)
2541                                 goto release;
2542
2543                         /*
2544                          * Depending on the protocol (e.g. TCP), the following
2545                          * might cause the socket lock to be dropped and later
2546                          * be reacquired, and more data could have arrived and
2547                          * have been appended to the receive socket buffer by
2548                          * the time it returns.  Therefore, we only sleep in
2549                          * sbwait() below if and only if the socket buffer is
2550                          * empty, in order to avoid a false sleep.
2551                          */
2552                         if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2553                             (((struct inpcb *)so->so_pcb)->inp_state !=
2554                             INPCB_STATE_DEAD))
2555                                 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2556
2557                         SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2558                         SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2559
2560                         if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2561                                 error = 0;
2562                                 goto release;
2563                         }
2564                         /*
2565                          * have to wait until after we get back from the sbwait
2566                          * to do the copy because we will drop the lock if we
2567                          * have enough data that has been delayed... by dropping
2568                          * the lock we open up a window allowing the netisr
2569                          * thread to process the incoming packets and to change
2570                          * the state of this socket... we're issuing the sbwait
2571                          * because the socket is empty and we're expecting the
2572                          * netisr thread to wake us up when more packets arrive;
2573                          * if we allow that processing to happen and then sbwait
2574                          * we could stall forever with packets sitting in the
2575                          * socket if no further packets arrive from the remote
2576                          * side.
2577                          *
2578                          * we want to copy before we've collected all the data
2579                          * to satisfy this request to allow the copy to overlap
2580                          * the incoming packet processing on an MP system
2581                          */
2582                         if (delayed_copy_len > sorecvmincopy &&
2583                             (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2584                                 error = sodelayed_copy(so, uio,
2585                                     &free_list, &delayed_copy_len);
2586
2587                                 if (error)
2588                                         goto release;
2589                         }
2590                         m = so->so_rcv.sb_mb;
2591                         if (m) {
2592                                 nextrecord = m->m_nextpkt;
2593                         }
2594                 }
2595         }
2596 #ifdef MORE_LOCKING_DEBUG
2597         if (so->so_usecount <= 1)
2598                 panic("soreceive: after big while so=%p ref=%d on socket\n",
2599                     so, so->so_usecount);
2600 #endif
2601
2602         if (m && pr->pr_flags & PR_ATOMIC) {
2603 #ifdef __APPLE__
2604                 if (so->so_options & SO_DONTTRUNC) {
2605                         flags |= MSG_RCVMORE;
2606                 } else {
2607 #endif
2608                         flags |= MSG_TRUNC;
2609                         if ((flags & MSG_PEEK) == 0)
2610                                 (void) sbdroprecord(&so->so_rcv);
2611 #ifdef __APPLE__
2612                 }
2613 #endif
2614         }
2615
2616         /*
2617          * pru_rcvd below (for TCP) may cause more data to be received
2618          * if the socket lock is dropped prior to sending the ACK; some
2619          * legacy OpenTransport applications don't handle this well
2620          * (if it receives less data than requested while MSG_HAVEMORE
2621          * is set), and so we set the flag now based on what we know
2622          * prior to calling pru_rcvd.
2623          */
2624         if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2625                 flags |= MSG_HAVEMORE;
2626
2627         if ((flags & MSG_PEEK) == 0) {
2628                 if (m == 0) {
2629                         so->so_rcv.sb_mb = nextrecord;
2630                         /*
2631                          * First part is an inline SB_EMPTY_FIXUP().  Second
2632                          * part makes sure sb_lastrecord is up-to-date if
2633                          * there is still data in the socket buffer.
2634                          */
2635                         if (so->so_rcv.sb_mb == NULL) {
2636                                 so->so_rcv.sb_mbtail = NULL;
2637                                 so->so_rcv.sb_lastrecord = NULL;
2638                         } else if (nextrecord->m_nextpkt == NULL) {
2639                                 so->so_rcv.sb_lastrecord = nextrecord;
2640                         }
2641                 }
2642                 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2643                 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2644                 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2645                         (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2646         }
2647 #ifdef __APPLE__
2648         if (delayed_copy_len) {
2649                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2650
2651                 if (error)
2652                         goto release;
2653         }
2654         if (free_list) {
2655                 m_freem_list((struct mbuf *)free_list);
2656                 free_list = (struct mbuf *)0;
2657         }
2658         if (need_event)
2659                 postevent(so, 0, EV_OOB);
2660 #endif
2661         if (orig_resid == uio_resid(uio) && orig_resid &&
2662             (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2663                 sbunlock(&so->so_rcv, 1);
2664                 goto restart;
2665         }
2666
2667         if (flagsp)
2668                 *flagsp |= flags;
2669 release:
2670 #ifdef MORE_LOCKING_DEBUG
2671         if (so->so_usecount <= 1)
2672                 panic("soreceive: release so=%p ref=%d on socket\n",
2673                     so, so->so_usecount);
2674 #endif
2675         if (delayed_copy_len) {
2676                 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2677         }
2678         if (free_list) {
2679                 m_freem_list((struct mbuf *)free_list);
2680         }
2681         sbunlock(&so->so_rcv, 0);       /* will unlock socket */
2682
2683         // LP64todo - fix this!
2684         KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2685             so->so_rcv.sb_cc, 0, error);
2686
2687         return (error);
2688 }
2689
2690 /*
2691  * Returns:     0                       Success
2692  *      uiomove:EFAULT
2693  */
2694 static int
2695 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2696     int *resid)
2697 {
2698         int error = 0;
2699         struct mbuf *m;
2700
2701         m = *free_list;
2702
2703         socket_unlock(so, 0);
2704
2705         while (m && error == 0) {
2706
2707                 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2708
2709                 m = m->m_next;
2710         }
2711         m_freem_list(*free_list);
2712
2713         *free_list = (struct mbuf *)NULL;
2714         *resid = 0;
2715
2716         socket_lock(so, 0);
2717
2718         return (error);
2719 }
2720
2721
2722 /*
2723  * Returns:     0                       Success
2724  *              EINVAL
2725  *              ENOTCONN
2726  *      <pru_shutdown>:EINVAL
2727  *      <pru_shutdown>:EADDRNOTAVAIL[TCP]
2728  *      <pru_shutdown>:ENOBUFS[TCP]
2729  *      <pru_shutdown>:EMSGSIZE[TCP]
2730  *      <pru_shutdown>:EHOSTUNREACH[TCP]
2731  *      <pru_shutdown>:ENETUNREACH[TCP]
2732  *      <pru_shutdown>:ENETDOWN[TCP]
2733  *      <pru_shutdown>:ENOMEM[TCP]
2734  *      <pru_shutdown>:EACCES[TCP]
2735  *      <pru_shutdown>:EMSGSIZE[TCP]
2736  *      <pru_shutdown>:ENOBUFS[TCP]
2737  *      <pru_shutdown>:???[TCP]         [ignorable: mostly IPSEC/firewall/DLIL]
2738  *      <pru_shutdown>:???              [other protocol families]
2739  */
2740 int
2741 soshutdown(struct socket *so, int how)
2742 {
2743         int error;
2744
2745         switch (how) {
2746         case SHUT_RD:
2747         case SHUT_WR:
2748         case SHUT_RDWR:
2749                 socket_lock(so, 1);
2750                 if ((so->so_state &
2751                     (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2752                         error = ENOTCONN;
2753                 } else {
2754                         error = soshutdownlock(so, how);
2755                 }
2756                 socket_unlock(so, 1);
2757                 break;
2758         default:
2759                 error = EINVAL;
2760                 break;
2761         }
2762
2763         return (error);
2764 }
2765
2766 int
2767 soshutdownlock(struct socket *so, int how)
2768 {
2769         struct protosw *pr = so->so_proto;
2770         int error = 0;
2771
2772         sflt_notify(so, sock_evt_shutdown, &how);
2773
2774         if (how != SHUT_WR) {
2775                 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2776                         /* read already shut down */
2777                         error = ENOTCONN;
2778                         goto done;
2779                 }
2780                 sorflush(so);
2781                 postevent(so, 0, EV_RCLOSED);
2782         }
2783         if (how != SHUT_RD) {
2784                 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2785                         /* write already shut down */
2786                         error = ENOTCONN;
2787                         goto done;
2788                 }
2789                 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2790                 postevent(so, 0, EV_WCLOSED);
2791         }
2792 done:
2793         KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2794         return (error);
2795 }
2796
2797 void
2798 sorflush(struct socket *so)
2799 {
2800         register struct sockbuf *sb = &so->so_rcv;
2801         register struct protosw *pr = so->so_proto;
2802         struct sockbuf asb;
2803
2804 #ifdef MORE_LOCKING_DEBUG
2805         lck_mtx_t *mutex_held;
2806
2807         if (so->so_proto->pr_getlock != NULL)
2808                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2809         else
2810                 mutex_held = so->so_proto->pr_domain->dom_mtx;
2811         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2812 #endif
2813
2814         sflt_notify(so, sock_evt_flush_read, NULL);
2815
2816         sb->sb_flags |= SB_NOINTR;
2817         (void) sblock(sb, M_WAIT);
2818         socantrcvmore(so);
2819         sbunlock(sb, 1);
2820 #ifdef __APPLE__
2821         selthreadclear(&sb->sb_sel);
2822 #endif
2823         asb = *sb;
2824         bzero((caddr_t)sb, sizeof (*sb));
2825         sb->sb_so = so; /* reestablish link to socket */
2826         if (asb.sb_flags & SB_KNOTE) {
2827                 sb->sb_sel.si_note = asb.sb_sel.si_note;
2828                 sb->sb_flags = SB_KNOTE;
2829         }
2830         if (asb.sb_flags & SB_DROP)
2831                 sb->sb_flags |= SB_DROP;
2832         if (asb.sb_flags & SB_UNIX)
2833                 sb->sb_flags |= SB_UNIX;
2834         if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2835                 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2836         }
2837         sbrelease(&asb);
2838 }
2839
2840 /*
2841  * Perhaps this routine, and sooptcopyout(), below, ought to come in
2842  * an additional variant to handle the case where the option value needs
2843  * to be some kind of integer, but not a specific size.
2844  * In addition to their use here, these functions are also called by the
2845  * protocol-level pr_ctloutput() routines.
2846  *
2847  * Returns:     0                       Success
2848  *              EINVAL
2849  *      copyin:EFAULT
2850  */
2851 int
2852 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2853 {
2854         size_t  valsize;
2855
2856         /*
2857          * If the user gives us more than we wanted, we ignore it,
2858          * but if we don't get the minimum length the caller
2859          * wants, we return EINVAL.  On success, sopt->sopt_valsize
2860          * is set to however much we actually retrieved.
2861          */
2862         if ((valsize = sopt->sopt_valsize) < minlen)
2863                 return (EINVAL);
2864         if (valsize > len)
2865                 sopt->sopt_valsize = valsize = len;
2866
2867         if (sopt->sopt_p != kernproc)
2868                 return (copyin(sopt->sopt_val, buf, valsize));
2869
2870         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2871         return (0);
2872 }
2873
2874 /*
2875  * sooptcopyin_timeval
2876  *   Copy in a timeval value into tv_p, and take into account whether the
2877  *   the calling process is 64-bit or 32-bit.  Moved the sanity checking
2878  *   code here so that we can verify the 64-bit tv_sec value before we lose
2879  *   the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
2880  */
2881 static int
2882 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2883 {
2884         int                     error;
2885
2886         if (proc_is64bit(sopt->sopt_p)) {
2887                 struct user64_timeval   tv64;
2888
2889                 if (sopt->sopt_valsize < sizeof(tv64)) {
2890                         return (EINVAL);
2891                 }
2892                 sopt->sopt_valsize = sizeof(tv64);
2893                 if (sopt->sopt_p != kernproc) {
2894                         error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2895                         if (error != 0)
2896                                 return (error);
2897                 } else {
2898                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
2899                                 sizeof(tv64));
2900                 }
2901                 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2902                     || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2903                         return (EDOM);
2904                 }
2905                 tv_p->tv_sec = tv64.tv_sec;
2906                 tv_p->tv_usec = tv64.tv_usec;
2907         } else {
2908                 struct user32_timeval   tv32;
2909
2910                 if (sopt->sopt_valsize < sizeof(tv32)) {
2911                         return (EINVAL);
2912                 }
2913                 sopt->sopt_valsize = sizeof(tv32);
2914                 if (sopt->sopt_p != kernproc) {
2915                         error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2916                         if (error != 0) {
2917                                 return (error);
2918                         }
2919                 } else {
2920                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
2921                               sizeof(tv32));
2922                 }
2923 #ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
2924                 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
2925                     || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
2926                         return (EDOM);
2927                 }
2928 #endif
2929                 tv_p->tv_sec = tv32.tv_sec;
2930                 tv_p->tv_usec = tv32.tv_usec;
2931         }
2932         return (0);
2933 }
2934
2935 /*
2936  * Returns:     0                       Success
2937  *              EINVAL
2938  *              ENOPROTOOPT
2939  *              ENOBUFS
2940  *              EDOM
2941  *      sooptcopyin:EINVAL
2942  *      sooptcopyin:EFAULT
2943  *      sooptcopyin_timeval:EINVAL
2944  *      sooptcopyin_timeval:EFAULT
2945  *      sooptcopyin_timeval:EDOM
2946  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2947  *      <pr_ctloutput>:???w
2948  *      sflt_attach_private:???         [whatever a filter author chooses]
2949  *      <sf_setoption>:???              [whatever a filter author chooses]
2950  *
2951  * Notes:       Other <pru_listen> returns depend on the protocol family; all
2952  *              <sf_listen> returns depend on what the filter author causes
2953  *              their filter to return.
2954  */
2955 int
2956 sosetopt(struct socket *so, struct sockopt *sopt)
2957 {
2958         int     error, optval;
2959         struct  linger l;
2960         struct  timeval tv;
2961         struct socket_filter_entry *filter;
2962         int filtered = 0;
2963 #if CONFIG_MACF_SOCKET
2964         struct mac extmac;
2965 #endif /* MAC_SOCKET */
2966
2967         socket_lock(so, 1);
2968         if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2969             == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
2970             (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2971                 /* the socket has been shutdown, no more sockopt's */
2972                 error = EINVAL;
2973                 goto bad;
2974         }
2975
2976         if (sopt->sopt_dir != SOPT_SET) {
2977                 sopt->sopt_dir = SOPT_SET;
2978         }
2979
2980         error = 0;
2981         for (filter = so->so_filt; filter && (error == 0);
2982             filter = filter->sfe_next_onsocket) {
2983                 if (filter->sfe_filter->sf_filter.sf_setoption) {
2984                         if (filtered == 0) {
2985                                 filtered = 1;
2986                                 sflt_use(so);
2987                                 socket_unlock(so, 0);
2988                         }
2989                         error = filter->sfe_filter->sf_filter.
2990                             sf_setoption(filter->sfe_cookie, so, sopt);
2991                 }
2992         }
2993
2994         if (filtered != 0) {
2995                 socket_lock(so, 0);
2996                 sflt_unuse(so);
2997
2998                 if (error) {
2999                         if (error == EJUSTRETURN)
3000                                 error = 0;
3001                         goto bad;
3002                 }
3003         }
3004
3005         error = 0;
3006         if (sopt->sopt_level != SOL_SOCKET) {
3007                 if (so->so_proto && so->so_proto->pr_ctloutput) {
3008                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
3009                         socket_unlock(so, 1);
3010                         return (error);
3011                 }
3012                 error = ENOPROTOOPT;
3013         } else {
3014                 switch (sopt->sopt_name) {
3015                 case SO_LINGER:
3016                 case SO_LINGER_SEC:
3017                         error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3018                         if (error)
3019                                 goto bad;
3020
3021                         so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3022                             l.l_linger : l.l_linger * hz;
3023                         if (l.l_onoff)
3024                                 so->so_options |= SO_LINGER;
3025                         else
3026                                 so->so_options &= ~SO_LINGER;
3027                         break;
3028
3029                 case SO_DEBUG:
3030                 case SO_KEEPALIVE:
3031                 case SO_DONTROUTE:
3032                 case SO_USELOOPBACK:
3033                 case SO_BROADCAST:
3034                 case SO_REUSEADDR:
3035                 case SO_REUSEPORT:
3036                 case SO_OOBINLINE:
3037                 case SO_TIMESTAMP:
3038 #ifdef __APPLE__
3039                 case SO_DONTTRUNC:
3040                 case SO_WANTMORE:
3041                 case SO_WANTOOBFLAG:
3042 #endif
3043                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3044                             sizeof (optval));
3045                         if (error)
3046                                 goto bad;
3047                         if (optval)
3048                                 so->so_options |= sopt->sopt_name;
3049                         else
3050                                 so->so_options &= ~sopt->sopt_name;
3051                         break;
3052
3053                 case SO_SNDBUF:
3054                 case SO_RCVBUF:
3055                 case SO_SNDLOWAT:
3056                 case SO_RCVLOWAT:
3057                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3058                             sizeof (optval));
3059                         if (error)
3060                                 goto bad;
3061
3062                         /*
3063                          * Values < 1 make no sense for any of these
3064                          * options, so disallow them.
3065                          */
3066                         if (optval < 1) {
3067                                 error = EINVAL;
3068                                 goto bad;
3069                         }
3070
3071                         switch (sopt->sopt_name) {
3072                         case SO_SNDBUF:
3073                         case SO_RCVBUF:
3074                                 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
3075                                     &so->so_snd : &so->so_rcv,
3076                                     (u_int32_t) optval) == 0) {
3077                                         error = ENOBUFS;
3078                                         goto bad;
3079                                 }
3080                                 if (sopt->sopt_name == SO_SNDBUF)
3081                                         so->so_snd.sb_flags |= SB_USRSIZE;
3082                                 else
3083                                         so->so_rcv.sb_flags |= SB_USRSIZE;
3084                                 break;
3085
3086                         /*
3087                          * Make sure the low-water is never greater than
3088                          * the high-water.
3089                          */
3090                         case SO_SNDLOWAT:
3091                                 so->so_snd.sb_lowat =
3092                                     (optval > so->so_snd.sb_hiwat) ?
3093                                     so->so_snd.sb_hiwat : optval;
3094                                 break;
3095                         case SO_RCVLOWAT:
3096                                 so->so_rcv.sb_lowat =
3097                                     (optval > so->so_rcv.sb_hiwat) ?
3098                                     so->so_rcv.sb_hiwat : optval;
3099                                 break;
3100                         }
3101                         break;
3102
3103                 case SO_SNDTIMEO:
3104                 case SO_RCVTIMEO:
3105                         error = sooptcopyin_timeval(sopt, &tv);
3106                         if (error)
3107                                 goto bad;
3108
3109                         switch (sopt->sopt_name) {
3110                         case SO_SNDTIMEO:
3111                                 so->so_snd.sb_timeo = tv;
3112                                 break;
3113                         case SO_RCVTIMEO:
3114                                 so->so_rcv.sb_timeo = tv;
3115                                 break;
3116                         }
3117                         break;
3118
3119                 case SO_NKE:
3120                 {
3121                         struct so_nke nke;
3122
3123                         error = sooptcopyin(sopt, &nke, sizeof (nke),
3124                             sizeof (nke));
3125                         if (error)
3126                                 goto bad;
3127
3128                         error = sflt_attach_private(so, NULL,
3129                             nke.nke_handle, 1);
3130                         break;
3131                 }
3132
3133                 case SO_NOSIGPIPE:
3134                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3135                             sizeof (optval));
3136                         if (error)
3137                                 goto bad;
3138                         if (optval)
3139                                 so->so_flags |= SOF_NOSIGPIPE;
3140                         else
3141                                 so->so_flags &= ~SOF_NOSIGPIPE;
3142
3143                         break;
3144
3145                 case SO_NOADDRERR:
3146                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3147                             sizeof (optval));
3148                         if (error)
3149                                 goto bad;
3150                         if (optval)
3151                                 so->so_flags |= SOF_NOADDRAVAIL;
3152                         else
3153                                 so->so_flags &= ~SOF_NOADDRAVAIL;
3154
3155                         break;
3156
3157                 case SO_REUSESHAREUID:
3158                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3159                             sizeof (optval));
3160                         if (error)
3161                                 goto bad;
3162                         if (optval)
3163                                 so->so_flags |= SOF_REUSESHAREUID;
3164                         else
3165                                 so->so_flags &= ~SOF_REUSESHAREUID;
3166                         break;
3167 #ifdef __APPLE_API_PRIVATE
3168                 case SO_NOTIFYCONFLICT:
3169                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3170                                 error = EPERM;
3171                                 goto bad;
3172                         }
3173                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3174                             sizeof (optval));
3175                         if (error)
3176                                 goto bad;
3177                         if (optval)
3178                                 so->so_flags |= SOF_NOTIFYCONFLICT;
3179                         else
3180                                 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3181                         break;
3182 #endif
3183                 case SO_RESTRICTIONS:
3184                         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3185                                 error = EPERM;
3186                                 goto bad;
3187                         }
3188                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3189                             sizeof (optval));
3190                         if (error)
3191                                 goto bad;
3192                         so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3193                             SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3194                         break;
3195
3196                 case SO_LABEL:
3197 #if CONFIG_MACF_SOCKET
3198                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3199                             sizeof (extmac))) != 0)
3200                                 goto bad;
3201
3202                         error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3203                             so, &extmac);
3204 #else
3205                         error = EOPNOTSUPP;
3206 #endif /* MAC_SOCKET */
3207                         break;
3208
3209 #ifdef __APPLE_API_PRIVATE
3210                 case SO_UPCALLCLOSEWAIT:
3211                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3212                             sizeof (optval));
3213                         if (error)
3214                                 goto bad;
3215                         if (optval)
3216                                 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3217                         else
3218                                 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3219                         break;
3220 #endif
3221
3222                 case SO_RANDOMPORT:
3223                         error = sooptcopyin(sopt, &optval, sizeof (optval),
3224                             sizeof (optval));
3225                         if (error)
3226                                 goto bad;
3227                         if (optval)
3228                                 so->so_flags |= SOF_BINDRANDOMPORT;
3229                         else
3230                                 so->so_flags &= ~SOF_BINDRANDOMPORT;
3231                         break;
3232
3233                 case SO_NP_EXTENSIONS: {
3234                         struct so_np_extensions sonpx;
3235
3236                         error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
3237                         if (error)
3238                                 goto bad;
3239                         if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3240                                 error = EINVAL;
3241                                 goto bad;
3242                         }
3243                         /*
3244                          * Only one bit defined for now
3245                          */
3246                         if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3247                                 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3248                                         so->so_flags |= SOF_NPX_SETOPTSHUT;
3249                                 else
3250                                         so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3251                         }
3252                         break;
3253                 }
3254
3255                 default:
3256                         error = ENOPROTOOPT;
3257                         break;
3258                 }
3259                 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3260                         (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3261                 }
3262         }
3263 bad:
3264         socket_unlock(so, 1);
3265         return (error);
3266 }
3267
3268 /* Helper routines for getsockopt */
3269 int
3270 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3271 {
3272         int     error;
3273         size_t  valsize;
3274
3275         error = 0;
3276
3277         /*
3278          * Documented get behavior is that we always return a value,
3279          * possibly truncated to fit in the user's buffer.
3280          * Traditional behavior is that we always tell the user
3281          * precisely how much we copied, rather than something useful
3282          * like the total amount we had available for her.
3283          * Note that this interface is not idempotent; the entire answer must
3284          * generated ahead of time.
3285          */
3286         valsize = min(len, sopt->sopt_valsize);
3287         sopt->sopt_valsize = valsize;
3288         if (sopt->sopt_val != USER_ADDR_NULL) {
3289                 if (sopt->sopt_p != kernproc)
3290                         error = copyout(buf, sopt->sopt_val, valsize);
3291                 else
3292                         bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3293         }
3294         return (error);
3295 }
3296
3297 static int
3298 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3299 {
3300         int                     error;
3301         size_t                  len;
3302         struct user64_timeval   tv64;
3303         struct user32_timeval   tv32;
3304         const void *            val;
3305         size_t                  valsize;
3306
3307         error = 0;
3308         if (proc_is64bit(sopt->sopt_p)) {
3309                 len = sizeof(tv64);
3310                 tv64.tv_sec = tv_p->tv_sec;
3311                 tv64.tv_usec = tv_p->tv_usec;
3312                 val = &tv64;
3313         } else {
3314                 len = sizeof(tv32);
3315                 tv32.tv_sec = tv_p->tv_sec;
3316                 tv32.tv_usec = tv_p->tv_usec;
3317                 val = &tv32;
3318         }
3319         valsize = min(len, sopt->sopt_valsize);
3320         sopt->sopt_valsize = valsize;
3321         if (sopt->sopt_val != USER_ADDR_NULL) {
3322                 if (sopt->sopt_p != kernproc)
3323                         error = copyout(val, sopt->sopt_val, valsize);
3324                 else
3325                         bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3326         }
3327         return (error);
3328 }
3329
3330 /*
3331  * Return:      0                       Success
3332  *              ENOPROTOOPT
3333  *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3334  *      <pr_ctloutput>:???
3335  *      <sf_getoption>:???
3336  */
3337 int
3338 sogetopt(struct socket *so, struct sockopt *sopt)
3339 {
3340         int     error, optval;
3341         struct  linger l;
3342         struct  timeval tv;
3343         struct  socket_filter_entry *filter;
3344         int     filtered = 0;
3345 #if CONFIG_MACF_SOCKET
3346         struct mac extmac;
3347 #endif /* MAC_SOCKET */
3348
3349         if (sopt->sopt_dir != SOPT_GET) {
3350                 sopt->sopt_dir = SOPT_GET;
3351         }
3352
3353         socket_lock(so, 1);
3354
3355         error = 0;
3356         for (filter = so->so_filt; filter && (error == 0);
3357             filter = filter->sfe_next_onsocket) {
3358                 if (filter->sfe_filter->sf_filter.sf_getoption) {
3359                         if (filtered == 0) {
3360                                 filtered = 1;
3361                                 sflt_use(so);
3362                                 socket_unlock(so, 0);
3363                         }
3364                         error = filter->sfe_filter->sf_filter.
3365                             sf_getoption(filter->sfe_cookie, so, sopt);
3366                 }
3367         }
3368         if (filtered != 0) {
3369                 socket_lock(so, 0);
3370                 sflt_unuse(so);
3371
3372                 if (error) {
3373                         if (error == EJUSTRETURN)
3374                                 error = 0;
3375                         socket_unlock(so, 1);
3376                         return (error);
3377                 }
3378         }
3379
3380         error = 0;
3381         if (sopt->sopt_level != SOL_SOCKET) {
3382                 if (so->so_proto && so->so_proto->pr_ctloutput) {
3383                         error = (*so->so_proto->pr_ctloutput)(so, sopt);
3384                         socket_unlock(so, 1);
3385                         return (error);
3386                 } else {
3387                         socket_unlock(so, 1);
3388                         return (ENOPROTOOPT);
3389                 }
3390         } else {
3391                 switch (sopt->sopt_name) {
3392                 case SO_LINGER:
3393                 case SO_LINGER_SEC:
3394                         l.l_onoff = so->so_options & SO_LINGER;
3395                         l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3396                             so->so_linger : so->so_linger / hz;
3397                         error = sooptcopyout(sopt, &l, sizeof (l));
3398                         break;
3399
3400                 case SO_USELOOPBACK:
3401                 case SO_DONTROUTE:
3402                 case SO_DEBUG:
3403                 case SO_KEEPALIVE:
3404                 case SO_REUSEADDR:
3405                 case SO_REUSEPORT:
3406                 case SO_BROADCAST:
3407                 case SO_OOBINLINE:
3408                 case SO_TIMESTAMP:
3409 #ifdef __APPLE__
3410                 case SO_DONTTRUNC:
3411                 case SO_WANTMORE:
3412                 case SO_WANTOOBFLAG:
3413 #endif
3414                         optval = so->so_options & sopt->sopt_name;
3415 integer:
3416                         error = sooptcopyout(sopt, &optval, sizeof (optval));
3417                         break;
3418
3419                 case SO_TYPE:
3420                         optval = so->so_type;
3421                         goto integer;
3422
3423 #ifdef __APPLE__
3424                 case SO_NREAD:
3425                         if (so->so_proto->pr_flags & PR_ATOMIC) {
3426                                 int pkt_total;
3427                                 struct mbuf *m1;
3428
3429                                 pkt_total = 0;
3430                                 m1 = so->so_rcv.sb_mb;
3431                                 while (m1) {
3432                                         if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3433                                                 m1->m_type == MT_OOBDATA)
3434                                                 pkt_total += m1->m_len;
3435                                         m1 = m1->m_next;
3436                                 }
3437                                 optval = pkt_total;
3438                         } else {
3439                                 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3440                         }
3441                         goto integer;
3442
3443                 case SO_NWRITE:
3444                         optval = so->so_snd.sb_cc;
3445                         goto integer;
3446 #endif
3447                 case SO_ERROR:
3448                         optval = so->so_error;
3449                         so->so_error = 0;
3450                         goto integer;
3451
3452                 case SO_SNDBUF:
3453                         optval = so->so_snd.sb_hiwat;
3454                         goto integer;
3455
3456                 case SO_RCVBUF:
3457                         optval = so->so_rcv.sb_hiwat;
3458                         goto integer;
3459
3460                 case SO_SNDLOWAT:
3461                         optval = so->so_snd.sb_lowat;
3462                         goto integer;
3463
3464                 case SO_RCVLOWAT:
3465                         optval = so->so_rcv.sb_lowat;
3466                         goto integer;
3467
3468                 case SO_SNDTIMEO:
3469                 case SO_RCVTIMEO:
3470                         tv = (sopt->sopt_name == SO_SNDTIMEO ?
3471                             so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3472
3473                         error = sooptcopyout_timeval(sopt, &tv);
3474                         break;
3475
3476                 case SO_NOSIGPIPE:
3477                         optval = (so->so_flags & SOF_NOSIGPIPE);
3478                         goto integer;
3479
3480                 case SO_NOADDRERR:
3481                         optval = (so->so_flags & SOF_NOADDRAVAIL);
3482                         goto integer;
3483
3484                 case SO_REUSESHAREUID:
3485                         optval = (so->so_flags & SOF_REUSESHAREUID);
3486                         goto integer;
3487
3488 #ifdef __APPLE_API_PRIVATE
3489                 case SO_NOTIFYCONFLICT:
3490                         optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3491                         goto integer;
3492 #endif
3493                 case SO_RESTRICTIONS:
3494                         optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3495                             SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3496                         goto integer;
3497
3498                 case SO_LABEL:
3499 #if CONFIG_MACF_SOCKET
3500                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3501                             sizeof (extmac))) != 0 ||
3502                             (error = mac_socket_label_get(proc_ucred(
3503                             sopt->sopt_p), so, &extmac)) != 0)
3504                                 break;
3505
3506                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3507 #else
3508                         error = EOPNOTSUPP;
3509 #endif /* MAC_SOCKET */
3510                         break;
3511
3512                 case SO_PEERLABEL:
3513 #if CONFIG_MACF_SOCKET
3514                         if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3515                             sizeof (extmac))) != 0 ||
3516                             (error = mac_socketpeer_label_get(proc_ucred(
3517                             sopt->sopt_p), so, &extmac)) != 0)
3518                                 break;
3519
3520                         error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3521 #else
3522                         error = EOPNOTSUPP;
3523 #endif /* MAC_SOCKET */
3524                         break;
3525
3526 #ifdef __APPLE_API_PRIVATE
3527                 case SO_UPCALLCLOSEWAIT:
3528                         optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
3529                         goto integer;
3530 #endif
3531                 case SO_RANDOMPORT:
3532                         optval = (so->so_flags & SOF_BINDRANDOMPORT);
3533                         goto integer;
3534
3535                 case SO_NP_EXTENSIONS: {
3536                         struct so_np_extensions sonpx;
3537
3538                         sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0;
3539                         sonpx.npx_mask = SONPX_MASK_VALID;
3540
3541                         error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
3542                         break;
3543                 }
3544                 default:
3545                         error = ENOPROTOOPT;
3546                         break;
3547                 }
3548                 socket_unlock(so, 1);
3549                 return (error);
3550         }
3551 }
3552
3553 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
3554 int
3555 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3556 {
3557         struct mbuf *m, *m_prev;
3558         int sopt_size = sopt->sopt_valsize;
3559         int how;
3560
3561         if (sopt_size > MAX_SOOPTGETM_SIZE)
3562                 return (EMSGSIZE);
3563
3564         how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
3565         MGET(m, how, MT_DATA);
3566         if (m == 0)
3567                 return (ENOBUFS);
3568         if (sopt_size > MLEN) {
3569                 MCLGET(m, how);
3570                 if ((m->m_flags & M_EXT) == 0) {
3571                         m_free(m);
3572                         return (ENOBUFS);
3573                 }
3574                 m->m_len = min(MCLBYTES, sopt_size);
3575         } else {
3576                 m->m_len = min(MLEN, sopt_size);
3577         }
3578         sopt_size -= m->m_len;
3579         *mp = m;
3580         m_prev = m;
3581
3582         while (sopt_size) {
3583                 MGET(m, how, MT_DATA);
3584                 if (m == 0) {
3585                         m_freem(*mp);
3586                         return (ENOBUFS);
3587                 }
3588                 if (sopt_size > MLEN) {
3589                         MCLGET(m, how);
3590                         if ((m->m_flags & M_EXT) == 0) {
3591                                 m_freem(*mp);
3592                                 return (ENOBUFS);
3593                         }
3594                         m->m_len = min(MCLBYTES, sopt_size);
3595                 } else {
3596                         m->m_len = min(MLEN, sopt_size);
3597                 }
3598                 sopt_size -= m->m_len;
3599                 m_prev->m_next = m;
3600                 m_prev = m;
3601         }
3602         return (0);
3603 }
3604
3605 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
3606 int
3607 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3608 {
3609         struct mbuf *m0 = m;
3610
3611         if (sopt->sopt_val == USER_ADDR_NULL)
3612                 return (0);
3613         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3614                 if (sopt->sopt_p != kernproc) {
3615                         int error;
3616
3617                         error = copyin(sopt->sopt_val, mtod(m, char *),
3618                             m->m_len);
3619                         if (error != 0) {
3620                                 m_freem(m0);
3621                                 return (error);
3622                         }
3623                 } else {
3624                         bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3625                             mtod(m, char *), m->m_len);
3626                 }
3627                 sopt->sopt_valsize -= m->m_len;
3628                 sopt->sopt_val += m->m_len;
3629                 m = m->m_next;
3630         }
3631         if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3632                 panic("soopt_mcopyin");
3633         return (0);
3634 }
3635
3636 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
3637 int
3638 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3639 {
3640         struct mbuf *m0 = m;
3641         size_t valsize = 0;
3642
3643         if (sopt->sopt_val == USER_ADDR_NULL)
3644                 return (0);
3645         while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3646                 if (sopt->sopt_p != kernproc) {
3647                         int error;
3648
3649                         error = copyout(mtod(m, char *), sopt->sopt_val,
3650                             m->m_len);
3651                         if (error != 0) {
3652                                 m_freem(m0);
3653                                 return (error);
3654                         }
3655                 } else {
3656                         bcopy(mtod(m, char *),
3657                             CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3658                 }
3659                 sopt->sopt_valsize -= m->m_len;
3660                 sopt->sopt_val += m->m_len;
3661                 valsize += m->m_len;
3662                 m = m->m_next;
3663         }
3664         if (m != NULL) {
3665                 /* enough soopt buffer should be given from user-land */
3666                 m_freem(m0);
3667                 return (EINVAL);
3668         }
3669         sopt->sopt_valsize = valsize;
3670         return (0);
3671 }
3672
3673 void
3674 sohasoutofband(struct socket *so)
3675 {
3676
3677         if (so->so_pgid < 0)
3678                 gsignal(-so->so_pgid, SIGURG);
3679         else if (so->so_pgid > 0)
3680                 proc_signal(so->so_pgid, SIGURG);
3681         selwakeup(&so->so_rcv.sb_sel);
3682 }
3683
3684 int
3685 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3686 {
3687         struct proc *p = current_proc();
3688         int revents = 0;
3689
3690         socket_lock(so, 1);
3691
3692         if (events & (POLLIN | POLLRDNORM))
3693                 if (soreadable(so))
3694                         revents |= events & (POLLIN | POLLRDNORM);
3695
3696         if (events & (POLLOUT | POLLWRNORM))
3697                 if (sowriteable(so))
3698                         revents |= events & (POLLOUT | POLLWRNORM);
3699
3700         if (events & (POLLPRI | POLLRDBAND))
3701                 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3702                         revents |= events & (POLLPRI | POLLRDBAND);
3703
3704         if (revents == 0) {
3705                 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3706                         /*
3707                          * Darwin sets the flag first,
3708                          * BSD calls selrecord first
3709                          */
3710                         so->so_rcv.sb_flags |= SB_SEL;
3711                         selrecord(p, &so->so_rcv.sb_sel, wql);
3712                 }
3713
3714                 if (events & (POLLOUT | POLLWRNORM)) {
3715                         /*
3716                          * Darwin sets the flag first,
3717                          * BSD calls selrecord first
3718                          */
3719                         so->so_snd.sb_flags |= SB_SEL;
3720                         selrecord(p, &so->so_snd.sb_sel, wql);
3721                 }
3722         }
3723
3724         socket_unlock(so, 1);
3725         return (revents);
3726 }
3727
3728 int
3729 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3730     __unused struct proc *p)
3731 {
3732         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3733         struct sockbuf *sb;
3734
3735         socket_lock(so, 1);
3736
3737 #if CONFIG_MACF_SOCKET
3738         if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3739                 socket_unlock(so, 1);
3740                 return (1);
3741         }
3742 #endif /* MAC_SOCKET */
3743
3744         switch (kn->kn_filter) {
3745         case EVFILT_READ:
3746                 kn->kn_fop = &soread_filtops;
3747                 sb = &so->so_rcv;
3748                 break;
3749         case EVFILT_WRITE:
3750                 kn->kn_fop = &sowrite_filtops;
3751                 sb = &so->so_snd;
3752                 break;
3753         default:
3754                 socket_unlock(so, 1);
3755                 return (1);
3756         }
3757
3758         if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
3759                 sb->sb_flags |= SB_KNOTE;
3760         socket_unlock(so, 1);
3761         return (0);
3762 }
3763
3764 static void
3765 filt_sordetach(struct knote *kn)
3766 {
3767         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3768
3769         socket_lock(so, 1);
3770         if (so->so_rcv.sb_flags & SB_KNOTE)
3771                 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3772                         so->so_rcv.sb_flags &= ~SB_KNOTE;
3773         socket_unlock(so, 1);
3774 }
3775
3776 /*ARGSUSED*/
3777 static int
3778 filt_soread(struct knote *kn, long hint)
3779 {
3780         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3781
3782         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3783                 socket_lock(so, 1);
3784
3785         if (so->so_options & SO_ACCEPTCONN) {
3786                 int isempty;
3787
3788                 /* Radar 6615193 handle the listen case dynamically
3789                  * for kqueue read filter. This allows to call listen() after registering
3790                  * the kqueue EVFILT_READ.
3791                  */
3792
3793                 kn->kn_data = so->so_qlen;
3794                 isempty = ! TAILQ_EMPTY(&so->so_comp);
3795
3796                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3797                         socket_unlock(so, 1);
3798
3799                 return (isempty);
3800         }
3801
3802         /* socket isn't a listener */
3803
3804         kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3805
3806         if (so->so_oobmark) {
3807                 if (kn->kn_flags & EV_OOBAND) {
3808                         kn->kn_data -= so->so_oobmark;
3809                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3810                                 socket_unlock(so, 1);
3811                         return (1);
3812                 }
3813                 kn->kn_data = so->so_oobmark;
3814                 kn->kn_flags |= EV_OOBAND;
3815         } else {
3816                 if (so->so_state & SS_CANTRCVMORE) {
3817                         kn->kn_flags |= EV_EOF;
3818                         kn->kn_fflags = so->so_error;
3819                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3820                                 socket_unlock(so, 1);
3821                         return (1);
3822                 }
3823         }
3824
3825         if (so->so_state & SS_RCVATMARK) {
3826                 if (kn->kn_flags & EV_OOBAND) {
3827                         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3828                                 socket_unlock(so, 1);
3829                         return (1);
3830                 }
3831                 kn->kn_flags |= EV_OOBAND;
3832         } else if (kn->kn_flags & EV_OOBAND) {
3833                 kn->kn_data = 0;
3834                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3835                         socket_unlock(so, 1);
3836                 return (0);
3837         }
3838
3839         if (so->so_error) {     /* temporary udp error */
3840                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3841                         socket_unlock(so, 1);
3842                 return (1);
3843         }
3844
3845         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3846                 socket_unlock(so, 1);
3847
3848         return ((kn->kn_flags & EV_OOBAND) ||
3849             kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
3850             kn->kn_sdata : so->so_rcv.sb_lowat));
3851 }
3852
3853 static void
3854 filt_sowdetach(struct knote *kn)
3855 {
3856         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3857         socket_lock(so, 1);
3858
3859         if (so->so_snd.sb_flags & SB_KNOTE)
3860                 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
3861                         so->so_snd.sb_flags &= ~SB_KNOTE;
3862         socket_unlock(so, 1);
3863 }
3864
3865 /*ARGSUSED*/
3866 static int
3867 filt_sowrite(struct knote *kn, long hint)
3868 {
3869         struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3870
3871         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3872                 socket_lock(so, 1);
3873
3874         kn->kn_data = sbspace(&so->so_snd);
3875         if (so->so_state & SS_CANTSENDMORE) {
3876                 kn->kn_flags |= EV_EOF;
3877                 kn->kn_fflags = so->so_error;
3878                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3879                         socket_unlock(so, 1);
3880                 return (1);
3881         }
3882         if (so->so_error) {     /* temporary udp error */
3883                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3884                         socket_unlock(so, 1);
3885                 return (1);
3886         }
3887         if (((so->so_state & SS_ISCONNECTED) == 0) &&
3888             (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3889                 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3890                         socket_unlock(so, 1);
3891                 return (0);
3892         }
3893         if ((hint & SO_FILT_HINT_LOCKED) == 0)
3894                 socket_unlock(so, 1);
3895         if (kn->kn_sfflags & NOTE_LOWAT)
3896                 return (kn->kn_data >= kn->kn_sdata);
3897         return (kn->kn_data >= so->so_snd.sb_lowat);
3898 }
3899
3900 #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1)
3901
3902 __private_extern__ const char * solockhistory_nr(struct socket *so)
3903 {
3904         size_t n = 0;
3905         int i;
3906         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
3907
3908         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
3909                 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
3910                         (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
3911                         (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
3912         }
3913         return lock_history_str;
3914 }
3915
3916 int
3917 socket_lock(struct socket *so, int refcount)
3918 {
3919         int error = 0;
3920         void *lr_saved;
3921
3922         lr_saved = __builtin_return_address(0);
3923
3924         if (so->so_proto->pr_lock) {
3925                 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
3926         } else {
3927 #ifdef MORE_LOCKING_DEBUG
3928                 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
3929                     LCK_MTX_ASSERT_NOTOWNED);
3930 #endif
3931                 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
3932                 if (refcount)
3933                         so->so_usecount++;
3934                 so->lock_lr[so->next_lock_lr] = lr_saved;
3935                 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3936         }
3937
3938         return (error);
3939 }
3940
3941 int
3942 socket_unlock(struct socket *so, int refcount)
3943 {
3944         int error = 0;
3945         void *lr_saved;
3946         lck_mtx_t *mutex_held;
3947
3948         lr_saved = __builtin_return_address(0);
3949
3950         if (so->so_proto == NULL)
3951                 panic("socket_unlock null so_proto so=%p\n", so);
3952
3953         if (so && so->so_proto->pr_unlock) {
3954                 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
3955         } else {
3956                 mutex_held = so->so_proto->pr_domain->dom_mtx;
3957 #ifdef MORE_LOCKING_DEBUG
3958                 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3959 #endif
3960                 so->unlock_lr[so->next_unlock_lr] = lr_saved;
3961                 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3962
3963                 if (refcount) {
3964                         if (so->so_usecount <= 0)
3965                                 panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
3966                                     so->so_usecount, so, so->so_proto->pr_domain->dom_family,
3967                                     so->so_type, so->so_proto->pr_protocol,
3968                                     solockhistory_nr(so));
3969
3970                         so->so_usecount--;
3971                         if (so->so_usecount == 0) {
3972                                 sofreelastref(so, 1);
3973                         }
3974                 }
3975                 lck_mtx_unlock(mutex_held);
3976         }
3977
3978         return (error);
3979 }
3980
3981 /* Called with socket locked, will unlock socket */
3982 void
3983 sofree(struct socket *so)
3984 {
3985
3986         lck_mtx_t *mutex_held;
3987         if (so->so_proto->pr_getlock != NULL)
3988                 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3989         else
3990                 mutex_held = so->so_proto->pr_domain->dom_mtx;
3991         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3992
3993         sofreelastref(so, 0);
3994 }
3995
3996 void
3997 soreference(struct socket *so)
3998 {
3999         socket_lock(so, 1);     /* locks & take one reference on socket */
4000         socket_unlock(so, 0);   /* unlock only */
4001 }
4002
4003 void
4004 sodereference(struct socket *so)
4005 {
4006         socket_lock(so, 0);
4007         socket_unlock(so, 1);
4008 }
4009
4010 /*
4011  * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4012  * possibility of using jumbo clusters.  Caller must ensure to hold
4013  * the socket lock.
4014  */
4015 void
4016 somultipages(struct socket *so, boolean_t set)
4017 {
4018         if (set)
4019                 so->so_flags |= SOF_MULTIPAGES;
4020         else
4021                 so->so_flags &= ~SOF_MULTIPAGES;
4022 }
4023
4024 int
4025 so_isdstlocal(struct socket *so) {
4026
4027         struct inpcb *inp = (struct inpcb *)so->so_pcb;
4028
4029         if (so->so_proto->pr_domain->dom_family == AF_INET) {
4030                 return inaddr_local(inp->inp_faddr);
4031         } else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
4032                 return in6addr_local(&inp->in6p_faddr);
4033         }
4034         return 0;
4035 }