1 /*
2 * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
63 */
64 /*
65 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
66 * support for mandatory and extensible security protections. This notice
67 * is included in support of clause 2.2 (b) of the Apple Public License,
68 * Version 2.0.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/filedesc.h>
74 #include <sys/proc.h>
75 #include <sys/proc_internal.h>
76 #include <sys/kauth.h>
77 #include <sys/file_internal.h>
78 #include <sys/fcntl.h>
79 #include <sys/malloc.h>
80 #include <sys/mbuf.h>
81 #include <sys/domain.h>
82 #include <sys/kernel.h>
83 #include <sys/event.h>
84 #include <sys/poll.h>
85 #include <sys/protosw.h>
86 #include <sys/socket.h>
87 #include <sys/socketvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/signalvar.h>
90 #include <sys/sysctl.h>
91 #include <sys/uio.h>
92 #include <sys/ev.h>
93 #include <sys/kdebug.h>
94 #include <sys/un.h>
95 #include <sys/user.h>
96 #include <net/route.h>
97 #include <netinet/in.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/ip6.h>
100 #include <netinet6/ip6_var.h>
101 #include <kern/zalloc.h>
102 #include <kern/locks.h>
103 #include <machine/limits.h>
104 #include <libkern/OSAtomic.h>
105 #include <pexpert/pexpert.h>
106 #include <kern/assert.h>
107 #include <kern/task.h>
108
109 #include <sys/mcache.h>
110
111 #if CONFIG_MACF
112 #include <security/mac.h>
113 #include <security/mac_framework.h>
114 #endif /* MAC */
115
116 extern int in6_init_done;
117
118 int so_cache_hw = 0;
119 int so_cache_timeouts = 0;
120 int so_cache_max_freed = 0;
121 int cached_sock_count = 0;
122 __private_extern__ int max_cached_sock_count = MAX_CACHED_SOCKETS;
123 struct socket *socket_cache_head = 0;
124 struct socket *socket_cache_tail = 0;
125 u_int32_t so_cache_time = 0;
126 int so_cache_init_done = 0;
127 struct zone *so_cache_zone;
128
129 static lck_grp_t *so_cache_mtx_grp;
130 static lck_attr_t *so_cache_mtx_attr;
131 static lck_grp_attr_t *so_cache_mtx_grp_attr;
132 lck_mtx_t *so_cache_mtx;
133
134 #include <machine/limits.h>
135
136 static void filt_sordetach(struct knote *kn);
137 static int filt_soread(struct knote *kn, long hint);
138 static void filt_sowdetach(struct knote *kn);
139 static int filt_sowrite(struct knote *kn, long hint);
140
141 static int
142 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
143
144 static int
145 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p);
146
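/*
 * Kqueue filter operations for sockets: soread_filtops services EVFILT_READ
 * knotes and sowrite_filtops services EVFILT_WRITE knotes registered on a
 * socket descriptor.
 */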
147 static struct filterops soread_filtops = {
148 .f_isfd = 1,
149 .f_detach = filt_sordetach,
150 .f_event = filt_soread,
151 };
152 static struct filterops sowrite_filtops = {
153 .f_isfd = 1,
154 .f_detach = filt_sowdetach,
155 .f_event = filt_sowrite,
156 };
157
158 #define EVEN_MORE_LOCKING_DEBUG 0
159 int socket_debug = 0;
160 int socket_zone = M_SOCKET;
161 so_gen_t so_gencnt; /* generation count for sockets */
162
163 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
164 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
165
166 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
167 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
168 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
169 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
170 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
171 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
172 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
173
174 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
175
176
177 SYSCTL_DECL(_kern_ipc);
178
179 int somaxconn = SOMAXCONN;
180 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
181
182 /* Should we get a maximum also ??? */
183 static int sosendmaxchain = 65536;
184 static int sosendminchain = 16384;
185 static int sorecvmincopy = 16384;
186 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain,
187 0, "");
188 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy,
189 0, "");
190
191 /*
192 * Set to enable jumbo clusters (if available) for large writes when
193 * the socket is marked with SOF_MULTIPAGES; see below.
194 */
195 int sosendjcl = 1;
196 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
197
198 /*
199 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
200 * writes on the socket for all protocols on any network interfaces,
201 * depending upon sosendjcl above. Be extra careful when setting this
202  * to 1, because sending packets that cross physical pages down to
203 * broken drivers (those that falsely assume that the physical pages
204 * are contiguous) might lead to system panics or silent data corruption.
205 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
206 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
207 * capable. Set this to 1 only for testing/debugging purposes.
208 */
209 int sosendjcl_ignore_capab = 0;
210 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED,
211 &sosendjcl_ignore_capab, 0, "");
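/*
 * For illustration (testing only): these knobs are ordinary sysctls, so e.g.
 * "sysctl -w kern.ipc.sosendjcl_ignore_capab=1" from user space makes large
 * writes use jumbo clusters while ignoring SOF_MULTIPAGES; see the warning
 * in the comment above before doing so.
 */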
212
213 int sodefunctlog = 0;
214 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &sodefunctlog, 0, "");
216
217 /*
218 * Socket operation routines.
219 * These routines are called by the routines in
220 * sys_socket.c or from a system process, and
221 * implement the semantics of socket operations by
222 * switching out to the protocol specific routines.
223 */
224
225 /* sys_generic.c */
226 extern void postevent(struct socket *, struct sockbuf *, int);
227 extern void evsofree(struct socket *);
228
229 /* TODO: these should be in header file */
230 extern int get_inpcb_str_size(void);
231 extern int get_tcp_str_size(void);
232 extern struct domain *pffinddomain(int);
233 extern struct protosw *pffindprotonotype(int, int);
234 extern int soclose_locked(struct socket *);
235 extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
236
237 #if CONFIG_EMBEDDED
238 extern int uthread_get_background_state(uthread_t);
239 #endif /*CONFIG_EMBEDDED */
240
241 #ifdef __APPLE__
242
243 vm_size_t so_cache_zone_element_size;
244
245 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
246 static void cached_sock_alloc(struct socket **, int);
247 static void cached_sock_free(struct socket *);
248 static void so_cache_timer(void *);
249
250 void soclose_wait_locked(struct socket *so);
251 int so_isdstlocal(struct socket *so);
252
253 __private_extern__ u_int32_t sotcdb = 0;
254 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
255 &sotcdb, 0, "");
256
257 void
258 socketinit(void)
259 {
260 vm_size_t str_size;
261
262 if (so_cache_init_done) {
263 printf("socketinit: already called...\n");
264 return;
265 }
266
267 PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));
268
269 /*
270 * allocate lock group attribute and group for socket cache mutex
271 */
272 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
273
274 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
275 so_cache_mtx_grp_attr);
276
277 /*
278 * allocate the lock attribute for socket cache mutex
279 */
280 so_cache_mtx_attr = lck_attr_alloc_init();
281
282 so_cache_init_done = 1;
283
284 /* cached sockets mutex */
285 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
286
287 if (so_cache_mtx == NULL)
288 return; /* we're hosed... */
289
290 str_size = (vm_size_t)(sizeof (struct socket) + 4 +
291 get_inpcb_str_size() + 4 + get_tcp_str_size());
292
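/*
 * Each cached element is a single contiguous block holding the socket, an
 * inpcb and a tcpcb (carved up by cached_sock_alloc() below); the two extra
 * 4-byte pads presumably leave slack for the longword alignment done there.
 * zinit() arguments: element size, maximum zone memory in bytes, allocation
 * chunk size, zone name.
 */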
293 so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
294 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
295 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
296 #if TEMPDEBUG
297 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
298 #endif
299 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
300
301 so_cache_zone_element_size = str_size;
302
303 sflt_init();
304
305 VERIFY(SO_TC_MAX == SO_TC_STATS_MAX);
306
307 socket_tclass_init();
308 }
309
310 static void
311 cached_sock_alloc(struct socket **so, int waitok)
312 {
313 caddr_t temp;
314 register uintptr_t offset;
315
316 lck_mtx_lock(so_cache_mtx);
317
318 if (cached_sock_count) {
319 cached_sock_count--;
320 *so = socket_cache_head;
321 if (*so == 0)
322 panic("cached_sock_alloc: cached sock is null");
323
324 socket_cache_head = socket_cache_head->cache_next;
325 if (socket_cache_head)
326 socket_cache_head->cache_prev = 0;
327 else
328 socket_cache_tail = 0;
329
330 lck_mtx_unlock(so_cache_mtx);
331
332 temp = (*so)->so_saved_pcb;
333 bzero((caddr_t)*so, sizeof (struct socket));
334 #if TEMPDEBUG
335 		kprintf("cached_sock_alloc - retrieving cached sock %p - "
336 "count == %d\n", *so, cached_sock_count);
337 #endif
338 (*so)->so_saved_pcb = temp;
339 (*so)->cached_in_sock_layer = 1;
340 } else {
341 #if TEMPDEBUG
342 kprintf("Allocating cached sock %p from memory\n", *so);
343 #endif
344
345 lck_mtx_unlock(so_cache_mtx);
346
347 if (waitok)
348 *so = (struct socket *)zalloc(so_cache_zone);
349 else
350 *so = (struct socket *)zalloc_noblock(so_cache_zone);
351
352 if (*so == 0)
353 return;
354
355 bzero((caddr_t)*so, sizeof (struct socket));
356
357 /*
358 * Define offsets for extra structures into our single block of
359 * memory. Align extra structures on longword boundaries.
360 */
361
362 offset = (uintptr_t) *so;
363 offset += sizeof (struct socket);
364
365 offset = ALIGN(offset);
366
367 (*so)->so_saved_pcb = (caddr_t)offset;
368 offset += get_inpcb_str_size();
369
370 offset = ALIGN(offset);
371
372 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
373 (caddr_t)offset;
374 #if TEMPDEBUG
375 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
376 *so, (*so)->so_saved_pcb,
377 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
378 #endif
379 }
380
381 (*so)->cached_in_sock_layer = 1;
382 }
383
384 static void
385 cached_sock_free(struct socket *so)
386 {
387
388 lck_mtx_lock(so_cache_mtx);
389
390 if (++cached_sock_count > max_cached_sock_count) {
391 --cached_sock_count;
392 lck_mtx_unlock(so_cache_mtx);
393 #if TEMPDEBUG
394 kprintf("Freeing overflowed cached socket %p\n", so);
395 #endif
396 zfree(so_cache_zone, so);
397 } else {
398 #if TEMPDEBUG
399 kprintf("Freeing socket %p into cache\n", so);
400 #endif
401 if (so_cache_hw < cached_sock_count)
402 so_cache_hw = cached_sock_count;
403
404 so->cache_next = socket_cache_head;
405 so->cache_prev = 0;
406 if (socket_cache_head)
407 socket_cache_head->cache_prev = so;
408 else
409 socket_cache_tail = so;
410
411 so->cache_timestamp = so_cache_time;
412 socket_cache_head = so;
413 lck_mtx_unlock(so_cache_mtx);
414 }
415
416 #if TEMPDEBUG
417 kprintf("Freed cached sock %p into cache - count is %d\n",
418 so, cached_sock_count);
419 #endif
420 }
421
422 static void
423 so_update_last_owner_locked(
424 struct socket *so,
425 proc_t self)
426 {
427 if (self == NULL)
428 self = current_proc();
429
430 if (self)
431 {
432 so->last_upid = proc_uniqueid(self);
433 so->last_pid = proc_pid(self);
434 }
435 }
436
437 static void
438 so_cache_timer(__unused void *dummy)
439 {
440 register struct socket *p;
441 register int n_freed = 0;
442
443 lck_mtx_lock(so_cache_mtx);
444
445 ++so_cache_time;
446
447 while ((p = socket_cache_tail)) {
448 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
449 break;
450
451 so_cache_timeouts++;
452
453 if ((socket_cache_tail = p->cache_prev))
454 p->cache_prev->cache_next = 0;
455 if (--cached_sock_count == 0)
456 socket_cache_head = 0;
457
458 zfree(so_cache_zone, p);
459
460 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
461 so_cache_max_freed++;
462 break;
463 }
464 }
465 lck_mtx_unlock(so_cache_mtx);
466
467 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
468 }
469 #endif /* __APPLE__ */
470
471 /*
472 * Get a socket structure from our zone, and initialize it.
473 * We don't implement `waitok' yet (see comments in uipc_domain.c).
474 * Note that it would probably be better to allocate socket
475 * and PCB at the same time, but I'm not convinced that all
476 * the protocols can be easily modified to do this.
477 */
478 struct socket *
479 soalloc(int waitok, int dom, int type)
480 {
481 struct socket *so;
482
483 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
484 cached_sock_alloc(&so, waitok);
485 } else {
486 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
487 M_WAITOK);
488 if (so != NULL)
489 bzero(so, sizeof (*so));
490 }
491 /* XXX race condition for reentrant kernel */
492 //###LD Atomic add for so_gencnt
493 if (so != NULL) {
494 so->so_gencnt = ++so_gencnt;
495 so->so_zone = socket_zone;
496 #if CONFIG_MACF_SOCKET
497 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
498 if (mac_socket_label_init(so, !waitok) != 0) {
499 sodealloc(so);
500 return (NULL);
501 }
502 #endif /* MAC_SOCKET */
503 so_update_last_owner_locked(so, NULL);
504 }
505
506 return (so);
507 }
508
509 /*
510 * Returns: 0 Success
511 * EAFNOSUPPORT
512 * EPROTOTYPE
513 * EPROTONOSUPPORT
514 * ENOBUFS
515 * <pru_attach>:ENOBUFS[AF_UNIX]
516 * <pru_attach>:ENOBUFS[TCP]
517 * <pru_attach>:ENOMEM[TCP]
518 * <pru_attach>:EISCONN[TCP]
519 * <pru_attach>:??? [other protocol families, IPSEC]
520 */
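/*
 * Typical in-kernel usage, for illustration (e.g. the socket(2) path does
 * the equivalent):
 *
 *	struct socket *so = NULL;
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *
 * A proto of 0 requests the default protocol for the (domain, type) pair.
 */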
521 int
522 socreate(int dom, struct socket **aso, int type, int proto)
523 {
524 struct proc *p = current_proc();
525 register struct protosw *prp;
526 register struct socket *so;
527 register int error = 0;
528 #if CONFIG_EMBEDDED
529 thread_t thread;
530 struct uthread *ut;
531 #endif /* CONFIG_EMBEDDED */
532
533 #if TCPDEBUG
534 extern int tcpconsdebug;
535 #endif
536 if (proto)
537 prp = pffindproto(dom, proto, type);
538 else
539 prp = pffindtype(dom, type);
540
541 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
542 if (pffinddomain(dom) == NULL) {
543 return (EAFNOSUPPORT);
544 }
545 if (proto != 0) {
546 if (pffindprotonotype(dom, proto) != NULL) {
547 return (EPROTOTYPE);
548 }
549 }
550 return (EPROTONOSUPPORT);
551 }
552 if (prp->pr_type != type)
553 return (EPROTOTYPE);
554 so = soalloc(1, dom, type);
555 if (so == 0)
556 return (ENOBUFS);
557
558 TAILQ_INIT(&so->so_incomp);
559 TAILQ_INIT(&so->so_comp);
560 so->so_type = type;
561
562 so->so_uid = kauth_cred_getuid(kauth_cred_get());
563 so->so_gid = kauth_cred_getgid(kauth_cred_get());
564 if (!suser(kauth_cred_get(), NULL))
565 so->so_state = SS_PRIV;
566
567 so->so_proto = prp;
568 #ifdef __APPLE__
569 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
570 so->so_rcv.sb_so = so->so_snd.sb_so = so;
571 #endif
572 so->next_lock_lr = 0;
573 so->next_unlock_lr = 0;
574
575 #if CONFIG_MACF_SOCKET
576 mac_socket_label_associate(kauth_cred_get(), so);
577 #endif /* MAC_SOCKET */
578
579 //### Attachment will create the per pcb lock if necessary and increase refcount
580 /*
581 * for creation, make sure it's done before
582 * socket is inserted in lists
583 */
584 so->so_usecount++;
585
586 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
587 if (error) {
588 /*
589 * Warning:
590 * If so_pcb is not zero, the socket will be leaked,
591 		 * so the protocol attachment handler must be coded carefully
592 */
593 so->so_state |= SS_NOFDREF;
594 so->so_usecount--;
595 sofreelastref(so, 1); /* will deallocate the socket */
596 return (error);
597 }
598 #ifdef __APPLE__
599 prp->pr_domain->dom_refs++;
600 TAILQ_INIT(&so->so_evlist);
601
602 /* Attach socket filters for this protocol */
603 sflt_initsock(so);
604 #if TCPDEBUG
605 if (tcpconsdebug == 2)
606 so->so_options |= SO_DEBUG;
607 #endif
608 #endif
609 so_set_default_traffic_class(so);
610 /*
611 * If this is a background thread/task, mark the socket as such.
612 */
613 #if !CONFIG_EMBEDDED
614 if (proc_get_self_isbackground() != 0)
615 #else /* !CONFIG_EMBEDDED */
616 thread = current_thread();
617 ut = get_bsdthread_info(thread);
618 if (uthread_get_background_state(ut))
619 #endif /* !CONFIG_EMBEDDED */
620 {
621 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
622 so->so_background_thread = current_thread();
623 }
624
625 switch (dom) {
626 /*
627 * Don't mark Unix domain sockets as eligible for defunct by default.
628 */
629 case PF_LOCAL:
630 so->so_flags |= SOF_NODEFUNCT;
631 break;
632 /*
633 * Radar 9119053
634 * Since v6 initialization is asynchronous and we can't hold
635 * up the main boot path, we need to at least hold off any
636 * sockets attempting to be created until the v6 stack is
637 * up and ready.
638 */
639 case PF_INET6:
640 if (in6_init_done == 0)
641 ip6_fin();
642 break;
643 default:
644 break;
645 }
646
647 *aso = so;
648 return (0);
649 }
650
651 /*
652 * Returns: 0 Success
653 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
654 * <pru_bind>:EAFNOSUPPORT Address family not supported
655 * <pru_bind>:EADDRNOTAVAIL Address not available.
656 * <pru_bind>:EINVAL Invalid argument
657 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
658 * <pru_bind>:EACCES Permission denied
659 * <pru_bind>:EADDRINUSE Address in use
660 * <pru_bind>:EAGAIN Resource unavailable, try again
661 * <pru_bind>:EPERM Operation not permitted
662 * <pru_bind>:???
663 * <sf_bind>:???
664 *
665 * Notes: It's not possible to fully enumerate the return codes above,
666 * since socket filter authors and protocol family authors may
667 * not choose to limit their error returns to those listed, even
668 * though this may result in some software operating incorrectly.
669 *
670 * The error codes which are enumerated above are those known to
671 * be returned by the tcp_usr_bind function supplied.
672 */
673 int
674 sobind(struct socket *so, struct sockaddr *nam)
675 {
676 struct proc *p = current_proc();
677 int error = 0;
678
679 socket_lock(so, 1);
680
681 so_update_last_owner_locked(so, p);
682
683 /*
684 * If this is a bind request on a socket that has been marked
685 * as inactive, reject it now before we go any further.
686 */
687 if (so->so_flags & SOF_DEFUNCT) {
688 error = EINVAL;
689 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
690 __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
691 error));
692 goto out;
693 }
694
695 /* Socket filter */
696 error = sflt_bind(so, nam);
697
698 if (error == 0)
699 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
700 out:
701 socket_unlock(so, 1);
702
703 if (error == EJUSTRETURN)
704 error = 0;
705
706 return (error);
707 }
708
709 void
710 sodealloc(struct socket *so)
711 {
712 /* Remove any filters */
713 sflt_termsock(so);
714
715 so->so_gencnt = ++so_gencnt;
716
717 #if CONFIG_MACF_SOCKET
718 mac_socket_label_destroy(so);
719 #endif /* MAC_SOCKET */
720 if (so->cached_in_sock_layer == 1) {
721 cached_sock_free(so);
722 } else {
723 if (so->cached_in_sock_layer == -1)
724 panic("sodealloc: double dealloc: so=%p\n", so);
725 so->cached_in_sock_layer = -1;
726 FREE_ZONE(so, sizeof (*so), so->so_zone);
727 }
728 }
729
730 /*
731 * Returns: 0 Success
732 * EINVAL
733 * EOPNOTSUPP
734 * <pru_listen>:EINVAL[AF_UNIX]
735 * <pru_listen>:EINVAL[TCP]
736 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
737 * <pru_listen>:EINVAL[TCP] Invalid argument
738 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
739 * <pru_listen>:EACCES[TCP] Permission denied
740 * <pru_listen>:EADDRINUSE[TCP] Address in use
741 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
742 * <pru_listen>:EPERM[TCP] Operation not permitted
743 * <sf_listen>:???
744 *
745 * Notes: Other <pru_listen> returns depend on the protocol family; all
746 * <sf_listen> returns depend on what the filter author causes
747 * their filter to return.
748 */
749 int
750 solisten(struct socket *so, int backlog)
751 {
752 struct proc *p = current_proc();
753 int error = 0;
754
755 socket_lock(so, 1);
756
757 so_update_last_owner_locked(so, p);
758
759 if (so->so_proto == NULL) {
760 error = EINVAL;
761 goto out;
762 }
763 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
764 error = EOPNOTSUPP;
765 goto out;
766 }
767
768 /*
769 * If the listen request is made on a socket that is not fully
770 * disconnected, or on a socket that has been marked as inactive,
771 * reject the request now.
772 */
773 if ((so->so_state &
774 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
775 (so->so_flags & SOF_DEFUNCT)) {
776 error = EINVAL;
777 if (so->so_flags & SOF_DEFUNCT) {
778 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
779 __func__, proc_pid(p), so, INP_SOCKAF(so),
780 INP_SOCKTYPE(so), error));
781 }
782 goto out;
783 }
784
785 if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
786 error = EPERM;
787 goto out;
788 }
789
790 error = sflt_listen(so);
791
792 if (error == 0) {
793 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
794 }
795
796 if (error) {
797 if (error == EJUSTRETURN)
798 error = 0;
799 goto out;
800 }
801
802 if (TAILQ_EMPTY(&so->so_comp))
803 so->so_options |= SO_ACCEPTCONN;
804 /*
805 * POSIX: The implementation may have an upper limit on the length of
806  * the listen queue, either global or per accepting socket. If backlog
807 * exceeds this limit, the length of the listen queue is set to the
808 * limit.
809 *
810 * If listen() is called with a backlog argument value that is less
811 * than 0, the function behaves as if it had been called with a backlog
812 * argument value of 0.
813 *
814 * A backlog argument of 0 may allow the socket to accept connections,
815 * in which case the length of the listen queue may be set to an
816 * implementation-defined minimum value.
817 */
818 if (backlog <= 0 || backlog > somaxconn)
819 backlog = somaxconn;
820
821 so->so_qlimit = backlog;
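/*
 * Example: with the default somaxconn of 128 (SOMAXCONN), listen(s, 1024),
 * listen(s, 0) and listen(s, -1) all leave so_qlimit at 128, while
 * listen(s, 5) leaves it at 5.
 */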
822 out:
823 socket_unlock(so, 1);
824 return (error);
825 }
826
827 void
828 sofreelastref(struct socket *so, int dealloc)
829 {
830 struct socket *head = so->so_head;
831
832 /* Assume socket is locked */
833
834 if ((!(so->so_flags & SOF_PCBCLEARING)) ||
835 ((so->so_state & SS_NOFDREF) == 0)) {
836 #ifdef __APPLE__
837 selthreadclear(&so->so_snd.sb_sel);
838 selthreadclear(&so->so_rcv.sb_sel);
839 so->so_rcv.sb_flags &= ~SB_UPCALL;
840 so->so_snd.sb_flags &= ~SB_UPCALL;
841 #endif
842 return;
843 }
844 if (head != NULL) {
845 socket_lock(head, 1);
846 if (so->so_state & SS_INCOMP) {
847 TAILQ_REMOVE(&head->so_incomp, so, so_list);
848 head->so_incqlen--;
849 } else if (so->so_state & SS_COMP) {
850 /*
851 * We must not decommission a socket that's
852 * on the accept(2) queue. If we do, then
853 * accept(2) may hang after select(2) indicated
854 * that the listening socket was ready.
855 */
856 #ifdef __APPLE__
857 selthreadclear(&so->so_snd.sb_sel);
858 selthreadclear(&so->so_rcv.sb_sel);
859 so->so_rcv.sb_flags &= ~SB_UPCALL;
860 so->so_snd.sb_flags &= ~SB_UPCALL;
861 #endif
862 socket_unlock(head, 1);
863 return;
864 } else {
865 panic("sofree: not queued");
866 }
867 head->so_qlen--;
868 so->so_state &= ~SS_INCOMP;
869 so->so_head = NULL;
870 socket_unlock(head, 1);
871 }
872 #ifdef __APPLE__
873 selthreadclear(&so->so_snd.sb_sel);
874 sbrelease(&so->so_snd);
875 #endif
876 sorflush(so);
877
878 /* 3932268: disable upcall */
879 so->so_rcv.sb_flags &= ~SB_UPCALL;
880 so->so_snd.sb_flags &= ~SB_UPCALL;
881
882 if (dealloc)
883 sodealloc(so);
884 }
885
886 void
887 soclose_wait_locked(struct socket *so)
888 {
889 lck_mtx_t *mutex_held;
890
891 if (so->so_proto->pr_getlock != NULL)
892 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
893 else
894 mutex_held = so->so_proto->pr_domain->dom_mtx;
895 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
896
897 /*
898 * Double check here and return if there's no outstanding upcall;
899 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
900 */
901 if (!(so->so_flags & SOF_UPCALLINUSE) ||
902 !(so->so_flags & SOF_UPCALLCLOSEWAIT))
903 return;
904
905 so->so_flags |= SOF_CLOSEWAIT;
906 (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
907 "soclose_wait_locked", NULL);
908 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
909 so->so_flags &= ~SOF_CLOSEWAIT;
910 }
911
912 /*
913 * Close a socket on last file table reference removal.
914 * Initiate disconnect if connected.
915 * Free socket when disconnect complete.
916 */
917 int
918 soclose_locked(struct socket *so)
919 {
920 int error = 0;
921 lck_mtx_t *mutex_held;
922 struct timespec ts;
923
924 if (so->so_usecount == 0) {
925 panic("soclose: so=%p refcount=0\n", so);
926 }
927
928 sflt_notify(so, sock_evt_closing, NULL);
929
930 if ((so->so_options & SO_ACCEPTCONN)) {
931 struct socket *sp, *sonext;
932 int socklock = 0;
933
934 /*
935 		 * We do not want new connections to be added
936 * to the connection queues
937 */
938 so->so_options &= ~SO_ACCEPTCONN;
939
940 for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
941 sonext = TAILQ_NEXT(sp, so_list);
942
943 /* Radar 5350314
944 			 * skip sockets thrown away by tcpdropdropblreq;
945 			 * they will get cleaned up by the garbage collection.
946 			 * Otherwise, remove the incomp socket from the queue
947 * and let soabort trigger the appropriate cleanup.
948 */
949 if (sp->so_flags & SOF_OVERFLOW)
950 continue;
951
952 if (so->so_proto->pr_getlock != NULL) {
953 			/* For lock ordering consistency with the rest of the stack,
954 			 * we lock the socket first and then grab the head.
955 			 */
956 socket_unlock(so, 0);
957 socket_lock(sp, 1);
958 socket_lock(so, 0);
959 socklock = 1;
960 }
961
962 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
963 so->so_incqlen--;
964
965 if (sp->so_state & SS_INCOMP) {
966 sp->so_state &= ~SS_INCOMP;
967 sp->so_head = NULL;
968
969 (void) soabort(sp);
970 }
971
972 if (socklock)
973 socket_unlock(sp, 1);
974 }
975
976 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
977 /* Dequeue from so_comp since sofree() won't do it */
978 TAILQ_REMOVE(&so->so_comp, sp, so_list);
979 so->so_qlen--;
980
981 if (so->so_proto->pr_getlock != NULL) {
982 socket_unlock(so, 0);
983 socket_lock(sp, 1);
984 }
985
986 if (sp->so_state & SS_COMP) {
987 sp->so_state &= ~SS_COMP;
988 sp->so_head = NULL;
989
990 (void) soabort(sp);
991 }
992
993 if (so->so_proto->pr_getlock != NULL) {
994 socket_unlock(sp, 1);
995 socket_lock(so, 0);
996 }
997 }
998 }
999 if (so->so_pcb == 0) {
1000 /* 3915887: mark the socket as ready for dealloc */
1001 so->so_flags |= SOF_PCBCLEARING;
1002 goto discard;
1003 }
1004 if (so->so_state & SS_ISCONNECTED) {
1005 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1006 error = sodisconnectlocked(so);
1007 if (error)
1008 goto drop;
1009 }
1010 if (so->so_options & SO_LINGER) {
1011 if ((so->so_state & SS_ISDISCONNECTING) &&
1012 (so->so_state & SS_NBIO))
1013 goto drop;
1014 if (so->so_proto->pr_getlock != NULL)
1015 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1016 else
1017 mutex_held = so->so_proto->pr_domain->dom_mtx;
1018 while (so->so_state & SS_ISCONNECTED) {
1019 ts.tv_sec = (so->so_linger/100);
1020 ts.tv_nsec = (so->so_linger % 100) *
1021 NSEC_PER_USEC * 1000 * 10;
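/*
 * so_linger is in hundredths of a second here: e.g. a value
 * of 250 yields ts = { 2, 500000000 }, i.e. a 2.5 second wait
 * per msleep() below (NSEC_PER_USEC * 1000 * 10 is 10 ms in
 * nanoseconds).
 */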
1022 error = msleep((caddr_t)&so->so_timeo,
1023 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1024 if (error) {
1025 /*
1026 					 * It's OK when the timer fires,
1027 * don't report an error
1028 */
1029 if (error == EWOULDBLOCK)
1030 error = 0;
1031 break;
1032 }
1033 }
1034 }
1035 }
1036 drop:
1037 if (so->so_usecount == 0)
1038 panic("soclose: usecount is zero so=%p\n", so);
1039 if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
1040 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1041 if (error == 0)
1042 error = error2;
1043 }
1044 if (so->so_usecount <= 0)
1045 panic("soclose: usecount is zero so=%p\n", so);
1046 discard:
1047 if (so->so_pcb && so->so_state & SS_NOFDREF)
1048 panic("soclose: NOFDREF");
1049 so->so_state |= SS_NOFDREF;
1050 #ifdef __APPLE__
1051 so->so_proto->pr_domain->dom_refs--;
1052 evsofree(so);
1053 #endif
1054 so->so_usecount--;
1055 sofree(so);
1056 return (error);
1057 }
1058
1059 int
1060 soclose(struct socket *so)
1061 {
1062 int error = 0;
1063 socket_lock(so, 1);
1064
1065 if (so->so_flags & SOF_UPCALLINUSE)
1066 soclose_wait_locked(so);
1067
1068 if (so->so_retaincnt == 0) {
1069 error = soclose_locked(so);
1070 } else {
1071 /*
1072 		 * If the FD is going away but the socket is
1073 		 * retained in the kernel, remove its reference.
1074 */
1075 so->so_usecount--;
1076 if (so->so_usecount < 2)
1077 panic("soclose: retaincnt non null and so=%p "
1078 "usecount=%d\n", so, so->so_usecount);
1079 }
1080 socket_unlock(so, 1);
1081 return (error);
1082 }
1083
1084 /*
1085 * Must be called at splnet...
1086 */
1087 /* Should already be locked */
1088 int
1089 soabort(struct socket *so)
1090 {
1091 int error;
1092
1093 #ifdef MORE_LOCKING_DEBUG
1094 lck_mtx_t *mutex_held;
1095
1096 if (so->so_proto->pr_getlock != NULL)
1097 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1098 else
1099 mutex_held = so->so_proto->pr_domain->dom_mtx;
1100 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1101 #endif
1102
1103 if ((so->so_flags & SOF_ABORTED) == 0) {
1104 so->so_flags |= SOF_ABORTED;
1105 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1106 if (error) {
1107 sofree(so);
1108 return (error);
1109 }
1110 }
1111 return (0);
1112 }
1113
1114 int
1115 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1116 {
1117 int error;
1118
1119 if (dolock)
1120 socket_lock(so, 1);
1121
1122 if ((so->so_state & SS_NOFDREF) == 0)
1123 panic("soaccept: !NOFDREF");
1124 so->so_state &= ~SS_NOFDREF;
1125 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1126
1127 if (dolock)
1128 socket_unlock(so, 1);
1129 return (error);
1130 }
1131
1132 int
1133 soaccept(struct socket *so, struct sockaddr **nam)
1134 {
1135 return (soacceptlock(so, nam, 1));
1136 }
1137
1138 int
1139 soacceptfilter(struct socket *so)
1140 {
1141 struct sockaddr *local = NULL, *remote = NULL;
1142 int error = 0;
1143 struct socket *head = so->so_head;
1144
1145 /*
1146 * Hold the lock even if this socket
1147 * has not been made visible to the filter(s).
1148 	 * For sockets with global locks, this protects against the
1149 	 * head or peer going away.
1150 */
1151 socket_lock(so, 1);
1152 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1153 sogetaddr_locked(so, &local, 0) != 0) {
1154 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1155 so->so_head = NULL;
1156 socket_unlock(so, 1);
1157 soclose(so);
1158 /* Out of resources; try it again next time */
1159 error = ECONNABORTED;
1160 goto done;
1161 }
1162
1163 error = sflt_accept(head, so, local, remote);
1164
1165 /*
1166 * If we get EJUSTRETURN from one of the filters, mark this socket
1167 * as inactive and return it anyway. This newly accepted socket
1168 * will be disconnected later before we hand it off to the caller.
1169 */
1170 if (error == EJUSTRETURN) {
1171 error = 0;
1172 (void) sosetdefunct(current_proc(), so,
1173 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1174 }
1175
1176 if (error != 0) {
1177 /*
1178 * This may seem like a duplication to the above error
1179 * handling part when we return ECONNABORTED, except
1180 * the following is done while holding the lock since
1181 * the socket has been exposed to the filter(s) earlier.
1182 */
1183 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1184 so->so_head = NULL;
1185 socket_unlock(so, 1);
1186 soclose(so);
1187 /* Propagate socket filter's error code to the caller */
1188 } else {
1189 socket_unlock(so, 1);
1190 }
1191 done:
1192 /* Callee checks for NULL pointer */
1193 sock_freeaddr(remote);
1194 sock_freeaddr(local);
1195 return (error);
1196 }
1197
1198 /*
1199 * Returns: 0 Success
1200 * EOPNOTSUPP Operation not supported on socket
1201 * EISCONN Socket is connected
1202 * <pru_connect>:EADDRNOTAVAIL Address not available.
1203 * <pru_connect>:EINVAL Invalid argument
1204 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1205 * <pru_connect>:EACCES Permission denied
1206 * <pru_connect>:EADDRINUSE Address in use
1207 * <pru_connect>:EAGAIN Resource unavailable, try again
1208 * <pru_connect>:EPERM Operation not permitted
1209 * <sf_connect_out>:??? [anything a filter writer might set]
1210 */
1211 int
1212 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1213 {
1214 int error;
1215 struct proc *p = current_proc();
1216
1217 if (dolock)
1218 socket_lock(so, 1);
1219
1220 so_update_last_owner_locked(so, p);
1221
1222 /*
1223 * If this is a listening socket or if this is a previously-accepted
1224 * socket that has been marked as inactive, reject the connect request.
1225 */
1226 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1227 error = EOPNOTSUPP;
1228 if (so->so_flags & SOF_DEFUNCT) {
1229 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
1230 __func__, proc_pid(p), so, INP_SOCKAF(so),
1231 INP_SOCKTYPE(so), error));
1232 }
1233 if (dolock)
1234 socket_unlock(so, 1);
1235 return (error);
1236 }
1237
1238 if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
1239 if (dolock)
1240 socket_unlock(so, 1);
1241 return (EPERM);
1242 }
1243
1244 /*
1245 * If protocol is connection-based, can only connect once.
1246 * Otherwise, if connected, try to disconnect first.
1247 * This allows user to disconnect by connecting to, e.g.,
1248 * a null address.
1249 */
1250 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1251 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1252 (error = sodisconnectlocked(so)))) {
1253 error = EISCONN;
1254 } else {
1255 /*
1256 * Run connect filter before calling protocol:
1257 * - non-blocking connect returns before completion;
1258 */
1259 error = sflt_connectout(so, nam);
1260
1261 if (error) {
1262 if (error == EJUSTRETURN)
1263 error = 0;
1264 } else {
1265 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
1266 }
1267 }
1268 if (dolock)
1269 socket_unlock(so, 1);
1270 return (error);
1271 }
1272
1273 int
1274 soconnect(struct socket *so, struct sockaddr *nam)
1275 {
1276 return (soconnectlock(so, nam, 1));
1277 }
1278
1279 /*
1280 * Returns: 0 Success
1281 * <pru_connect2>:EINVAL[AF_UNIX]
1282 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1283 * <pru_connect2>:??? [other protocol families]
1284 *
1285 * Notes: <pru_connect2> is not supported by [TCP].
1286 */
1287 int
1288 soconnect2(struct socket *so1, struct socket *so2)
1289 {
1290 int error;
1291
1292 socket_lock(so1, 1);
1293 if (so2->so_proto->pr_lock)
1294 socket_lock(so2, 1);
1295
1296 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1297
1298 socket_unlock(so1, 1);
1299 if (so2->so_proto->pr_lock)
1300 socket_unlock(so2, 1);
1301 return (error);
1302 }
1303
1304 int
1305 sodisconnectlocked(struct socket *so)
1306 {
1307 int error;
1308
1309 if ((so->so_state & SS_ISCONNECTED) == 0) {
1310 error = ENOTCONN;
1311 goto bad;
1312 }
1313 if (so->so_state & SS_ISDISCONNECTING) {
1314 error = EALREADY;
1315 goto bad;
1316 }
1317
1318 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1319
1320 if (error == 0) {
1321 sflt_notify(so, sock_evt_disconnected, NULL);
1322 }
1323 bad:
1324 return (error);
1325 }
1326
1327 /* Locking version */
1328 int
1329 sodisconnect(struct socket *so)
1330 {
1331 int error;
1332
1333 socket_lock(so, 1);
1334 error = sodisconnectlocked(so);
1335 socket_unlock(so, 1);
1336 return (error);
1337 }
1338
1339 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1340
1341 /*
1342 * sosendcheck will lock the socket buffer if it isn't locked and
1343 * verify that there is space for the data being inserted.
1344 *
1345 * Returns: 0 Success
1346 * EPIPE
1347 * sblock:EWOULDBLOCK
1348 * sblock:EINTR
1349 * sbwait:EBADF
1350 * sbwait:EINTR
1351 * [so_error]:???
1352 */
1353 static int
1354 sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
1355 int32_t atomic, int flags, int *sblocked)
1356 {
1357 int error = 0;
1358 int32_t space;
1359 int assumelock = 0;
1360
1361 restart:
1362 if (*sblocked == 0) {
1363 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1364 so->so_send_filt_thread != 0 &&
1365 so->so_send_filt_thread == current_thread()) {
1366 /*
1367 * We're being called recursively from a filter,
1368 * allow this to continue. Radar 4150520.
1369 * Don't set sblocked because we don't want
1370 * to perform an unlock later.
1371 */
1372 assumelock = 1;
1373 } else {
1374 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1375 if (error) {
1376 if (so->so_flags & SOF_DEFUNCT)
1377 goto defunct;
1378 return (error);
1379 }
1380 *sblocked = 1;
1381 }
1382 }
1383
1384 /*
1385 * If a send attempt is made on a socket that has been marked
1386 * as inactive (disconnected), reject the request.
1387 */
1388 if (so->so_flags & SOF_DEFUNCT) {
1389 defunct:
1390 error = EPIPE;
1391 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1392 proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
1393 error));
1394 return (error);
1395 }
1396
1397 if (so->so_state & SS_CANTSENDMORE)
1398 return (EPIPE);
1399
1400 if (so->so_error) {
1401 error = so->so_error;
1402 so->so_error = 0;
1403 return (error);
1404 }
1405
1406 if ((so->so_state & SS_ISCONNECTED) == 0) {
1407 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1408 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1409 !(resid == 0 && clen != 0))
1410 return (ENOTCONN);
1411 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1412 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1413 ENOTCONN : EDESTADDRREQ);
1414 }
1415 }
1416 space = sbspace(&so->so_snd);
1417 if (flags & MSG_OOB)
1418 space += 1024;
1419 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1420 clen > so->so_snd.sb_hiwat)
1421 return (EMSGSIZE);
1422 if (space < resid + clen &&
1423 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
1424 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1425 assumelock) {
1426 return (EWOULDBLOCK);
1427 }
1428 sbunlock(&so->so_snd, 1);
1429 *sblocked = 0;
1430 error = sbwait(&so->so_snd);
1431 if (error) {
1432 if (so->so_flags & SOF_DEFUNCT)
1433 goto defunct;
1434 return (error);
1435 }
1436 goto restart;
1437 }
1438
1439 return (0);
1440 }
1441
1442 /*
1443 * Send on a socket.
1444 * If send must go all at once and message is larger than
1445 * send buffering, then hard error.
1446 * Lock against other senders.
1447 * If must go all at once and not enough room now, then
1448 * inform user that this would block and do nothing.
1449 * Otherwise, if nonblocking, send as much as possible.
1450 * The data to be sent is described by "uio" if nonzero,
1451 * otherwise by the mbuf chain "top" (which must be null
1452 * if uio is not). Data provided in mbuf chain must be small
1453 * enough to send all at once.
1454 *
1455 * Returns nonzero on error, timeout or signal; callers
1456 * must check for short counts if EINTR/ERESTART are returned.
1457 * Data and control buffers are freed on return.
1458 * Experiment:
1459 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1460 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1461 * point at the mbuf chain being constructed and go from there.
1462 *
1463 * Returns: 0 Success
1464 * EOPNOTSUPP
1465 * EINVAL
1466 * ENOBUFS
1467 * uiomove:EFAULT
1468 * sosendcheck:EPIPE
1469 * sosendcheck:EWOULDBLOCK
1470 * sosendcheck:EINTR
1471 * sosendcheck:EBADF
1472 * sosendcheck:EINTR
1473 * sosendcheck:??? [value from so_error]
1474 * <pru_send>:ECONNRESET[TCP]
1475 * <pru_send>:EINVAL[TCP]
1476 * <pru_send>:ENOBUFS[TCP]
1477 * <pru_send>:EADDRINUSE[TCP]
1478 * <pru_send>:EADDRNOTAVAIL[TCP]
1479 * <pru_send>:EAFNOSUPPORT[TCP]
1480 * <pru_send>:EACCES[TCP]
1481 * <pru_send>:EAGAIN[TCP]
1482 * <pru_send>:EPERM[TCP]
1483 * <pru_send>:EMSGSIZE[TCP]
1484 * <pru_send>:EHOSTUNREACH[TCP]
1485 * <pru_send>:ENETUNREACH[TCP]
1486 * <pru_send>:ENETDOWN[TCP]
1487 * <pru_send>:ENOMEM[TCP]
1488 * <pru_send>:ENOBUFS[TCP]
1489 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1490 * <pru_send>:EINVAL[AF_UNIX]
1491 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1492 * <pru_send>:EPIPE[AF_UNIX]
1493 * <pru_send>:ENOTCONN[AF_UNIX]
1494 * <pru_send>:EISCONN[AF_UNIX]
1495 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1496 * <sf_data_out>:??? [whatever a filter author chooses]
1497 *
1498 * Notes: Other <pru_send> returns depend on the protocol family; all
1499 * <sf_data_out> returns depend on what the filter author causes
1500 * their filter to return.
1501 */
1502 int
1503 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1504 struct mbuf *top, struct mbuf *control, int flags)
1505 {
1506 struct mbuf **mp;
1507 register struct mbuf *m, *freelist = NULL;
1508 register int32_t space, len, resid;
1509 int clen = 0, error, dontroute, mlen, sendflags;
1510 int atomic = sosendallatonce(so) || top;
1511 int sblocked = 0;
1512 struct proc *p = current_proc();
1513
1514 if (uio) {
1515 // LP64todo - fix this!
1516 resid = uio_resid(uio);
1517 } else {
1518 resid = top->m_pkthdr.len;
1519 }
1520 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1521 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1522
1523 socket_lock(so, 1);
1524 so_update_last_owner_locked(so, p);
1525
1526 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1527 error = EOPNOTSUPP;
1528 socket_unlock(so, 1);
1529 goto out;
1530 }
1531
1532 /*
1533 * In theory resid should be unsigned.
1534 * However, space must be signed, as it might be less than 0
1535 * if we over-committed, and we must use a signed comparison
1536 * of space and resid. On the other hand, a negative resid
1537 * causes us to loop sending 0-length segments to the protocol.
1538 *
1539 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1540 * type sockets since that's an error.
1541 */
1542 if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1543 error = EINVAL;
1544 socket_unlock(so, 1);
1545 goto out;
1546 }
1547
1548 dontroute =
1549 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
1550 (so->so_proto->pr_flags & PR_ATOMIC);
1551 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1552 if (control)
1553 clen = control->m_len;
1554
1555 do {
1556 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1557 &sblocked);
1558 if (error) {
1559 goto release;
1560 }
1561 mp = &top;
1562 space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
1563 1024 : 0);
1564
1565 do {
1566 if (uio == NULL) {
1567 /*
1568 * Data is prepackaged in "top".
1569 */
1570 resid = 0;
1571 if (flags & MSG_EOR)
1572 top->m_flags |= M_EOR;
1573 } else {
1574 int chainlength;
1575 int bytes_to_copy;
1576 boolean_t jumbocl;
1577
1578 bytes_to_copy = imin(resid, space);
1579
1580 if (sosendminchain > 0) {
1581 chainlength = 0;
1582 } else {
1583 chainlength = sosendmaxchain;
1584 }
1585
1586 /*
1587 * Attempt to use larger than system page-size
1588 * clusters for large writes only if there is
1589 * a jumbo cluster pool and if the socket is
1590 * marked accordingly.
1591 */
1592 jumbocl = sosendjcl && njcl > 0 &&
1593 ((so->so_flags & SOF_MULTIPAGES) ||
1594 sosendjcl_ignore_capab);
1595
1596 socket_unlock(so, 0);
1597
1598 do {
1599 int num_needed;
1600 int hdrs_needed = (top == 0) ? 1 : 0;
1601
1602 /*
1603 				 * Try to maintain a local cache of mbuf
1604 				 * clusters needed to complete this
1605 				 * write; the list is further limited to
1606 				 * the number that are currently needed
1607 				 * to fill the socket. This mechanism
1608 				 * allows a large number of mbufs/
1609 				 * clusters to be grabbed under a single
1610 				 * mbuf lock... if we can't get any
1611 				 * clusters, then fall back to trying
1612 				 * for mbufs. If we fail early (or
1613 				 * miscalculate the number needed), make
1614 * sure to release any clusters we
1615 * haven't yet consumed.
1616 */
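/*
 * Allocation strategy below, largest size first: 16 KB jumbo clusters
 * (M16KCLBYTES, only when jumbocl), then 4 KB big clusters (MBIGCLBYTES),
 * then 2 KB clusters (MCLBYTES), and finally a single mbuf.
 */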
1617 if (freelist == NULL &&
1618 bytes_to_copy > MBIGCLBYTES &&
1619 jumbocl) {
1620 num_needed =
1621 bytes_to_copy / M16KCLBYTES;
1622
1623 if ((bytes_to_copy -
1624 (num_needed * M16KCLBYTES))
1625 >= MINCLSIZE)
1626 num_needed++;
1627
1628 freelist =
1629 m_getpackets_internal(
1630 (unsigned int *)&num_needed,
1631 hdrs_needed, M_WAIT, 0,
1632 M16KCLBYTES);
1633 /*
1634 * Fall back to 4K cluster size
1635 * if allocation failed
1636 */
1637 }
1638
1639 if (freelist == NULL &&
1640 bytes_to_copy > MCLBYTES) {
1641 num_needed =
1642 bytes_to_copy / MBIGCLBYTES;
1643
1644 if ((bytes_to_copy -
1645 (num_needed * MBIGCLBYTES)) >=
1646 MINCLSIZE)
1647 num_needed++;
1648
1649 freelist =
1650 m_getpackets_internal(
1651 (unsigned int *)&num_needed,
1652 hdrs_needed, M_WAIT, 0,
1653 MBIGCLBYTES);
1654 /*
1655 * Fall back to cluster size
1656 * if allocation failed
1657 */
1658 }
1659
1660 if (freelist == NULL &&
1661 bytes_to_copy > MINCLSIZE) {
1662 num_needed =
1663 bytes_to_copy / MCLBYTES;
1664
1665 if ((bytes_to_copy -
1666 (num_needed * MCLBYTES)) >=
1667 MINCLSIZE)
1668 num_needed++;
1669
1670 freelist =
1671 m_getpackets_internal(
1672 (unsigned int *)&num_needed,
1673 hdrs_needed, M_WAIT, 0,
1674 MCLBYTES);
1675 /*
1676 * Fall back to a single mbuf
1677 * if allocation failed
1678 */
1679 }
1680
1681 if (freelist == NULL) {
1682 if (top == 0)
1683 MGETHDR(freelist,
1684 M_WAIT, MT_DATA);
1685 else
1686 MGET(freelist,
1687 M_WAIT, MT_DATA);
1688
1689 if (freelist == NULL) {
1690 error = ENOBUFS;
1691 socket_lock(so, 0);
1692 goto release;
1693 }
1694 /*
1695 * For datagram protocols,
1696 * leave room for protocol
1697 * headers in first mbuf.
1698 */
1699 if (atomic && top == 0 &&
1700 bytes_to_copy < MHLEN) {
1701 MH_ALIGN(freelist,
1702 bytes_to_copy);
1703 }
1704 }
1705 m = freelist;
1706 freelist = m->m_next;
1707 m->m_next = NULL;
1708
1709 if ((m->m_flags & M_EXT))
1710 mlen = m->m_ext.ext_size;
1711 else if ((m->m_flags & M_PKTHDR))
1712 mlen =
1713 MHLEN - m_leadingspace(m);
1714 else
1715 mlen = MLEN;
1716 len = imin(mlen, bytes_to_copy);
1717
1718 chainlength += len;
1719
1720 space -= len;
1721
1722 error = uiomove(mtod(m, caddr_t),
1723 len, uio);
1724
1725 resid = uio_resid(uio);
1726
1727 m->m_len = len;
1728 *mp = m;
1729 top->m_pkthdr.len += len;
1730 if (error)
1731 break;
1732 mp = &m->m_next;
1733 if (resid <= 0) {
1734 if (flags & MSG_EOR)
1735 top->m_flags |= M_EOR;
1736 break;
1737 }
1738 bytes_to_copy = min(resid, space);
1739
1740 } while (space > 0 &&
1741 (chainlength < sosendmaxchain || atomic ||
1742 resid < MINCLSIZE));
1743
1744 socket_lock(so, 0);
1745
1746 if (error)
1747 goto release;
1748 }
1749
1750 if (flags & (MSG_HOLD|MSG_SEND)) {
1751 /* Enqueue for later, go away if HOLD */
1752 register struct mbuf *mb1;
1753 if (so->so_temp && (flags & MSG_FLUSH)) {
1754 m_freem(so->so_temp);
1755 so->so_temp = NULL;
1756 }
1757 if (so->so_temp)
1758 so->so_tail->m_next = top;
1759 else
1760 so->so_temp = top;
1761 mb1 = top;
1762 while (mb1->m_next)
1763 mb1 = mb1->m_next;
1764 so->so_tail = mb1;
1765 if (flags & MSG_HOLD) {
1766 top = NULL;
1767 goto release;
1768 }
1769 top = so->so_temp;
1770 }
1771 if (dontroute)
1772 so->so_options |= SO_DONTROUTE;
1773
1774 /* Compute flags here, for pru_send and NKEs */
1775 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1776 /*
1777 * If the user set MSG_EOF, the protocol
1778 			 * understands this flag and there is nothing left
1779 			 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
1780 */
1781 ((flags & MSG_EOF) &&
1782 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1783 (resid <= 0)) ?
1784 PRUS_EOF :
1785 /* If there is more to send set PRUS_MORETOCOME */
1786 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1787
1788 /*
1789 * Socket filter processing
1790 */
1791 error = sflt_data_out(so, addr, &top, &control,
1792 (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0);
1793 if (error) {
1794 if (error == EJUSTRETURN) {
1795 error = 0;
1796 clen = 0;
1797 control = 0;
1798 top = 0;
1799 }
1800
1801 goto release;
1802 }
1803 /*
1804 * End Socket filter processing
1805 */
1806
1807 error = (*so->so_proto->pr_usrreqs->pru_send)
1808 (so, sendflags, top, addr, control, p);
1809 #ifdef __APPLE__
1810 if (flags & MSG_SEND)
1811 so->so_temp = NULL;
1812 #endif
1813 if (dontroute)
1814 so->so_options &= ~SO_DONTROUTE;
1815
1816 clen = 0;
1817 control = 0;
1818 top = 0;
1819 mp = &top;
1820 if (error)
1821 goto release;
1822 } while (resid && space > 0);
1823 } while (resid);
1824
1825 release:
1826 if (sblocked)
1827 sbunlock(&so->so_snd, 0); /* will unlock socket */
1828 else
1829 socket_unlock(so, 1);
1830 out:
1831 if (top)
1832 m_freem(top);
1833 if (control)
1834 m_freem(control);
1835 if (freelist)
1836 m_freem_list(freelist);
1837
1838 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
1839 space, error);
1840
1841 return (error);
1842 }
1843
1844 /*
1845 * Implement receive operations on a socket.
1846 * We depend on the way that records are added to the sockbuf
1847 * by sbappend*. In particular, each record (mbufs linked through m_next)
1848 * must begin with an address if the protocol so specifies,
1849 * followed by an optional mbuf or mbufs containing ancillary data,
1850 * and then zero or more mbufs of data.
1851 * In order to avoid blocking network interrupts for the entire time here,
1852 * we splx() while doing the actual copy to user space.
1853 * Although the sockbuf is locked, new data may still be appended,
1854 * and thus we must maintain consistency of the sockbuf during that time.
1855 *
1856 * The caller may receive the data as a single mbuf chain by supplying
1857 * an mbuf **mp0 for use in returning the chain. The uio is then used
1858 * only for the count in uio_resid.
1859 *
1860 * Returns: 0 Success
1861 * ENOBUFS
1862 * ENOTCONN
1863 * EWOULDBLOCK
1864 * uiomove:EFAULT
1865 * sblock:EWOULDBLOCK
1866 * sblock:EINTR
1867 * sbwait:EBADF
1868 * sbwait:EINTR
1869 * sodelayed_copy:EFAULT
1870 * <pru_rcvoob>:EINVAL[TCP]
1871 * <pru_rcvoob>:EWOULDBLOCK[TCP]
1872 * <pru_rcvoob>:???
1873 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1874 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1875 * <pr_domain->dom_externalize>:???
1876 *
1877 * Notes: Additional return values from calls through <pru_rcvoob> and
1878 * <pr_domain->dom_externalize> depend on protocols other than
1879 * TCP or AF_UNIX, which are documented above.
1880 */
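/*
 * Illustrative receive-buffer layout for a protocol that supplies addresses
 * (e.g. UDP); within a record mbufs are linked by m_next, and records are
 * linked by m_nextpkt on the first mbuf of each record:
 *
 *	so_rcv.sb_mb -> [MT_SONAME] -> [MT_CONTROL]... -> [MT_DATA]...
 *	      |
 *	  m_nextpkt -> next record
 */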
1881 int
1882 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
1883 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1884 {
1885 register struct mbuf *m, **mp, *ml = NULL;
1886 register int flags, len, error, offset;
1887 struct protosw *pr = so->so_proto;
1888 struct mbuf *nextrecord;
1889 int moff, type = 0;
1890 int orig_resid = uio_resid(uio);
1891 struct mbuf *free_list;
1892 int delayed_copy_len;
1893 int can_delay;
1894 int need_event;
1895 struct proc *p = current_proc();
1896
1897 // LP64todo - fix this!
1898 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
1899 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1900
1901 socket_lock(so, 1);
1902 so_update_last_owner_locked(so, p);
1903
1904 #ifdef MORE_LOCKING_DEBUG
1905 if (so->so_usecount == 1)
1906 panic("soreceive: so=%x no other reference on socket\n", so);
1907 #endif
1908 mp = mp0;
1909 if (psa)
1910 *psa = 0;
1911 if (controlp)
1912 *controlp = 0;
1913 if (flagsp)
1914 flags = *flagsp &~ MSG_EOR;
1915 else
1916 flags = 0;
1917
1918 /*
1919 * If a recv attempt is made on a previously-accepted socket
1920 * that has been marked as inactive (disconnected), reject
1921 * the request.
1922 */
1923 if (so->so_flags & SOF_DEFUNCT) {
1924 struct sockbuf *sb = &so->so_rcv;
1925
1926 error = ENOTCONN;
1927 SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
1928 proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error));
1929 /*
1930 * This socket should have been disconnected and flushed
1931 * prior to being returned from sodefunct(); there should
1932 * be no data on its receive list, so panic otherwise.
1933 */
1934 if (so->so_state & SS_DEFUNCT)
1935 sb_empty_assert(sb, __func__);
1936 socket_unlock(so, 1);
1937 return (error);
1938 }
1939
1940 /*
1941 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1942 	 * regardless of the flags argument. Here is the case where
1943 * out-of-band data is not inline.
1944 */
1945 if ((flags & MSG_OOB) ||
1946 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1947 (so->so_options & SO_OOBINLINE) == 0 &&
1948 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1949 m = m_get(M_WAIT, MT_DATA);
1950 if (m == NULL) {
1951 socket_unlock(so, 1);
1952 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1953 ENOBUFS, 0, 0, 0, 0);
1954 return (ENOBUFS);
1955 }
1956 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1957 if (error)
1958 goto bad;
1959 socket_unlock(so, 0);
1960 do {
1961 error = uiomove(mtod(m, caddr_t),
1962 imin(uio_resid(uio), m->m_len), uio);
1963 m = m_free(m);
1964 } while (uio_resid(uio) && error == 0 && m);
1965 socket_lock(so, 0);
1966 bad:
1967 if (m)
1968 m_freem(m);
1969 #ifdef __APPLE__
1970 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1971 if (error == EWOULDBLOCK || error == EINVAL) {
1972 /*
1973 * Let's try to get normal data:
1974 * EWOULDBLOCK: out-of-band data not
1975 				 * received yet. EINVAL: out-of-band data
1976 * already read.
1977 */
1978 error = 0;
1979 goto nooob;
1980 } else if (error == 0 && flagsp) {
1981 *flagsp |= MSG_OOB;
1982 }
1983 }
1984 socket_unlock(so, 1);
1985 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
1986 0, 0, 0, 0);
1987 #endif
1988 return (error);
1989 }
1990 nooob:
1991 if (mp)
1992 *mp = (struct mbuf *)0;
1993 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
1994 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1995
1996
1997 free_list = (struct mbuf *)0;
1998 delayed_copy_len = 0;
1999 restart:
2000 #ifdef MORE_LOCKING_DEBUG
2001 if (so->so_usecount <= 1)
2002 printf("soreceive: sblock so=%p ref=%d on socket\n",
2003 so, so->so_usecount);
2004 #endif
2005 /*
2006 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2007 * and if so just return to the caller. This could happen when
2008 * soreceive() is called by a socket upcall function during the
2009 * time the socket is freed. The socket buffer would have been
2010 * locked across the upcall, therefore we cannot put this thread
2011 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2012 * we may livelock), because the lock on the socket buffer will
2013 * only be released when the upcall routine returns to its caller.
2014 * Because the socket has been officially closed, there can be
2015 * no further read on it.
2016 */
2017 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2018 (SS_NOFDREF | SS_CANTRCVMORE)) {
2019 socket_unlock(so, 1);
2020 return (0);
2021 }
2022
2023 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2024 if (error) {
2025 socket_unlock(so, 1);
2026 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2027 0, 0, 0, 0);
2028 return (error);
2029 }
2030
2031 m = so->so_rcv.sb_mb;
2032 /*
2033 * If we have less data than requested, block awaiting more
2034 * (subject to any timeout) if:
2035 * 1. the current count is less than the low water mark, or
2036 * 2. MSG_WAITALL is set, and it is possible to do the entire
2037 * receive operation at once if we block (resid <= hiwat), and
2038 * 3. MSG_DONTWAIT is not set.
2039 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2040 * we have to do the receive in sections, and thus risk returning
2041 * a short count if a timeout or signal occurs after we start.
2042 */
2043 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
2044 so->so_rcv.sb_cc < uio_resid(uio)) &&
2045 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2046 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2047 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
2048 /*
2049 * Panic if we notice inconsistencies in the socket's
2050 * receive list; both sb_mb and sb_cc should correctly
2051 * reflect the contents of the list, otherwise we may
2052 * end up with false positives during select() or poll()
2053 * which could put the application in a bad state.
2054 */
2055 if (m == NULL && so->so_rcv.sb_cc != 0)
2056 panic("soreceive corrupted so_rcv: m %p cc %u",
2057 m, so->so_rcv.sb_cc);
2058
2059 if (so->so_error) {
2060 if (m)
2061 goto dontblock;
2062 error = so->so_error;
2063 if ((flags & MSG_PEEK) == 0)
2064 so->so_error = 0;
2065 goto release;
2066 }
2067 if (so->so_state & SS_CANTRCVMORE) {
2068 if (m)
2069 goto dontblock;
2070 else
2071 goto release;
2072 }
2073 for (; m; m = m->m_next)
2074 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2075 m = so->so_rcv.sb_mb;
2076 goto dontblock;
2077 }
2078 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2079 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2080 error = ENOTCONN;
2081 goto release;
2082 }
2083 if (uio_resid(uio) == 0)
2084 goto release;
2085 if ((so->so_state & SS_NBIO) ||
2086 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2087 error = EWOULDBLOCK;
2088 goto release;
2089 }
2090 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2091 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2092 sbunlock(&so->so_rcv, 1);
2093 #if EVEN_MORE_LOCKING_DEBUG
2094 if (socket_debug)
2095 printf("Waiting for socket data\n");
2096 #endif
2097
2098 error = sbwait(&so->so_rcv);
2099 #if EVEN_MORE_LOCKING_DEBUG
2100 if (socket_debug)
2101 printf("SORECEIVE - sbwait returned %d\n", error);
2102 #endif
2103 if (so->so_usecount < 1)
2104 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2105 "socket\n", so, so->so_usecount);
2106 if (error) {
2107 socket_unlock(so, 1);
2108 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2109 0, 0, 0, 0);
2110 return (error);
2111 }
2112 goto restart;
2113 }
2114 dontblock:
2115 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2116 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2117 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2118 nextrecord = m->m_nextpkt;
2119 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2120 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2121 #if CONFIG_MACF_SOCKET_SUBSET
2122 /*
2123 * Call the MAC framework for policy checking if we're in
2124 * the user process context and the socket isn't connected.
2125 */
2126 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2127 struct mbuf *m0 = m;
2128 /*
2129 * Dequeue this record (temporarily) from the receive
2130 * list since we're about to drop the socket's lock
2131 * where a new record may arrive and be appended to
2132 * the list. Upon MAC policy failure, the record
2133 * will be freed. Otherwise, we'll add it back to
2134 * the head of the list. We cannot rely on SB_LOCK
2135 * because the append operation uses the socket's lock.
2136 */
2137 do {
2138 m->m_nextpkt = NULL;
2139 sbfree(&so->so_rcv, m);
2140 m = m->m_next;
2141 } while (m != NULL);
2142 m = m0;
2143 so->so_rcv.sb_mb = nextrecord;
2144 SB_EMPTY_FIXUP(&so->so_rcv);
2145 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2146 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2147 socket_unlock(so, 0);
2148 if (mac_socket_check_received(proc_ucred(p), so,
2149 mtod(m, struct sockaddr *)) != 0) {
2150 /*
2151 * MAC policy failure; free this record and
2152 * process the next record (or block until
2153 * one is available). We have adjusted sb_cc
2154 * and sb_mbcnt above so there is no need to
2155 * call sbfree() again.
2156 */
2157 do {
2158 m = m_free(m);
2159 } while (m != NULL);
2160 /*
2161 * Clear SB_LOCK but don't unlock the socket.
2162 * Process the next record or wait for one.
2163 */
2164 socket_lock(so, 0);
2165 sbunlock(&so->so_rcv, 1);
2166 goto restart;
2167 }
2168 socket_lock(so, 0);
2169 /*
2170 * If the socket has been defunct'd, drop it.
2171 */
2172 if (so->so_flags & SOF_DEFUNCT) {
2173 m_freem(m);
2174 error = ENOTCONN;
2175 goto release;
2176 }
2177 /*
2178 * Re-adjust the socket receive list and re-enqueue
2179 * the record in front of any packets which may have
2180 * been appended while we dropped the lock.
2181 */
2182 for (m = m0; m->m_next != NULL; m = m->m_next)
2183 sballoc(&so->so_rcv, m);
2184 sballoc(&so->so_rcv, m);
2185 if (so->so_rcv.sb_mb == NULL) {
2186 so->so_rcv.sb_lastrecord = m0;
2187 so->so_rcv.sb_mbtail = m;
2188 }
2189 m = m0;
2190 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2191 so->so_rcv.sb_mb = m;
2192 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2193 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2194 }
2195 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2196 orig_resid = 0;
2197 if (psa) {
2198 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2199 mp0 == 0);
2200 if ((*psa == 0) && (flags & MSG_NEEDSA)) {
2201 error = EWOULDBLOCK;
2202 goto release;
2203 }
2204 }
2205 if (flags & MSG_PEEK) {
2206 m = m->m_next;
2207 } else {
2208 sbfree(&so->so_rcv, m);
2209 if (m->m_next == 0 && so->so_rcv.sb_cc != 0)
2210 panic("soreceive: about to create invalid "
2211 "socketbuf");
2212 MFREE(m, so->so_rcv.sb_mb);
2213 m = so->so_rcv.sb_mb;
2214 if (m != NULL) {
2215 m->m_nextpkt = nextrecord;
2216 } else {
2217 so->so_rcv.sb_mb = nextrecord;
2218 SB_EMPTY_FIXUP(&so->so_rcv);
2219 }
2220 }
2221 }
2222
2223 /*
2224 * Process one or more MT_CONTROL mbufs present before any data mbufs
2225 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2226 * just copy the data; if !MSG_PEEK, we call into the protocol to
2227 * perform externalization.
2228 */
2229 if (m != NULL && m->m_type == MT_CONTROL) {
2230 struct mbuf *cm = NULL, *cmn;
2231 struct mbuf **cme = &cm;
2232 struct sockbuf *sb_rcv = &so->so_rcv;
2233 struct mbuf **msgpcm = NULL;
2234
2235 /*
2236 * Externalizing the control messages would require us to
2237 * drop the socket's lock below. Once we re-acquire the
2238 * lock, the mbuf chain might change. In order to preserve
2239 * consistency, we unlink all control messages from the
2240 * first mbuf chain in one shot and link them separately
2241 * onto a different chain.
2242 */
2243 do {
2244 if (flags & MSG_PEEK) {
2245 if (controlp != NULL) {
2246 if (*controlp == NULL) {
2247 msgpcm = controlp;
2248 }
2249 *controlp = m_copy(m, 0, m->m_len);
2250
2251 /* If we failed to allocate an mbuf,
2252 * release any previously allocated
2253 * mbufs for control data. Return
2254 * an error. Keep the mbufs in the
2255 * socket as this is using
2256 * the MSG_PEEK flag.
2257 */
2258 if (*controlp == NULL) {
2259 m_freem(*msgpcm);
2260 error = ENOBUFS;
2261 goto release;
2262 }
2263 controlp = &(*controlp)->m_next;
2264 }
2265 m = m->m_next;
2266 } else {
2267 m->m_nextpkt = NULL;
2268 sbfree(sb_rcv, m);
2269 sb_rcv->sb_mb = m->m_next;
2270 m->m_next = NULL;
2271 *cme = m;
2272 cme = &(*cme)->m_next;
2273 m = sb_rcv->sb_mb;
2274 }
2275 } while (m != NULL && m->m_type == MT_CONTROL);
2276
2277 if (!(flags & MSG_PEEK)) {
2278 if (sb_rcv->sb_mb != NULL) {
2279 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2280 } else {
2281 sb_rcv->sb_mb = nextrecord;
2282 SB_EMPTY_FIXUP(sb_rcv);
2283 }
2284 if (nextrecord == NULL)
2285 sb_rcv->sb_lastrecord = m;
2286 }
2287
2288 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2289 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2290
2291 while (cm != NULL) {
2292 int cmsg_type;
2293
2294 cmn = cm->m_next;
2295 cm->m_next = NULL;
2296 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2297
2298 /*
2299 * Call the protocol to externalize SCM_RIGHTS message
2300 * and return the modified message to the caller upon
2301 * success. Otherwise, all other control messages are
2302 * returned unmodified to the caller. Note that we
2303 * only get into this loop if MSG_PEEK is not set.
2304 */
2305 if (pr->pr_domain->dom_externalize != NULL &&
2306 cmsg_type == SCM_RIGHTS) {
2307 /*
2308 * Release socket lock: see 3903171. This
2309 * would also allow more records to be appended
2310 * to the socket buffer. We still have SB_LOCK
2311 * set on it, so we can be sure that the head
2312 * of the mbuf chain won't change.
2313 */
2314 socket_unlock(so, 0);
2315 error = (*pr->pr_domain->dom_externalize)(cm);
2316 socket_lock(so, 0);
2317 } else {
2318 error = 0;
2319 }
2320
2321 if (controlp != NULL && error == 0) {
2322 *controlp = cm;
2323 controlp = &(*controlp)->m_next;
2324 orig_resid = 0;
2325 } else {
2326 (void) m_free(cm);
2327 }
2328 cm = cmn;
2329 }
2330 orig_resid = 0;
2331 if (sb_rcv->sb_mb != NULL)
2332 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2333 else
2334 nextrecord = NULL;
2335 }
2336
2337 if (m != NULL) {
2338 if (!(flags & MSG_PEEK)) {
2339 /*
2340 * We get here because m points to an mbuf following
2341 * any MT_SONAME or MT_CONTROL mbufs which have been
2342 * processed above. In any case, m should be pointing
2343 * to the head of the mbuf chain, and the nextrecord
2344 * should be either NULL or equal to m->m_nextpkt.
2345 * See comments above about SB_LOCK.
2346 */
2347 if (m != so->so_rcv.sb_mb || m->m_nextpkt != nextrecord)
2348 panic("soreceive: post-control !sync so=%p "
2349 "m=%p nextrecord=%p\n", so, m, nextrecord);
2350
2351 if (nextrecord == NULL)
2352 so->so_rcv.sb_lastrecord = m;
2353 }
2354 type = m->m_type;
2355 if (type == MT_OOBDATA)
2356 flags |= MSG_OOB;
2357 } else {
2358 if (!(flags & MSG_PEEK)) {
2359 so->so_rcv.sb_mb = nextrecord;
2360 SB_EMPTY_FIXUP(&so->so_rcv);
2361 }
2362 }
2363 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2364 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2365
2366 moff = 0;
2367 offset = 0;
2368
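/*
 * Decide up front whether consumed mbufs may be batched onto free_list
 * and copied out later by sodelayed_copy() with the socket unlocked;
 * this is only done when actually consuming data (not MSG_PEEK) and
 * when there is enough of it to make dropping and retaking the lock
 * worthwhile.
 */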
2369 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2370 can_delay = 1;
2371 else
2372 can_delay = 0;
2373
2374 need_event = 0;
2375
2376 while (m && (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2377 if (m->m_type == MT_OOBDATA) {
2378 if (type != MT_OOBDATA)
2379 break;
2380 } else if (type == MT_OOBDATA) {
2381 break;
2382 }
2383 /*
2384 * Make sure to always set MSG_OOB when getting
2385 * out-of-band data inline.
2386 */
2387 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2388 (so->so_options & SO_OOBINLINE) != 0 &&
2389 (so->so_state & SS_RCVATMARK) != 0) {
2390 flags |= MSG_OOB;
2391 }
2392 so->so_state &= ~SS_RCVATMARK;
2393 len = uio_resid(uio) - delayed_copy_len;
2394 if (so->so_oobmark && len > so->so_oobmark - offset)
2395 len = so->so_oobmark - offset;
2396 if (len > m->m_len - moff)
2397 len = m->m_len - moff;
2398 /*
2399 * If mp is set, just pass back the mbufs.
2400 * Otherwise copy them out via the uio, then free.
2401 * Sockbuf must be consistent here (sb_mb points to the current
2402 * record and its m_nextpkt to the next) when we drop priority;
2403 * we must note any additions to the sockbuf when we
2404 * block interrupts again.
2405 */
2406 if (mp == 0) {
2407 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2408 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2409 if (can_delay && len == m->m_len) {
2410 /*
2411 * only delay the copy if we're consuming the
2412 * mbuf and we're NOT in MSG_PEEK mode
2413 * and we have enough data to make it worthwhile
2414 * to drop and retake the lock... can_delay
2415 * reflects the state of the two latter
2416 * constraints; moff should always be zero
2417 * in these cases
2418 */
2419 delayed_copy_len += len;
2420 } else {
2421 if (delayed_copy_len) {
2422 error = sodelayed_copy(so, uio,
2423 &free_list, &delayed_copy_len);
2424
2425 if (error) {
2426 goto release;
2427 }
2428 /*
2429 * can only get here if MSG_PEEK is not
2430 * set; therefore, m should point at the
2431 * head of the rcv queue; if it doesn't,
2432 * it means something drastically
2433 * changed while we were out from behind
2434 * the lock in sodelayed_copy. perhaps
2435 * a RST on the stream. in any event,
2436 * the stream has been interrupted. it's
2437 * probably best just to return whatever
2438 * data we've moved and let the caller
2439 * sort it out...
2440 */
2441 if (m != so->so_rcv.sb_mb) {
2442 break;
2443 }
2444 }
2445 socket_unlock(so, 0);
2446 error = uiomove(mtod(m, caddr_t) + moff,
2447 (int)len, uio);
2448 socket_lock(so, 0);
2449
2450 if (error)
2451 goto release;
2452 }
2453 } else {
2454 uio_setresid(uio, (uio_resid(uio) - len));
2455 }
2456 if (len == m->m_len - moff) {
2457 if (m->m_flags & M_EOR)
2458 flags |= MSG_EOR;
2459 if (flags & MSG_PEEK) {
2460 m = m->m_next;
2461 moff = 0;
2462 } else {
2463 nextrecord = m->m_nextpkt;
2464 sbfree(&so->so_rcv, m);
2465 m->m_nextpkt = NULL;
2466
2467 if (mp) {
2468 *mp = m;
2469 mp = &m->m_next;
2470 so->so_rcv.sb_mb = m = m->m_next;
2471 *mp = (struct mbuf *)0;
2472 } else {
2473 if (free_list == NULL)
2474 free_list = m;
2475 else
2476 ml->m_next = m;
2477 ml = m;
2478 so->so_rcv.sb_mb = m = m->m_next;
2479 ml->m_next = 0;
2480 }
2481 if (m != NULL) {
2482 m->m_nextpkt = nextrecord;
2483 if (nextrecord == NULL)
2484 so->so_rcv.sb_lastrecord = m;
2485 } else {
2486 so->so_rcv.sb_mb = nextrecord;
2487 SB_EMPTY_FIXUP(&so->so_rcv);
2488 }
2489 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2490 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2491 }
2492 } else {
2493 if (flags & MSG_PEEK) {
2494 moff += len;
2495 } else {
2496 if (mp != NULL) {
2497 int copy_flag;
2498
2499 if (flags & MSG_DONTWAIT)
2500 copy_flag = M_DONTWAIT;
2501 else
2502 copy_flag = M_WAIT;
2503 *mp = m_copym(m, 0, len, copy_flag);
2504 if (*mp == NULL) {
2505 /*
2506 * Failed to allocate an mbuf.
2507 * Adjust uio_resid back, it was
2508 * adjusted down by len bytes which
2509 * we didn't copy over
2510 */
2511 uio_setresid(uio, (uio_resid(uio) + len));
2512 break;
2513 }
2514 }
2515 m->m_data += len;
2516 m->m_len -= len;
2517 so->so_rcv.sb_cc -= len;
2518 }
2519 }
2520 if (so->so_oobmark) {
2521 if ((flags & MSG_PEEK) == 0) {
2522 so->so_oobmark -= len;
2523 if (so->so_oobmark == 0) {
2524 so->so_state |= SS_RCVATMARK;
2525 /*
2526 * delay posting the actual event until
2527 * after any delayed copy processing
2528 * has finished
2529 */
2530 need_event = 1;
2531 break;
2532 }
2533 } else {
2534 offset += len;
2535 if (offset == so->so_oobmark)
2536 break;
2537 }
2538 }
2539 if (flags & MSG_EOR)
2540 break;
2541 /*
2542 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2543 * (for non-atomic socket), we must not quit until
2544 * "uio->uio_resid == 0" or an error termination.
2545 * If a signal/timeout occurs, return with a short
2546 * count but without error. Keep sockbuf locked
2547 * against other readers.
2548 */
2549 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
2550 (uio_resid(uio) - delayed_copy_len) > 0 &&
2551 !sosendallatonce(so) && !nextrecord) {
2552 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2553 goto release;
2554
2555 /*
2556 * Depending on the protocol (e.g. TCP), the following
2557 * might cause the socket lock to be dropped and later
2558 * be reacquired, and more data could have arrived and
2559 * have been appended to the receive socket buffer by
2560 * the time it returns. Therefore, we only sleep in
2561 * sbwait() below when the socket buffer is
2562 * empty, in order to avoid a false sleep.
2563 */
2564 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2565 (((struct inpcb *)so->so_pcb)->inp_state !=
2566 INPCB_STATE_DEAD))
2567 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2568
2569 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2570 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2571
2572 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2573 error = 0;
2574 goto release;
2575 }
2576 /*
2577 * have to wait until after we get back from the sbwait
2578 * to do the copy because we will drop the lock if we
2579 * have enough data that has been delayed... by dropping
2580 * the lock we open up a window allowing the netisr
2581 * thread to process the incoming packets and to change
2582 * the state of this socket... we're issuing the sbwait
2583 * because the socket is empty and we're expecting the
2584 * netisr thread to wake us up when more packets arrive;
2585 * if we allow that processing to happen and then sbwait
2586 * we could stall forever with packets sitting in the
2587 * socket if no further packets arrive from the remote
2588 * side.
2589 *
2590 * we want to copy before we've collected all the data
2591 * to satisfy this request to allow the copy to overlap
2592 * the incoming packet processing on an MP system
2593 */
2594 if (delayed_copy_len > sorecvmincopy &&
2595 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2596 error = sodelayed_copy(so, uio,
2597 &free_list, &delayed_copy_len);
2598
2599 if (error)
2600 goto release;
2601 }
2602 m = so->so_rcv.sb_mb;
2603 if (m) {
2604 nextrecord = m->m_nextpkt;
2605 }
2606 }
2607 }
2608 #ifdef MORE_LOCKING_DEBUG
2609 if (so->so_usecount <= 1)
2610 panic("soreceive: after big while so=%p ref=%d on socket\n",
2611 so, so->so_usecount);
2612 #endif
2613
2614 if (m && pr->pr_flags & PR_ATOMIC) {
2615 #ifdef __APPLE__
2616 if (so->so_options & SO_DONTTRUNC) {
2617 flags |= MSG_RCVMORE;
2618 } else {
2619 #endif
2620 flags |= MSG_TRUNC;
2621 if ((flags & MSG_PEEK) == 0)
2622 (void) sbdroprecord(&so->so_rcv);
2623 #ifdef __APPLE__
2624 }
2625 #endif
2626 }
2627
2628 /*
2629 * pru_rcvd below (for TCP) may cause more data to be received
2630 * if the socket lock is dropped prior to sending the ACK; some
2631 * legacy OpenTransport applications don't handle this well
2632 * (if it receives less data than requested while MSG_HAVEMORE
2633 * is set), and so we set the flag now based on what we know
2634 * prior to calling pru_rcvd.
2635 */
2636 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2637 flags |= MSG_HAVEMORE;
2638
2639 if ((flags & MSG_PEEK) == 0) {
2640 if (m == 0) {
2641 so->so_rcv.sb_mb = nextrecord;
2642 /*
2643 * First part is an inline SB_EMPTY_FIXUP(). Second
2644 * part makes sure sb_lastrecord is up-to-date if
2645 * there is still data in the socket buffer.
2646 */
2647 if (so->so_rcv.sb_mb == NULL) {
2648 so->so_rcv.sb_mbtail = NULL;
2649 so->so_rcv.sb_lastrecord = NULL;
2650 } else if (nextrecord->m_nextpkt == NULL) {
2651 so->so_rcv.sb_lastrecord = nextrecord;
2652 }
2653 }
2654 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2655 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2656 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2657 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2658 }
2659 #ifdef __APPLE__
2660 if (delayed_copy_len) {
2661 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2662
2663 if (error)
2664 goto release;
2665 }
2666 if (free_list) {
2667 m_freem_list((struct mbuf *)free_list);
2668 free_list = (struct mbuf *)0;
2669 }
2670 if (need_event)
2671 postevent(so, 0, EV_OOB);
2672 #endif
2673 if (orig_resid == uio_resid(uio) && orig_resid &&
2674 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2675 sbunlock(&so->so_rcv, 1);
2676 goto restart;
2677 }
2678
2679 if (flagsp)
2680 *flagsp |= flags;
2681 release:
2682 #ifdef MORE_LOCKING_DEBUG
2683 if (so->so_usecount <= 1)
2684 panic("soreceive: release so=%p ref=%d on socket\n",
2685 so, so->so_usecount);
2686 #endif
2687 if (delayed_copy_len) {
2688 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2689 }
2690 if (free_list) {
2691 m_freem_list((struct mbuf *)free_list);
2692 }
2693 sbunlock(&so->so_rcv, 0); /* will unlock socket */
2694
2695 // LP64todo - fix this!
2696 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2697 so->so_rcv.sb_cc, 0, error);
2698
2699 return (error);
2700 }
2701
2702 /*
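 * sodelayed_copy
 *
 * Copy the data in the accumulated <free_list> mbuf chain out to the
 * caller's uio with the socket unlocked, then free the chain and reset
 * the delayed-copy byte count.
 *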
2703 * Returns: 0 Success
2704 * uiomove:EFAULT
2705 */
2706 static int
2707 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2708 int *resid)
2709 {
2710 int error = 0;
2711 struct mbuf *m;
2712
2713 m = *free_list;
2714
2715 socket_unlock(so, 0);
2716
2717 while (m && error == 0) {
2718
2719 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2720
2721 m = m->m_next;
2722 }
2723 m_freem_list(*free_list);
2724
2725 *free_list = (struct mbuf *)NULL;
2726 *resid = 0;
2727
2728 socket_lock(so, 0);
2729
2730 return (error);
2731 }
2732
2733
2734 /*
2735 * Returns: 0 Success
2736 * EINVAL
2737 * ENOTCONN
2738 * <pru_shutdown>:EINVAL
2739 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2740 * <pru_shutdown>:ENOBUFS[TCP]
2741 * <pru_shutdown>:EMSGSIZE[TCP]
2742 * <pru_shutdown>:EHOSTUNREACH[TCP]
2743 * <pru_shutdown>:ENETUNREACH[TCP]
2744 * <pru_shutdown>:ENETDOWN[TCP]
2745 * <pru_shutdown>:ENOMEM[TCP]
2746 * <pru_shutdown>:EACCES[TCP]
2747 * <pru_shutdown>:EMSGSIZE[TCP]
2748 * <pru_shutdown>:ENOBUFS[TCP]
2749 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2750 * <pru_shutdown>:??? [other protocol families]
2751 */
2752 int
2753 soshutdown(struct socket *so, int how)
2754 {
2755 int error;
2756
2757 switch (how) {
2758 case SHUT_RD:
2759 case SHUT_WR:
2760 case SHUT_RDWR:
2761 socket_lock(so, 1);
2762 if ((so->so_state &
2763 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
2764 error = ENOTCONN;
2765 } else {
2766 error = soshutdownlock(so, how);
2767 }
2768 socket_unlock(so, 1);
2769 break;
2770 default:
2771 error = EINVAL;
2772 break;
2773 }
2774
2775 return (error);
2776 }
2777
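/*
 * Like soshutdown(), but assumes the socket is already locked by the
 * caller.  Notifies any attached socket filters, then shuts down the
 * read side (flushing pending data) and/or the write side (via the
 * protocol's pru_shutdown) as requested by <how>.
 */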
2778 int
2779 soshutdownlock(struct socket *so, int how)
2780 {
2781 struct protosw *pr = so->so_proto;
2782 int error = 0;
2783
2784 sflt_notify(so, sock_evt_shutdown, &how);
2785
2786 if (how != SHUT_WR) {
2787 if ((so->so_state & SS_CANTRCVMORE) != 0) {
2788 /* read already shut down */
2789 error = ENOTCONN;
2790 goto done;
2791 }
2792 sorflush(so);
2793 postevent(so, 0, EV_RCLOSED);
2794 }
2795 if (how != SHUT_RD) {
2796 if ((so->so_state & SS_CANTSENDMORE) != 0) {
2797 /* write already shut down */
2798 error = ENOTCONN;
2799 goto done;
2800 }
2801 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2802 postevent(so, 0, EV_WCLOSED);
2803 }
2804 done:
2805 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2806 return (error);
2807 }
2808
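/*
 * Discard anything queued on the receive side of the socket: mark it
 * unable to receive more data, reset the receive sockbuf (preserving the
 * knote list and the SB_DROP/SB_UNIX flags), let the domain dispose of
 * any in-flight rights, and release the old mbufs.
 */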
2809 void
2810 sorflush(struct socket *so)
2811 {
2812 register struct sockbuf *sb = &so->so_rcv;
2813 register struct protosw *pr = so->so_proto;
2814 struct sockbuf asb;
2815
2816 #ifdef MORE_LOCKING_DEBUG
2817 lck_mtx_t *mutex_held;
2818
2819 if (so->so_proto->pr_getlock != NULL)
2820 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2821 else
2822 mutex_held = so->so_proto->pr_domain->dom_mtx;
2823 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2824 #endif
2825
2826 sflt_notify(so, sock_evt_flush_read, NULL);
2827
2828 sb->sb_flags |= SB_NOINTR;
2829 (void) sblock(sb, M_WAIT);
2830 socantrcvmore(so);
2831 sbunlock(sb, 1);
2832 #ifdef __APPLE__
2833 selthreadclear(&sb->sb_sel);
2834 #endif
2835 asb = *sb;
2836 bzero((caddr_t)sb, sizeof (*sb));
2837 sb->sb_so = so; /* reestablish link to socket */
2838 if (asb.sb_flags & SB_KNOTE) {
2839 sb->sb_sel.si_note = asb.sb_sel.si_note;
2840 sb->sb_flags = SB_KNOTE;
2841 }
2842 if (asb.sb_flags & SB_DROP)
2843 sb->sb_flags |= SB_DROP;
2844 if (asb.sb_flags & SB_UNIX)
2845 sb->sb_flags |= SB_UNIX;
2846 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
2847 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
2848 }
2849 sbrelease(&asb);
2850 }
2851
2852 /*
2853 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2854 * an additional variant to handle the case where the option value needs
2855 * to be some kind of integer, but not a specific size.
2856 * In addition to their use here, these functions are also called by the
2857 * protocol-level pr_ctloutput() routines.
2858 *
2859 * Returns: 0 Success
2860 * EINVAL
2861 * copyin:EFAULT
2862 */
2863 int
2864 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2865 {
2866 size_t valsize;
2867
2868 /*
2869 * If the user gives us more than we wanted, we ignore it,
2870 * but if we don't get the minimum length the caller
2871 * wants, we return EINVAL. On success, sopt->sopt_valsize
2872 * is set to however much we actually retrieved.
2873 */
2874 if ((valsize = sopt->sopt_valsize) < minlen)
2875 return (EINVAL);
2876 if (valsize > len)
2877 sopt->sopt_valsize = valsize = len;
2878
2879 if (sopt->sopt_p != kernproc)
2880 return (copyin(sopt->sopt_val, buf, valsize));
2881
2882 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2883 return (0);
2884 }
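
/*
 * Illustrative use (a minimal sketch, not code from this file): a
 * protocol's pr_ctloutput() handler for an integer-valued option would
 * typically do
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *	if (error == 0)
 *		... act on optval ...
 *
 * which is the same pattern sosetopt() uses below for SOL_SOCKET options.
 */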
2885
2886 /*
2887 * sooptcopyin_timeval
2888 * Copy in a timeval value into tv_p, and take into account whether the
2889 * calling process is 64-bit or 32-bit. Moved the sanity checking
2890 * code here so that we can verify the 64-bit tv_sec value before we lose
2891 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
2892 */
2893 static int
2894 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
2895 {
2896 int error;
2897
2898 if (proc_is64bit(sopt->sopt_p)) {
2899 struct user64_timeval tv64;
2900
2901 if (sopt->sopt_valsize < sizeof(tv64)) {
2902 return (EINVAL);
2903 }
2904 sopt->sopt_valsize = sizeof(tv64);
2905 if (sopt->sopt_p != kernproc) {
2906 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
2907 if (error != 0)
2908 return (error);
2909 } else {
2910 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
2911 sizeof(tv64));
2912 }
2913 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
2914 || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
2915 return (EDOM);
2916 }
2917 tv_p->tv_sec = tv64.tv_sec;
2918 tv_p->tv_usec = tv64.tv_usec;
2919 } else {
2920 struct user32_timeval tv32;
2921
2922 if (sopt->sopt_valsize < sizeof(tv32)) {
2923 return (EINVAL);
2924 }
2925 sopt->sopt_valsize = sizeof(tv32);
2926 if (sopt->sopt_p != kernproc) {
2927 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2928 if (error != 0) {
2929 return (error);
2930 }
2931 } else {
2932 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
2933 sizeof(tv32));
2934 }
2935 #ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
2936 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
2937 || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
2938 return (EDOM);
2939 }
2940 #endif
2941 tv_p->tv_sec = tv32.tv_sec;
2942 tv_p->tv_usec = tv32.tv_usec;
2943 }
2944 return (0);
2945 }
2946
2947 /*
2948 * Returns: 0 Success
2949 * EINVAL
2950 * ENOPROTOOPT
2951 * ENOBUFS
2952 * EDOM
2953 * sooptcopyin:EINVAL
2954 * sooptcopyin:EFAULT
2955 * sooptcopyin_timeval:EINVAL
2956 * sooptcopyin_timeval:EFAULT
2957 * sooptcopyin_timeval:EDOM
2958 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2959 * <pr_ctloutput>:???
2960 * sflt_attach_private:??? [whatever a filter author chooses]
2961 * <sf_setoption>:??? [whatever a filter author chooses]
2962 *
2963 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
2964 * <sf_setoption> returns depend on what the filter author causes
2965 * their filter to return.
2966 */
2967 int
2968 sosetopt(struct socket *so, struct sockopt *sopt)
2969 {
2970 int error, optval;
2971 struct linger l;
2972 struct timeval tv;
2973 #if CONFIG_MACF_SOCKET
2974 struct mac extmac;
2975 #endif /* MAC_SOCKET */
2976
2977 socket_lock(so, 1);
2978 so_update_last_owner_locked(so, NULL);
2979
2980 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
2981 == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
2982 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2983 /* the socket has been shutdown, no more sockopt's */
2984 error = EINVAL;
2985 goto bad;
2986 }
2987
2988 if (sopt->sopt_dir != SOPT_SET) {
2989 sopt->sopt_dir = SOPT_SET;
2990 }
2991
2992 error = sflt_setsockopt(so, sopt);
2993 if (error) {
2994 if (error == EJUSTRETURN)
2995 error = 0;
2996 goto bad;
2997 }
2998
2999 error = 0;
3000 if (sopt->sopt_level != SOL_SOCKET) {
3001 if (so->so_proto && so->so_proto->pr_ctloutput) {
3002 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3003 socket_unlock(so, 1);
3004 return (error);
3005 }
3006 error = ENOPROTOOPT;
3007 } else {
3008 switch (sopt->sopt_name) {
3009 case SO_LINGER:
3010 case SO_LINGER_SEC:
3011 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3012 if (error)
3013 goto bad;
3014
3015 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3016 l.l_linger : l.l_linger * hz;
3017 if (l.l_onoff)
3018 so->so_options |= SO_LINGER;
3019 else
3020 so->so_options &= ~SO_LINGER;
3021 break;
3022
3023 case SO_DEBUG:
3024 case SO_KEEPALIVE:
3025 case SO_DONTROUTE:
3026 case SO_USELOOPBACK:
3027 case SO_BROADCAST:
3028 case SO_REUSEADDR:
3029 case SO_REUSEPORT:
3030 case SO_OOBINLINE:
3031 case SO_TIMESTAMP:
3032 case SO_TIMESTAMP_MONOTONIC:
3033 #ifdef __APPLE__
3034 case SO_DONTTRUNC:
3035 case SO_WANTMORE:
3036 case SO_WANTOOBFLAG:
3037 #endif
3038 error = sooptcopyin(sopt, &optval, sizeof (optval),
3039 sizeof (optval));
3040 if (error)
3041 goto bad;
3042 if (optval)
3043 so->so_options |= sopt->sopt_name;
3044 else
3045 so->so_options &= ~sopt->sopt_name;
3046 break;
3047
3048 case SO_SNDBUF:
3049 case SO_RCVBUF:
3050 case SO_SNDLOWAT:
3051 case SO_RCVLOWAT:
3052 error = sooptcopyin(sopt, &optval, sizeof (optval),
3053 sizeof (optval));
3054 if (error)
3055 goto bad;
3056
3057 /*
3058 * Values < 1 make no sense for any of these
3059 * options, so disallow them.
3060 */
3061 if (optval < 1) {
3062 error = EINVAL;
3063 goto bad;
3064 }
3065
3066 switch (sopt->sopt_name) {
3067 case SO_SNDBUF:
3068 case SO_RCVBUF:
3069 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
3070 &so->so_snd : &so->so_rcv,
3071 (u_int32_t) optval) == 0) {
3072 error = ENOBUFS;
3073 goto bad;
3074 }
3075 if (sopt->sopt_name == SO_SNDBUF)
3076 so->so_snd.sb_flags |= SB_USRSIZE;
3077 else
3078 so->so_rcv.sb_flags |= SB_USRSIZE;
3079 break;
3080
3081 /*
3082 * Make sure the low-water is never greater than
3083 * the high-water.
3084 */
3085 case SO_SNDLOWAT:
3086 so->so_snd.sb_lowat =
3087 (optval > so->so_snd.sb_hiwat) ?
3088 so->so_snd.sb_hiwat : optval;
3089 break;
3090 case SO_RCVLOWAT:
3091 so->so_rcv.sb_lowat =
3092 (optval > so->so_rcv.sb_hiwat) ?
3093 so->so_rcv.sb_hiwat : optval;
3094 break;
3095 }
3096 break;
3097
3098 case SO_SNDTIMEO:
3099 case SO_RCVTIMEO:
3100 error = sooptcopyin_timeval(sopt, &tv);
3101 if (error)
3102 goto bad;
3103
3104 switch (sopt->sopt_name) {
3105 case SO_SNDTIMEO:
3106 so->so_snd.sb_timeo = tv;
3107 break;
3108 case SO_RCVTIMEO:
3109 so->so_rcv.sb_timeo = tv;
3110 break;
3111 }
3112 break;
3113
3114 case SO_NKE:
3115 {
3116 struct so_nke nke;
3117
3118 error = sooptcopyin(sopt, &nke, sizeof (nke),
3119 sizeof (nke));
3120 if (error)
3121 goto bad;
3122
3123 error = sflt_attach_internal(so, nke.nke_handle);
3124 break;
3125 }
3126
3127 case SO_NOSIGPIPE:
3128 error = sooptcopyin(sopt, &optval, sizeof (optval),
3129 sizeof (optval));
3130 if (error)
3131 goto bad;
3132 if (optval)
3133 so->so_flags |= SOF_NOSIGPIPE;
3134 else
3135 so->so_flags &= ~SOF_NOSIGPIPE;
3136
3137 break;
3138
3139 case SO_NOADDRERR:
3140 error = sooptcopyin(sopt, &optval, sizeof (optval),
3141 sizeof (optval));
3142 if (error)
3143 goto bad;
3144 if (optval)
3145 so->so_flags |= SOF_NOADDRAVAIL;
3146 else
3147 so->so_flags &= ~SOF_NOADDRAVAIL;
3148
3149 break;
3150
3151 case SO_REUSESHAREUID:
3152 error = sooptcopyin(sopt, &optval, sizeof (optval),
3153 sizeof (optval));
3154 if (error)
3155 goto bad;
3156 if (optval)
3157 so->so_flags |= SOF_REUSESHAREUID;
3158 else
3159 so->so_flags &= ~SOF_REUSESHAREUID;
3160 break;
3161 #ifdef __APPLE_API_PRIVATE
3162 case SO_NOTIFYCONFLICT:
3163 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3164 error = EPERM;
3165 goto bad;
3166 }
3167 error = sooptcopyin(sopt, &optval, sizeof (optval),
3168 sizeof (optval));
3169 if (error)
3170 goto bad;
3171 if (optval)
3172 so->so_flags |= SOF_NOTIFYCONFLICT;
3173 else
3174 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3175 break;
3176 #endif
3177 case SO_RESTRICTIONS:
3178 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3179 error = EPERM;
3180 goto bad;
3181 }
3182 error = sooptcopyin(sopt, &optval, sizeof (optval),
3183 sizeof (optval));
3184 if (error)
3185 goto bad;
3186 so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
3187 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
3188 break;
3189
3190 case SO_LABEL:
3191 #if CONFIG_MACF_SOCKET
3192 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3193 sizeof (extmac))) != 0)
3194 goto bad;
3195
3196 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3197 so, &extmac);
3198 #else
3199 error = EOPNOTSUPP;
3200 #endif /* MAC_SOCKET */
3201 break;
3202
3203 #ifdef __APPLE_API_PRIVATE
3204 case SO_UPCALLCLOSEWAIT:
3205 error = sooptcopyin(sopt, &optval, sizeof (optval),
3206 sizeof (optval));
3207 if (error)
3208 goto bad;
3209 if (optval)
3210 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3211 else
3212 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3213 break;
3214 #endif
3215
3216 case SO_RANDOMPORT:
3217 error = sooptcopyin(sopt, &optval, sizeof (optval),
3218 sizeof (optval));
3219 if (error)
3220 goto bad;
3221 if (optval)
3222 so->so_flags |= SOF_BINDRANDOMPORT;
3223 else
3224 so->so_flags &= ~SOF_BINDRANDOMPORT;
3225 break;
3226
3227 case SO_NP_EXTENSIONS: {
3228 struct so_np_extensions sonpx;
3229
3230 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
3231 if (error)
3232 goto bad;
3233 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3234 error = EINVAL;
3235 goto bad;
3236 }
3237 /*
3238 * Only one bit defined for now
3239 */
3240 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3241 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3242 so->so_flags |= SOF_NPX_SETOPTSHUT;
3243 else
3244 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3245 }
3246 break;
3247 }
3248
3249 case SO_TRAFFIC_CLASS: {
3250 error = sooptcopyin(sopt, &optval, sizeof (optval),
3251 sizeof (optval));
3252 if (error)
3253 goto bad;
3254 error = so_set_traffic_class(so, optval);
3255 if (error)
3256 goto bad;
3257 break;
3258 }
3259
3260 case SO_RECV_TRAFFIC_CLASS: {
3261 error = sooptcopyin(sopt, &optval, sizeof (optval),
3262 sizeof (optval));
3263 if (error)
3264 goto bad;
3265 if (optval == 0)
3266 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3267 else
3268 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3269 break;
3270 }
3271
3272 case SO_TRAFFIC_CLASS_DBG: {
3273 struct so_tcdbg so_tcdbg;
3274
3275 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
3276 sizeof (struct so_tcdbg));
3277 if (error)
3278 goto bad;
3279 error = so_set_tcdbg(so, &so_tcdbg);
3280 if (error)
3281 goto bad;
3282 break;
3283 }
3284
3285 case SO_DEFUNCTOK:
3286 error = sooptcopyin(sopt, &optval, sizeof (optval),
3287 sizeof (optval));
3288 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3289 if (error == 0)
3290 error = EBADF;
3291 goto bad;
3292 }
3293 /*
3294 * Any process can set SO_DEFUNCTOK (clear
3295 * SOF_NODEFUNCT), but only root can clear
3296 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3297 */
3298 if (optval == 0 &&
3299 kauth_cred_issuser(kauth_cred_get()) == 0) {
3300 error = EPERM;
3301 goto bad;
3302 }
3303 if (optval)
3304 so->so_flags &= ~SOF_NODEFUNCT;
3305 else
3306 so->so_flags |= SOF_NODEFUNCT;
3307
3308 SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as "
3309 "%seligible for defunct\n", __func__,
3310 proc_selfpid(), so, INP_SOCKAF(so),
3311 INP_SOCKTYPE(so),
3312 (so->so_flags & SOF_NODEFUNCT) ? "not " : ""));
3313 break;
3314
3315 case SO_ISDEFUNCT:
3316 /* This option is not settable */
3317 error = EINVAL;
3318 break;
3319
3320 default:
3321 error = ENOPROTOOPT;
3322 break;
3323 }
3324 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
3325 (void) ((*so->so_proto->pr_ctloutput)(so, sopt));
3326 }
3327 }
3328 bad:
3329 socket_unlock(so, 1);
3330 return (error);
3331 }
3332
3333 /* Helper routines for getsockopt */
3334 int
3335 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3336 {
3337 int error;
3338 size_t valsize;
3339
3340 error = 0;
3341
3342 /*
3343 * Documented get behavior is that we always return a value,
3344 * possibly truncated to fit in the user's buffer.
3345 * Traditional behavior is that we always tell the user
3346 * precisely how much we copied, rather than something useful
3347 * like the total amount we had available for her.
3348 * Note that this interface is not idempotent; the entire answer must
3349 * be generated ahead of time.
3350 */
3351 valsize = min(len, sopt->sopt_valsize);
3352 sopt->sopt_valsize = valsize;
3353 if (sopt->sopt_val != USER_ADDR_NULL) {
3354 if (sopt->sopt_p != kernproc)
3355 error = copyout(buf, sopt->sopt_val, valsize);
3356 else
3357 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3358 }
3359 return (error);
3360 }
3361
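/*
 * sooptcopyout_timeval
 * Copy a timeval out to the option buffer using the user32/user64 layout
 * that matches the bitness of the calling process.
 */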
3362 static int
3363 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
3364 {
3365 int error;
3366 size_t len;
3367 struct user64_timeval tv64;
3368 struct user32_timeval tv32;
3369 const void * val;
3370 size_t valsize;
3371
3372 error = 0;
3373 if (proc_is64bit(sopt->sopt_p)) {
3374 len = sizeof(tv64);
3375 tv64.tv_sec = tv_p->tv_sec;
3376 tv64.tv_usec = tv_p->tv_usec;
3377 val = &tv64;
3378 } else {
3379 len = sizeof(tv32);
3380 tv32.tv_sec = tv_p->tv_sec;
3381 tv32.tv_usec = tv_p->tv_usec;
3382 val = &tv32;
3383 }
3384 valsize = min(len, sopt->sopt_valsize);
3385 sopt->sopt_valsize = valsize;
3386 if (sopt->sopt_val != USER_ADDR_NULL) {
3387 if (sopt->sopt_p != kernproc)
3388 error = copyout(val, sopt->sopt_val, valsize);
3389 else
3390 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3391 }
3392 return (error);
3393 }
3394
3395 /*
3396 * Returns: 0 Success
3397 * ENOPROTOOPT
3398 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3399 * <pr_ctloutput>:???
3400 * <sf_getoption>:???
3401 */
3402 int
3403 sogetopt(struct socket *so, struct sockopt *sopt)
3404 {
3405 int error, optval;
3406 struct linger l;
3407 struct timeval tv;
3408 #if CONFIG_MACF_SOCKET
3409 struct mac extmac;
3410 #endif /* MAC_SOCKET */
3411
3412 if (sopt->sopt_dir != SOPT_GET) {
3413 sopt->sopt_dir = SOPT_GET;
3414 }
3415
3416 socket_lock(so, 1);
3417 so_update_last_owner_locked(so, NULL);
3418
3419 error = sflt_getsockopt(so, sopt);
3420 if (error) {
3421 if (error == EJUSTRETURN)
3422 error = 0;
3423 socket_unlock(so, 1);
3424 return (error);
3425 }
3426
3427 error = 0;
3428 if (sopt->sopt_level != SOL_SOCKET) {
3429 if (so->so_proto && so->so_proto->pr_ctloutput) {
3430 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3431 socket_unlock(so, 1);
3432 return (error);
3433 } else {
3434 socket_unlock(so, 1);
3435 return (ENOPROTOOPT);
3436 }
3437 } else {
3438 switch (sopt->sopt_name) {
3439 case SO_LINGER:
3440 case SO_LINGER_SEC:
3441 l.l_onoff = so->so_options & SO_LINGER;
3442 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3443 so->so_linger : so->so_linger / hz;
3444 error = sooptcopyout(sopt, &l, sizeof (l));
3445 break;
3446
3447 case SO_USELOOPBACK:
3448 case SO_DONTROUTE:
3449 case SO_DEBUG:
3450 case SO_KEEPALIVE:
3451 case SO_REUSEADDR:
3452 case SO_REUSEPORT:
3453 case SO_BROADCAST:
3454 case SO_OOBINLINE:
3455 case SO_TIMESTAMP:
3456 case SO_TIMESTAMP_MONOTONIC:
3457 #ifdef __APPLE__
3458 case SO_DONTTRUNC:
3459 case SO_WANTMORE:
3460 case SO_WANTOOBFLAG:
3461 #endif
3462 optval = so->so_options & sopt->sopt_name;
3463 integer:
3464 error = sooptcopyout(sopt, &optval, sizeof (optval));
3465 break;
3466
3467 case SO_TYPE:
3468 optval = so->so_type;
3469 goto integer;
3470
3471 #ifdef __APPLE__
3472 case SO_NREAD:
3473 if (so->so_proto->pr_flags & PR_ATOMIC) {
3474 int pkt_total;
3475 struct mbuf *m1;
3476
3477 pkt_total = 0;
3478 m1 = so->so_rcv.sb_mb;
3479 while (m1) {
3480 if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
3481 m1->m_type == MT_OOBDATA)
3482 pkt_total += m1->m_len;
3483 m1 = m1->m_next;
3484 }
3485 optval = pkt_total;
3486 } else {
3487 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3488 }
3489 goto integer;
3490
3491 case SO_NWRITE:
3492 optval = so->so_snd.sb_cc;
3493 goto integer;
3494 #endif
3495 case SO_ERROR:
3496 optval = so->so_error;
3497 so->so_error = 0;
3498 goto integer;
3499
3500 case SO_SNDBUF:
3501 optval = so->so_snd.sb_hiwat;
3502 goto integer;
3503
3504 case SO_RCVBUF:
3505 optval = so->so_rcv.sb_hiwat;
3506 goto integer;
3507
3508 case SO_SNDLOWAT:
3509 optval = so->so_snd.sb_lowat;
3510 goto integer;
3511
3512 case SO_RCVLOWAT:
3513 optval = so->so_rcv.sb_lowat;
3514 goto integer;
3515
3516 case SO_SNDTIMEO:
3517 case SO_RCVTIMEO:
3518 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3519 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3520
3521 error = sooptcopyout_timeval(sopt, &tv);
3522 break;
3523
3524 case SO_NOSIGPIPE:
3525 optval = (so->so_flags & SOF_NOSIGPIPE);
3526 goto integer;
3527
3528 case SO_NOADDRERR:
3529 optval = (so->so_flags & SOF_NOADDRAVAIL);
3530 goto integer;
3531
3532 case SO_REUSESHAREUID:
3533 optval = (so->so_flags & SOF_REUSESHAREUID);
3534 goto integer;
3535
3536 #ifdef __APPLE_API_PRIVATE
3537 case SO_NOTIFYCONFLICT:
3538 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
3539 goto integer;
3540 #endif
3541 case SO_RESTRICTIONS:
3542 optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
3543 SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
3544 goto integer;
3545
3546 case SO_LABEL:
3547 #if CONFIG_MACF_SOCKET
3548 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3549 sizeof (extmac))) != 0 ||
3550 (error = mac_socket_label_get(proc_ucred(
3551 sopt->sopt_p), so, &extmac)) != 0)
3552 break;
3553
3554 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3555 #else
3556 error = EOPNOTSUPP;
3557 #endif /* MAC_SOCKET */
3558 break;
3559
3560 case SO_PEERLABEL:
3561 #if CONFIG_MACF_SOCKET
3562 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3563 sizeof (extmac))) != 0 ||
3564 (error = mac_socketpeer_label_get(proc_ucred(
3565 sopt->sopt_p), so, &extmac)) != 0)
3566 break;
3567
3568 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
3569 #else
3570 error = EOPNOTSUPP;
3571 #endif /* MAC_SOCKET */
3572 break;
3573
3574 #ifdef __APPLE_API_PRIVATE
3575 case SO_UPCALLCLOSEWAIT:
3576 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
3577 goto integer;
3578 #endif
3579 case SO_RANDOMPORT:
3580 optval = (so->so_flags & SOF_BINDRANDOMPORT);
3581 goto integer;
3582
3583 case SO_NP_EXTENSIONS: {
3584 struct so_np_extensions sonpx;
3585
3586 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0;
3587 sonpx.npx_mask = SONPX_MASK_VALID;
3588
3589 error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
3590 break;
3591 }
3592
3593 case SO_TRAFFIC_CLASS:
3594 optval = so->so_traffic_class;
3595 goto integer;
3596
3597 case SO_RECV_TRAFFIC_CLASS:
3598 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
3599 goto integer;
3600
3601 case SO_TRAFFIC_CLASS_STATS:
3602 error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats));
break;
3603
3604 case SO_TRAFFIC_CLASS_DBG:
3605 error = sogetopt_tcdbg(so, sopt);
3606 break;
3607
3608 case SO_DEFUNCTOK:
3609 optval = !(so->so_flags & SOF_NODEFUNCT);
3610 goto integer;
3611
3612 case SO_ISDEFUNCT:
3613 optval = (so->so_flags & SOF_DEFUNCT);
3614 goto integer;
3615
3616 default:
3617 error = ENOPROTOOPT;
3618 break;
3619 }
3620 socket_unlock(so, 1);
3621 return (error);
3622 }
3623 }
3624 /* The size limit on our soopt_getm() is different from that on FreeBSD.
3625 * We limit the size of options to MCLBYTES. This will have to change
3626 * if we need to define options that need more space than MCLBYTES.
3627 */
3628 int
3629 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3630 {
3631 struct mbuf *m, *m_prev;
3632 int sopt_size = sopt->sopt_valsize;
3633 int how;
3634
3635 if (sopt_size <= 0 || sopt_size > MCLBYTES)
3636 return (EMSGSIZE);
3637
3638 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
3639 MGET(m, how, MT_DATA);
3640 if (m == 0)
3641 return (ENOBUFS);
3642 if (sopt_size > MLEN) {
3643 MCLGET(m, how);
3644 if ((m->m_flags & M_EXT) == 0) {
3645 m_free(m);
3646 return (ENOBUFS);
3647 }
3648 m->m_len = min(MCLBYTES, sopt_size);
3649 } else {
3650 m->m_len = min(MLEN, sopt_size);
3651 }
3652 sopt_size -= m->m_len;
3653 *mp = m;
3654 m_prev = m;
3655
3656 while (sopt_size > 0) {
3657 MGET(m, how, MT_DATA);
3658 if (m == 0) {
3659 m_freem(*mp);
3660 return (ENOBUFS);
3661 }
3662 if (sopt_size > MLEN) {
3663 MCLGET(m, how);
3664 if ((m->m_flags & M_EXT) == 0) {
3665 m_freem(*mp);
3666 m_freem(m);
3667 return (ENOBUFS);
3668 }
3669 m->m_len = min(MCLBYTES, sopt_size);
3670 } else {
3671 m->m_len = min(MLEN, sopt_size);
3672 }
3673 sopt_size -= m->m_len;
3674 m_prev->m_next = m;
3675 m_prev = m;
3676 }
3677 return (0);
3678 }
3679
3680 /* copyin sopt data into mbuf chain */
3681 int
3682 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3683 {
3684 struct mbuf *m0 = m;
3685
3686 if (sopt->sopt_val == USER_ADDR_NULL)
3687 return (0);
3688 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3689 if (sopt->sopt_p != kernproc) {
3690 int error;
3691
3692 error = copyin(sopt->sopt_val, mtod(m, char *),
3693 m->m_len);
3694 if (error != 0) {
3695 m_freem(m0);
3696 return (error);
3697 }
3698 } else {
3699 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
3700 mtod(m, char *), m->m_len);
3701 }
3702 sopt->sopt_valsize -= m->m_len;
3703 sopt->sopt_val += m->m_len;
3704 m = m->m_next;
3705 }
3706 if (m != NULL) /* enough space should have been allocated by ip6_sooptmcopyin() */
3707 panic("soopt_mcopyin");
3708 return (0);
3709 }
3710
3711 /* copyout mbuf chain data into soopt */
3712 int
3713 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3714 {
3715 struct mbuf *m0 = m;
3716 size_t valsize = 0;
3717
3718 if (sopt->sopt_val == USER_ADDR_NULL)
3719 return (0);
3720 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3721 if (sopt->sopt_p != kernproc) {
3722 int error;
3723
3724 error = copyout(mtod(m, char *), sopt->sopt_val,
3725 m->m_len);
3726 if (error != 0) {
3727 m_freem(m0);
3728 return (error);
3729 }
3730 } else {
3731 bcopy(mtod(m, char *),
3732 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
3733 }
3734 sopt->sopt_valsize -= m->m_len;
3735 sopt->sopt_val += m->m_len;
3736 valsize += m->m_len;
3737 m = m->m_next;
3738 }
3739 if (m != NULL) {
3740 /* a large enough soopt buffer should be provided from user-land */
3741 m_freem(m0);
3742 return (EINVAL);
3743 }
3744 sopt->sopt_valsize = valsize;
3745 return (0);
3746 }
3747
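/*
 * Notify the socket's owning process or process group (via SIGURG) and
 * wake up any select()/poll() waiters when out-of-band data arrives.
 */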
3748 void
3749 sohasoutofband(struct socket *so)
3750 {
3751
3752 if (so->so_pgid < 0)
3753 gsignal(-so->so_pgid, SIGURG);
3754 else if (so->so_pgid > 0)
3755 proc_signal(so->so_pgid, SIGURG);
3756 selwakeup(&so->so_rcv.sb_sel);
3757 }
3758
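/*
 * poll(2)/select(2) support: report which of the requested events are
 * currently true for the socket; if none are, record the calling thread
 * in the appropriate sockbuf's selinfo so it will be woken on a change.
 */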
3759 int
3760 sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
3761 {
3762 struct proc *p = current_proc();
3763 int revents = 0;
3764
3765 socket_lock(so, 1);
3766 so_update_last_owner_locked(so, p);
3767
3768 if (events & (POLLIN | POLLRDNORM))
3769 if (soreadable(so))
3770 revents |= events & (POLLIN | POLLRDNORM);
3771
3772 if (events & (POLLOUT | POLLWRNORM))
3773 if (sowriteable(so))
3774 revents |= events & (POLLOUT | POLLWRNORM);
3775
3776 if (events & (POLLPRI | POLLRDBAND))
3777 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
3778 revents |= events & (POLLPRI | POLLRDBAND);
3779
3780 if (revents == 0) {
3781 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
3782 /*
3783 * Darwin sets the flag first,
3784 * BSD calls selrecord first
3785 */
3786 so->so_rcv.sb_flags |= SB_SEL;
3787 selrecord(p, &so->so_rcv.sb_sel, wql);
3788 }
3789
3790 if (events & (POLLOUT | POLLWRNORM)) {
3791 /*
3792 * Darwin sets the flag first,
3793 * BSD calls selrecord first
3794 */
3795 so->so_snd.sb_flags |= SB_SEL;
3796 selrecord(p, &so->so_snd.sb_sel, wql);
3797 }
3798 }
3799
3800 socket_unlock(so, 1);
3801 return (revents);
3802 }
3803
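/*
 * Attach a knote to the socket's receive or send sockbuf according to the
 * requested filter (EVFILT_READ or EVFILT_WRITE), after an optional MAC
 * policy check.
 */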
3804 int
3805 soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
3806 __unused struct proc *p)
3807 {
3808 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3809 struct sockbuf *sb;
3810
3811 socket_lock(so, 1);
3812
3813 #if CONFIG_MACF_SOCKET
3814 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
3815 socket_unlock(so, 1);
3816 return (1);
3817 }
3818 #endif /* MAC_SOCKET */
3819
3820 switch (kn->kn_filter) {
3821 case EVFILT_READ:
3822 kn->kn_fop = &soread_filtops;
3823 sb = &so->so_rcv;
3824 break;
3825 case EVFILT_WRITE:
3826 kn->kn_fop = &sowrite_filtops;
3827 sb = &so->so_snd;
3828 break;
3829 default:
3830 socket_unlock(so, 1);
3831 return (1);
3832 }
3833
3834 if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
3835 sb->sb_flags |= SB_KNOTE;
3836 socket_unlock(so, 1);
3837 return (0);
3838 }
3839
3840 static void
3841 filt_sordetach(struct knote *kn)
3842 {
3843 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3844
3845 socket_lock(so, 1);
3846 if (so->so_rcv.sb_flags & SB_KNOTE)
3847 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
3848 so->so_rcv.sb_flags &= ~SB_KNOTE;
3849 socket_unlock(so, 1);
3850 }
3851
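/*
 * kqueue EVFILT_READ filter: returns non-zero when the socket is readable.
 * For listening sockets this reports pending connections; otherwise it
 * reports available receive data, honoring NOTE_LOWAT and out-of-band state.
 */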
3852 /*ARGSUSED*/
3853 static int
3854 filt_soread(struct knote *kn, long hint)
3855 {
3856 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3857
3858 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3859 socket_lock(so, 1);
3860
3861 if (so->so_options & SO_ACCEPTCONN) {
3862 int isempty;
3863
3864 /* Radar 6615193: handle the listen case dynamically
3865 * for the kqueue read filter. This allows listen() to be called after registering
3866 * the kqueue EVFILT_READ.
3867 */
3868
3869 kn->kn_data = so->so_qlen;
3870 isempty = ! TAILQ_EMPTY(&so->so_comp);
3871
3872 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3873 socket_unlock(so, 1);
3874
3875 return (isempty);
3876 }
3877
3878 /* socket isn't a listener */
3879
3880 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3881
3882 if (so->so_oobmark) {
3883 if (kn->kn_flags & EV_OOBAND) {
3884 kn->kn_data -= so->so_oobmark;
3885 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3886 socket_unlock(so, 1);
3887 return (1);
3888 }
3889 kn->kn_data = so->so_oobmark;
3890 kn->kn_flags |= EV_OOBAND;
3891 } else {
3892 if (so->so_state & SS_CANTRCVMORE) {
3893 kn->kn_flags |= EV_EOF;
3894 kn->kn_fflags = so->so_error;
3895 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3896 socket_unlock(so, 1);
3897 return (1);
3898 }
3899 }
3900
3901 if (so->so_state & SS_RCVATMARK) {
3902 if (kn->kn_flags & EV_OOBAND) {
3903 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3904 socket_unlock(so, 1);
3905 return (1);
3906 }
3907 kn->kn_flags |= EV_OOBAND;
3908 } else if (kn->kn_flags & EV_OOBAND) {
3909 kn->kn_data = 0;
3910 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3911 socket_unlock(so, 1);
3912 return (0);
3913 }
3914
3915 if (so->so_error) { /* temporary udp error */
3916 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3917 socket_unlock(so, 1);
3918 return (1);
3919 }
3920
3921 int64_t lowwat = so->so_rcv.sb_lowat;
3922 if (kn->kn_sfflags & NOTE_LOWAT) {
3924 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
3925 lowwat = so->so_rcv.sb_hiwat;
3926 else if (kn->kn_sdata > lowwat)
3927 lowwat = kn->kn_sdata;
3928 }
3929
3930 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3931 socket_unlock(so, 1);
3932
3933 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
3934 }
3935
3936 static void
3937 filt_sowdetach(struct knote *kn)
3938 {
3939 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3940 socket_lock(so, 1);
3941
3942 if (so->so_snd.sb_flags & SB_KNOTE)
3943 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
3944 so->so_snd.sb_flags &= ~SB_KNOTE;
3945 socket_unlock(so, 1);
3946 }
3947
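/*
 * kqueue EVFILT_WRITE filter: returns non-zero when the send buffer has at
 * least the low-water mark (or the NOTE_LOWAT amount) of space available,
 * or when the socket can no longer send (EV_EOF) or has a pending error.
 */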
3948 /*ARGSUSED*/
3949 static int
3950 filt_sowrite(struct knote *kn, long hint)
3951 {
3952 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
3953
3954 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3955 socket_lock(so, 1);
3956
3957 kn->kn_data = sbspace(&so->so_snd);
3958 if (so->so_state & SS_CANTSENDMORE) {
3959 kn->kn_flags |= EV_EOF;
3960 kn->kn_fflags = so->so_error;
3961 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3962 socket_unlock(so, 1);
3963 return (1);
3964 }
3965 if (so->so_error) { /* temporary udp error */
3966 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3967 socket_unlock(so, 1);
3968 return (1);
3969 }
3970 if (((so->so_state & SS_ISCONNECTED) == 0) &&
3971 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3972 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3973 socket_unlock(so, 1);
3974 return (0);
3975 }
3976 int64_t lowwat = so->so_snd.sb_lowat;
3977 if (kn->kn_sfflags & NOTE_LOWAT) {
3979 if (kn->kn_sdata > so->so_snd.sb_hiwat)
3980 lowwat = so->so_snd.sb_hiwat;
3981 else if (kn->kn_sdata > lowwat)
3982 lowwat = kn->kn_sdata;
3983 }
3984 if ((hint & SO_FILT_HINT_LOCKED) == 0)
3985 socket_unlock(so, 1);
3986 return (kn->kn_data >= lowwat);
3987 }
3988
3989 #define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1)
3990
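/*
 * Format the socket's recorded lock/unlock caller return addresses into a
 * static string; used by panic messages to help debug locking problems.
 */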
3991 __private_extern__ const char * solockhistory_nr(struct socket *so)
3992 {
3993 size_t n = 0;
3994 int i;
3995 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
3996
3997 bzero(lock_history_str, sizeof(lock_history_str));
3998 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
3999 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
4000 (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4001 (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4002 }
4003 return lock_history_str;
4004 }
4005
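/*
 * Lock the socket, either through the protocol's pr_lock hook or by taking
 * the domain mutex directly; <refcount> indicates whether a use-count
 * reference should also be taken.  The caller's return address is recorded
 * for lock debugging.
 */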
4006 int
4007 socket_lock(struct socket *so, int refcount)
4008 {
4009 int error = 0;
4010 void *lr_saved;
4011
4012 lr_saved = __builtin_return_address(0);
4013
4014 if (so->so_proto->pr_lock) {
4015 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4016 } else {
4017 #ifdef MORE_LOCKING_DEBUG
4018 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4019 LCK_MTX_ASSERT_NOTOWNED);
4020 #endif
4021 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4022 if (refcount)
4023 so->so_usecount++;
4024 so->lock_lr[so->next_lock_lr] = lr_saved;
4025 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4026 }
4027
4028 return (error);
4029 }
4030
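/*
 * Unlock the socket, either through the protocol's pr_unlock hook or by
 * releasing the domain mutex; when <refcount> is non-zero the use count is
 * dropped, and dropping the last reference frees the socket via
 * sofreelastref().
 */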
4031 int
4032 socket_unlock(struct socket *so, int refcount)
4033 {
4034 int error = 0;
4035 void *lr_saved;
4036 lck_mtx_t *mutex_held;
4037
4038 lr_saved = __builtin_return_address(0);
4039
4040 if (so->so_proto == NULL)
4041 panic("socket_unlock null so_proto so=%p\n", so);
4042
4043 if (so && so->so_proto->pr_unlock) {
4044 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4045 } else {
4046 mutex_held = so->so_proto->pr_domain->dom_mtx;
4047 #ifdef MORE_LOCKING_DEBUG
4048 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4049 #endif
4050 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4051 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4052
4053 if (refcount) {
4054 if (so->so_usecount <= 0)
4055 panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
4056 so->so_usecount, so, so->so_proto->pr_domain->dom_family,
4057 so->so_type, so->so_proto->pr_protocol,
4058 solockhistory_nr(so));
4059
4060 so->so_usecount--;
4061 if (so->so_usecount == 0) {
4062 sofreelastref(so, 1);
4063 }
4064 }
4065 lck_mtx_unlock(mutex_held);
4066 }
4067
4068 return (error);
4069 }
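
/*
 * Minimal usage sketch (hypothetical caller): the common pattern is to
 * take the lock together with a use count, operate on the socket, and
 * then release both in one call.
 *
 *	socket_lock(so, 1);	// lock and take a use count
 *	... examine or modify the socket ...
 *	socket_unlock(so, 1);	// drop the use count and unlock;
 *				// frees the socket on the last reference
 */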
4070
4071 /* Called with socket locked, will unlock socket */
4072 void
4073 sofree(struct socket *so)
4074 {
4075
4076 lck_mtx_t *mutex_held;
4077 if (so->so_proto->pr_getlock != NULL)
4078 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4079 else
4080 mutex_held = so->so_proto->pr_domain->dom_mtx;
4081 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4082
4083 sofreelastref(so, 0);
4084 }
4085
4086 void
4087 soreference(struct socket *so)
4088 {
4089 socket_lock(so, 1); /* locks and takes one reference on the socket */
4090 socket_unlock(so, 0); /* unlocks only; the reference is kept */
4091 }
4092
4093 void
4094 sodereference(struct socket *so)
4095 {
4096 socket_lock(so, 0); /* locks only */
4097 socket_unlock(so, 1); /* unlocks and drops one reference on the socket */
4098 }
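
/*
 * Illustrative pattern (hypothetical): soreference()/sodereference() let a
 * caller hold the socket across a region where the socket lock must not be
 * held.
 *
 *	soreference(so);	// take a use count (socket ends up unlocked)
 *	... call out without the socket lock held ...
 *	sodereference(so);	// drop the use count; frees on last reference
 */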
4099
4100 /*
4101 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4102 * possibility of using jumbo clusters. The caller must hold
4103 * the socket lock.
4104 */
4105 void
4106 somultipages(struct socket *so, boolean_t set)
4107 {
4108 if (set)
4109 so->so_flags |= SOF_MULTIPAGES;
4110 else
4111 so->so_flags &= ~SOF_MULTIPAGES;
4112 }
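
/*
 * Minimal usage sketch (hypothetical caller): per the comment above, the
 * socket lock must be held while toggling the flag.
 *
 *	socket_lock(so, 1);
 *	somultipages(so, TRUE);		// permit multi-page (jumbo) clusters
 *	socket_unlock(so, 1);
 */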
4113
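/*
 * Return non-zero if the socket's foreign (destination) address is local
 * to this host. Only AF_INET and AF_INET6 sockets are considered; any
 * other family reports the destination as non-local.
 */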
4114 int
4115 so_isdstlocal(struct socket *so)
4116 {
4117 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4118
4119 if (so->so_proto->pr_domain->dom_family == AF_INET) {
4120 return inaddr_local(inp->inp_faddr);
4121 } else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
4122 return in6addr_local(&inp->in6p_faddr);
4123 }
4124 return 0;
4125 }
4126
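/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT and set SB_DROP
 * on both socket buffers so no further data is queued. Sockets flagged
 * SOF_NODEFUNCT are skipped with EOPNOTSUPP unless noforce is FALSE, in
 * which case the flag is cleared and the socket is defuncted by force.
 * The actual teardown is performed later by sodefunct().
 */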
4127 int
4128 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4129 {
4130 int err = 0, defunct;
4131
4132 defunct = (so->so_flags & SOF_DEFUNCT);
4133 if (defunct) {
4134 if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP))
4135 panic("%s: SB_DROP not set", __func__);
4136 goto done;
4137 }
4138
4139 if (so->so_flags & SOF_NODEFUNCT) {
4140 if (noforce) {
4141 err = EOPNOTSUPP;
4142 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p "
4143 "[%d,%d] is not eligible for defunct (%d)\n",
4144 __func__, proc_selfpid(), proc_pid(p), level, so,
4145 INP_SOCKAF(so), INP_SOCKTYPE(so), err));
4146 return (err);
4147 }
4148 so->so_flags &= ~SOF_NODEFUNCT;
4149 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] "
4150 "defunct by force\n", __func__, proc_selfpid(), proc_pid(p),
4151 level, so, INP_SOCKAF(so), INP_SOCKTYPE(so)));
4152 }
4153
4154 so->so_flags |= SOF_DEFUNCT;
4155 /* Prevent further data from being appended to the socket buffers */
4156 so->so_snd.sb_flags |= SB_DROP;
4157 so->so_rcv.sb_flags |= SB_DROP;
4158
4159 done:
4160 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
4161 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
4162 INP_SOCKAF(so), INP_SOCKTYPE(so),
4163 defunct ? "is already" : "marked as"));
4164
4165 return (err);
4166 }
4167
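/*
 * Second phase of defuncting: wake any threads blocked in sbwait() or on
 * the socket buffer locks, shut down both data directions, disconnect,
 * discard whatever is left in the socket buffers, set so_error to EBADF
 * and mark the socket SS_DEFUNCT. Must be called with SOF_DEFUNCT already
 * set (see sosetdefunct() above); panics otherwise.
 */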
4168 int
4169 sodefunct(struct proc *p, struct socket *so, int level)
4170 {
4171 struct sockbuf *rcv, *snd;
4172
4173 if (!(so->so_flags & SOF_DEFUNCT))
4174 panic("%s improperly called", __func__);
4175
4176 if (so->so_state & SS_DEFUNCT)
4177 goto done;
4178
4179 rcv = &so->so_rcv;
4180 snd = &so->so_snd;
4181
4182 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
4183 "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
4184 __func__, proc_selfpid(), proc_pid(p), level, so,
4185 INP_SOCKAF(so), INP_SOCKTYPE(so),
4186 (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
4187 (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
4188
4189 /*
4190 * Unwedge threads blocked on sbwait() and sb_lock().
4191 */
4192 sbwakeup(rcv);
4193 sbwakeup(snd);
4194
4195 if (rcv->sb_flags & SB_LOCK)
4196 sbunlock(rcv, 1);
4197 if (snd->sb_flags & SB_LOCK)
4198 sbunlock(snd, 1);
4199
4200 /*
4201 * Flush the buffers and disconnect. We explicitly call shutdown
4202 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4203 * states are set for the socket. This also flushes out any data
4204 * hanging off the receive list of this socket.
4205 */
4206 (void) soshutdownlock(so, SHUT_RD);
4207 (void) soshutdownlock(so, SHUT_WR);
4208 (void) sodisconnectlocked(so);
4209
4210 /*
4211 * Explicitly handle connectionless-protocol disconnection
4212 * and release any remaining data in the socket buffers.
4213 */
4214 if (!(so->so_state & SS_ISDISCONNECTED))
4215 (void) soisdisconnected(so);
4216
4217 if (so->so_error == 0)
4218 so->so_error = EBADF;
4219
4220 if (rcv->sb_cc != 0)
4221 sbrelease(rcv);
4222 if (snd->sb_cc != 0)
4223 sbrelease(snd);
4224
4225 so->so_state |= SS_DEFUNCT;
4226
4227 done:
4228 return (0);
4229 }